diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc
index df58193f95e2fc2f1ff7e4b7af76dd1f7c9837ef..94934629e28726d15348c5c692eaf31f7598110c 100644
--- a/paddle/fluid/framework/data_set.cc
+++ b/paddle/fluid/framework/data_set.cc
@@ -95,9 +95,10 @@ void DatasetImpl::SetHdfsConfig(const std::string& fs_name,
                                 const std::string& fs_ugi) {
   fs_name_ = fs_name;
   fs_ugi_ = fs_ugi;
-  std::string cmd = std::string("hadoop fs");
+  std::string cmd = std::string("$HADOOP_HOME/bin/hadoop fs");
   cmd += " -D fs.default.name=" + fs_name;
   cmd += " -D hadoop.job.ugi=" + fs_ugi;
+  cmd += " -Ddfs.client.block.write.retries=15 -Ddfs.rpc.timeout=500000";
   paddle::framework::hdfs_set_command(cmd);
 }
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 4d8bd101258664f6cafd71784ae070e0cb8b9215..a3cc4d1721e20a72817606bd773129230a8154ce 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -3,6 +3,7 @@ cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
+cc_library(fetch_async_op_handle SRCS fetch_async_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(share_tensor_buffer_functor SRCS share_tensor_buffer_functor.cc DEPS framework_proto scope place operator op_registry)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
@@ -98,7 +99,7 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
 #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
 #        device_context reduce_op_handle )
 cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
-        DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
+        DEPS fetch_async_op_handle ssa_graph_executor scope simple_threadpool device_context)
 cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle)
 cc_test(exception_holder_test SRCS exception_holder_test.cc )
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index f5ec78f44b5ebb780cc569c24ccdca6336195961..e440dff2af6b5649d34f47c3b696edeb8a1ba0a2 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -18,7 +18,8 @@
 #include
 #include
 #include
-#include "paddle/fluid/framework/details/fetch_op_handle.h"
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/fetch_async_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -120,6 +121,11 @@ FetchResultType FastThreadedSSAGraphExecutor::Run(
   }
   // Wait FetchOps.
   ClearFetchOp(graph_, &fetch_ops);
+
+  for (auto &place : places_) {
+    fetch_ctxs_.Get(place)->Wait();
+  }
+
   return fetches;
 }
@@ -162,8 +168,8 @@ void FastThreadedSSAGraphExecutor::InsertFetchOps(
     ir::Node *fetch_node =
         graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation);
-    auto *op = new FetchOpHandle(fetch_node, fetches, i, &local_scopes_,
-                                 &local_exec_scopes_, return_merged);
+    auto *op = new FetchAsyncOpHandle(fetch_node, fetches, i, &local_scopes_,
+                                      &local_exec_scopes_, return_merged);
     fetch_ops->emplace_back(op);

     for (auto &p : places_) {
@@ -174,6 +180,14 @@ void FastThreadedSSAGraphExecutor::InsertFetchOps(
       op->AddInput(var);
     }

+    for (auto *var : vars) {
+      auto *op = var->GeneratedOp();
+      auto *compute_op = dynamic_cast(op);
+      if (compute_op) {
+        compute_op->SetLockAndRecordEventFree(false);
+      }
+    }
+
     int dep = static_cast(op->NotReadyInputSize());
     (*op_deps)[op] = dep;
     if (dep == 0) {
@@ -261,7 +275,7 @@ void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
 const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; }

 void FastThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) {
-  if (strategy_.num_threads_ == 1 && !dynamic_cast(op)) {
+  if (strategy_.num_threads_ == 1 && !dynamic_cast(op)) {
     traced_ops_.emplace_back(op);
   }
 }
diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6aae523365ed50e78a78b318ac0990490c801eb3
--- /dev/null
+++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc
@@ -0,0 +1,275 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fetch_async_op_handle.h"
+#include
+#include
+#include
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+FetchAsyncOpHandle::FetchAsyncOpHandle(ir::Node *node, FetchResultType *data,
+                                       size_t offset,
+                                       std::vector *local_scopes,
+                                       std::vector *local_exec_scopes,
+                                       bool return_merged)
+    : OpHandleBase(node),
+      data_(data),
+      offset_(offset),
+      local_scopes_(local_scopes),
+      local_exec_scopes_(local_exec_scopes),
+      return_merged_(return_merged) {}
+
+FetchAsyncOpHandle::~FetchAsyncOpHandle() {}
+
+void FetchAsyncOpHandle::RecordWaitEventOnCtx(
+    platform::DeviceContext *waited_ctx) {
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "No nodes need to wait FetchAsyncOp. Unexpceted Error."));
+}
+
+static void CheckTensorAttrs(const LoDTensor *tensor,
+                             const proto::VarType::Type &type,
+                             const DataLayout &layout, const DDim &dims,
+                             const LoD &lod, const size_t offset) {
+  if (tensor->numel() && tensor->IsInitialized()) {
+    // step1: check type
+    PADDLE_ENFORCE_EQ(
+        type, tensor->type(),
+        platform::errors::InvalidArgument(
+            "The data type of fetched Tensors or the items of fetched "
+            "LoDTensorArray are different from each other on different "
+            "devices(%s vs %s). And the error is caused by the %zu "
+            "(th) fetched variable. Please set the "
+            "parameter `return_merged = False` when you "
+            "call the `Executor.run()` method.",
+            DataTypeToString(type), DataTypeToString(tensor->type()), offset));
+
+    // step2: check layout
+    PADDLE_ENFORCE_EQ(
+        layout, tensor->layout(),
+        platform::errors::InvalidArgument(
+            "The layout of fetched Tensors or the items of fetched "
+            "LoDTensorArray are different from each other on different "
+            "devices(%s vs %s). And the error is caused by the %zu "
+            "(th) fetched variable. Please set the "
+            "parameter `return_merged = False` when you "
+            "call the `Executor.run()` method.",
+            DataLayoutToString(layout), DataLayoutToString(tensor->layout()),
+            offset));
+  }
+
+  // step3: check dims
+  auto tensor_dims = tensor->dims();
+  PADDLE_ENFORCE_EQ(dims.size(), tensor_dims.size(),
+                    platform::errors::InvalidArgument(
+                        "The dimension sizes of fetched Tensors or "
+                        "the items of fetched LoDTensorArray are "
+                        "different from each other on different "
+                        "devices(%s vs %s). And the error is caused by the %zu "
+                        "(th) fetched variable. Please set the "
+                        "parameter `return_merged = False` when you "
+                        "call the `Executor.run()` method.",
+                        dims, tensor_dims, offset));
+  for (int j = 1; j < dims.size(); j++) {
+    PADDLE_ENFORCE_EQ(dims[j], tensor_dims[j],
+                      platform::errors::InvalidArgument(
+                          "The dimensions of fetched Tensors or "
+                          "the items of fetched LoDTensorArray are "
+                          "different from each other on different "
+                          "devices(%s vs %s). And the error is caused by the "
+                          "%zu (th) fetched variable. Please set the "
+                          "parameter `return_merged = False` when "
+                          "you call the `Executor.run()` method.",
+                          dims, tensor_dims, offset));
+  }
+
+  // step4: check lod
+  PADDLE_ENFORCE_EQ(
+      lod.size(), tensor->lod().size(),
+      platform::errors::InvalidArgument(
+          "The LoD information of fetched Tensors or the items of fetched "
+          "LoDTensorArray are different from each other on different "
+          "devices(%s vs %s). And the error is caused by the %zu "
+          "(th) fetched variable. Please set the "
+          "parameter `return_merged = False` when you "
+          "call the `Executor.run()` method.",
+          lod, tensor->lod(), offset));
+}
+
+static void TransData(const framework::Tensor *src_item,
+                      framework::Tensor *dst_item,
+                      const platform::DeviceContext &ctx) {
+  if (src_item->IsInitialized() && src_item->numel() > 0) {
+    if (platform::is_gpu_place(src_item->place())) {
+#ifdef PADDLE_WITH_CUDA
+      TensorCopy(*src_item, platform::CUDAPinnedPlace(), ctx, dst_item);
+#endif
+    } else {
+      TensorCopy(*src_item, platform::CPUPlace(), dst_item);
+    }
+  }
+}
+
+void FetchAsyncOpHandle::FetchMergedLodTensor(
+    const std::vector &src_lodtensors,
+    LoDTensor *dst_lodtensor) {
+  // calc dst type,layout,dim,lod and calc check dim
+  proto::VarType::Type new_type = proto::VarType::FP32;
+  framework::DataLayout new_layout;
+  framework::DDim new_dim;
+  LoD new_lod = src_lodtensors[0]->lod();
+
+  framework::DDim check_dim;
+
+  for (auto *t : src_lodtensors) {
+    if (t->numel() && t->IsInitialized()) {
+      check_dim = t->dims();
+      new_type = t->type();
+      new_layout = t->layout();
+      break;
+    }
+  }
+
+  bool find_first_dims = false;
+  for (auto *t : src_lodtensors) {
+    if (t->numel() && t->IsInitialized()) {
+      if (!find_first_dims) {
+        new_dim = t->dims();
+        find_first_dims = true;
+      } else {
+        new_dim[0] += t->dims()[0];
+      }
+    }
+  }
+
+  // check src type,layout,dim,lod consistence
+  for (size_t i = 1; i < src_lodtensors.size(); ++i) {
+    CheckTensorAttrs(src_lodtensors[i], new_type, new_layout, check_dim,
+                     new_lod, offset_);
+  }
+
+  // set dst tensor
+  dst_lodtensor->Resize(new_dim);
+  dst_lodtensor->set_layout(src_lodtensors[0]->layout());
+  dst_lodtensor->set_lod(src_lodtensors[0]->lod());
+  if (platform::is_gpu_place(src_lodtensors[0]->place())) {
+    dst_lodtensor->mutable_data(platform::CUDAPinnedPlace(),
+                                src_lodtensors[0]->type());
+  } else {
+    dst_lodtensor->mutable_data(platform::CPUPlace(),
+                                src_lodtensors[0]->type());
+  }
+
+  // slice and memcpy
+  int begin = 0;
+  for (auto *src : src_lodtensors) {
+    int end = begin + src->dims()[0];
+    if (end == begin) {
+      continue;
+    }
+    auto dst = dst_lodtensor->Slice(begin, end);
+    TransData(src, &dst, *dev_ctxes_[src->place()]);
+    begin = end;
+  }
+}
+
+void FetchAsyncOpHandle::RunImpl() {
+  platform::RecordEvent record_event(Name());
+  WaitInputVarGenerated();
+
+  // get src vars
+  auto &scopes = *local_exec_scopes_;
+  std::vector src_vars;
+  src_vars.reserve(inputs_.size());
+  for (size_t i = 0; i < inputs_.size(); ++i) {
+    auto *var_handle = static_cast(inputs_[i]);
+    auto &scope = scopes.at(var_handle->scope_idx());
+    auto *var = scope->FindVar(var_handle->name());
+    PADDLE_ENFORCE_NOT_NULL(
+        var,
+        platform::errors::NotFound(
+            "Cannot find variable %s in execution scope.", var_handle->name()));
+    src_vars.emplace_back(var);
+  }
+
+  if (return_merged_) {
+    auto &val = BOOST_GET(FetchList, *data_);
+    if (src_vars[0]->IsType()) {
+      // to lodtensor type
+      std::vector src_lodtensors;
+      src_lodtensors.reserve(src_vars.size());
+      for (size_t i = 0; i < src_vars.size(); ++i) {
+        src_lodtensors.emplace_back(&src_vars[i]->Get());
+      }
+
+      LoDTensor dst_lodtensor;
+      FetchMergedLodTensor(src_lodtensors, &dst_lodtensor);
+      val.at(offset_) = std::move(dst_lodtensor);
+    } else {
+      // to lodtensorarray type
+      std::vector src_lodtensor_arrays;
+      src_lodtensor_arrays.reserve(src_vars.size());
+      for (size_t i = 0; i < src_vars.size(); ++i) {
+        src_lodtensor_arrays.emplace_back(
+            &src_vars[i]->Get());
+      }
+
+      LoDTensorArray dst_lodtensor_array;
+      dst_lodtensor_array.resize(src_lodtensor_arrays[0]->size());
+
+      for (size_t i = 0; i < dst_lodtensor_array.size(); ++i) {
+        std::vector src_lodtensors;
+        src_lodtensors.reserve(src_lodtensor_arrays.size());
+        for (size_t j = 0; j < src_lodtensor_arrays.size(); ++j) {
+          src_lodtensors.emplace_back(&(*src_lodtensor_arrays[j])[i]);
+        }
+        FetchMergedLodTensor(src_lodtensors, &dst_lodtensor_array[i]);
+      }
+      val.at(offset_) = std::move(dst_lodtensor_array);
+    }
+  } else {
+    auto &val = BOOST_GET(FetchUnmergedList, *data_);
+    auto &dst_tensors = val.at(offset_);
+    dst_tensors.reserve(src_vars.size());
+
+    for (size_t i = 0; i < src_vars.size(); ++i) {
+      if (src_vars[i]->IsType()) {
+        auto &t = src_vars[i]->Get();
+        LoDTensor item;
+        TransData(&t, &item, *dev_ctxes_[t.place()]);
+        dst_tensors.emplace_back(std::move(item));
+      } else {
+        auto &t = src_vars[i]->Get();
+        LoDTensorArray item;
+        item.resize(t.size());
+        for (size_t j = 0; j < t.size(); ++j) {
+          TransData(&t[j], &item[j], *dev_ctxes_[t[j].place()]);
+        }
+        dst_tensors.emplace_back(std::move(item));
+      }
+    }
+  }
+}
+
+bool FetchAsyncOpHandle::IsMultiDeviceTransfer() { return true; }
+
+std::string FetchAsyncOpHandle::Name() const { return "FetchAsync"; }
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.h b/paddle/fluid/framework/details/fetch_async_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..691a3286c270badad938610811cc6e73d63c2c04
--- /dev/null
+++ b/paddle/fluid/framework/details/fetch_async_op_handle.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct FetchAsyncOpHandle : public OpHandleBase {
+ public:
+  FetchAsyncOpHandle(ir::Node *node, FetchResultType *data, size_t offset,
+                     std::vector *local_scopes,
+                     std::vector *local_exec_scopes,
+                     bool return_merged);
+
+  ~FetchAsyncOpHandle();
+
+  void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) override;
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override;
+
+ protected:
+  void RunImpl() override;
+
+  std::vector GetLocalScopes() override { return *local_scopes_; }
+
+  void FetchMergedLodTensor(
+      const std::vector &src_lodtensors,
+      LoDTensor *dst_lodtensor);
+
+ private:
+  FetchResultType *data_;
+  size_t offset_;
+  std::vector *local_scopes_;
+  std::vector *local_exec_scopes_;
+  bool return_merged_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 5574a55e18c6d9806cb878dc69ec597f81da97d8..ae69960ef78c3e35143c66226133bd0dceac8b79 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -36,7 +36,8 @@ FetchOpHandle::FetchOpHandle(ir::Node *node, FetchResultType *data,
 FetchOpHandle::~FetchOpHandle() {}

 void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
-  PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error");
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "No nodes need to wait FetchOp. Unexpceted Error."));
 }

 static void CheckDims(const framework::DDim &tensor_dims,
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc
index 4f1e44ca26cb65468da6eded74653f34dbf00336..71123f708e3ca149d9fd634f55652cede5a57b50 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/ssa_graph_executor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph_executor.h" +#include "paddle/fluid/framework/details/fetch_async_op_handle.h" namespace paddle { namespace framework { @@ -23,9 +24,11 @@ void ClearFetchOp(ir::Graph* graph, std::vector* fetch_ops) { if (fetch_ops->empty()) return; for (auto& op : *fetch_ops) { - PADDLE_ENFORCE_NOT_NULL( - dynamic_cast(op), - "The input ops of ClearFetchOp function should be FetchOpHandle."); + PADDLE_ENFORCE_EQ(dynamic_cast(op) != nullptr || + dynamic_cast(op) != nullptr, + true, + "The input ops of ClearFetchOp function should be " + "FetchOpHandle or FetchAsyncOpHandle."); for (auto& out_var : op->Node()->outputs) { graph->RemoveNode(out_var); } diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 335cbc382c178b1a14949764f2908dc402298868..34fff042770c5f50a280408d8f7f925488b3879c 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -857,7 +857,7 @@ void FleetWrapper::PushSparseVarsWithLabelAsync( float* g = g_tensor->data(); if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) { - int dim = emb_dim + offset; + int dim = emb_dim; Eigen::Map< Eigen::Matrix> g_mat(g, g_tensor->numel() / dim, dim); @@ -1170,6 +1170,21 @@ void FleetWrapper::LoadModelOneTable(const uint64_t table_id, #endif } +void FleetWrapper::LoadWithWhitelist(const uint64_t table_id, + const std::string& path, const int mode) { +#ifdef PADDLE_WITH_PSLIB + auto ret = pslib_ptr_->_worker_ptr->load_with_whitelist(table_id, path, + std::to_string(mode)); + ret.wait(); + if (ret.get() != 0) { + LOG(ERROR) << "load model of table id: " << table_id + << ", from path: " << path << " failed"; + } +#else + VLOG(0) << "FleetWrapper::LoadWhitelist does nothing when no pslib"; +#endif +} + void FleetWrapper::SaveModel(const std::string& path, const int mode) { #ifdef PADDLE_WITH_PSLIB auto ret = pslib_ptr_->_worker_ptr->save(path, std::to_string(mode)); @@ -1285,6 +1300,26 @@ int32_t FleetWrapper::SaveCache(int table_id, const std::string& path, #endif } +int32_t FleetWrapper::SaveWithWhitelist(int table_id, const std::string& path, + const int mode, + const std::string& whitelist_path) { +#ifdef PADDLE_WITH_PSLIB + auto ret = pslib_ptr_->_worker_ptr->save_with_whitelist( + table_id, path, std::to_string(mode), whitelist_path); + ret.wait(); + int32_t feasign_cnt = ret.get(); + if (feasign_cnt == -1) { + LOG(ERROR) << "table save cache failed"; + sleep(sleep_seconds_before_fail_exit_); + exit(-1); + } + return feasign_cnt; +#else + VLOG(0) << "FleetWrapper::SaveCache does nothing when no pslib"; + return -1; +#endif +} + void FleetWrapper::ShrinkSparseTable(int table_id) { #ifdef PADDLE_WITH_PSLIB auto ret = pslib_ptr_->_worker_ptr->shrink(table_id); diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index 92f3a625a755bba4989033c0cd41d9b25591c960..cc13a50160a94c63345bcbd5633f2d3f8555ae0c 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -273,6 +273,11 @@ class FleetWrapper { // save cache model // cache model can speed up online predict int32_t SaveCache(int table_id, const std::string& path, const int mode); + // save sparse table filtered by user-defined whitelist + int32_t SaveWithWhitelist(int table_id, const std::string& path, + const int mode, const std::string& whitelist_path); + void LoadWithWhitelist(const uint64_t table_id, const std::string& path, + 
const int mode); // copy feasign key/value from src_table_id to dest_table_id int32_t CopyTable(const uint64_t src_table_id, const uint64_t dest_table_id); // copy feasign key/value from src_table_id to dest_table_id diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index d09cb03360363088bb021285af4574ffbbb81ef0..1c364300d2c633eca4beca9fb8044206e0103796 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -21,6 +21,8 @@ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/string_helper.h" +DECLARE_bool(use_mkldnn); + namespace paddle { namespace imperative { @@ -47,6 +49,9 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, const NameVarBaseMap& outs, framework::AttributeMap attrs, const platform::Place& place, bool trace_backward) { VLOG(1) << "Trace Op: " << type; + if (FLAGS_use_mkldnn) { + attrs["use_mkldnn"] = true; + } auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); const auto& op_info = op->Info(); auto* attr_checker = op_info.Checker(); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 127a41aee890808258367fb40804a9547b8fdbb0..500aa8341d6a61056f6f80f82c6f28bb569eb772 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1058,6 +1058,7 @@ USE_TRT_CONVERTER(fused_embedding_eltwise_layernorm); USE_TRT_CONVERTER(skip_layernorm); USE_TRT_CONVERTER(slice); USE_TRT_CONVERTER(scale); +USE_TRT_CONVERTER(stack); #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 8b7371490c09068fd4b84ddb541014204806a2b2..39d02909abd1f1d96f73cc9f3e3ea9d26a1f5c72 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -3,8 +3,8 @@ nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc - shuffle_channel_op.cc swish_op.cc instance_norm_op.cc -emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc + shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc + emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/scale_op.cc b/paddle/fluid/inference/tensorrt/convert/scale_op.cc index 19e1895635aa7670a0ca453656c3407d132e8db4..f9a1fe41ddc046aad8cc3a5397453b0f68c1a112 100644 --- a/paddle/fluid/inference/tensorrt/convert/scale_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/scale_op.cc @@ -58,6 +58,24 @@ class ScaleOpConverter : public OpConverter { TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; nvinfer1::ILayer* layer = nullptr; + + auto input_dim = input->getDimensions(); + PADDLE_ENFORCE_GE(input_dim.nbDims, 3, + platform::errors::Fatal( + "Paddle-TRT scale mode only support dimension >= 3")); + + nvinfer1::IShuffleLayer* expand_layer = nullptr; + nvinfer1::IShuffleLayer* squeeze_layer = nullptr; + + if (input_dim.nbDims == 3) 
{ + // TensorRT scale layer is not supporting input dims < 4 when using + // explicit batch + expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + nvinfer1::Dims4 target_shape(0, 0, 0, 1); // expand 1 dims + expand_layer->setReshapeDimensions(target_shape); + input = expand_layer->getOutput(0); + } + if (bias_after_scale) { layer = TRT_ENGINE_ADD_LAYER( engine_, Scale, *input, nvinfer1::ScaleMode::kUNIFORM, @@ -73,6 +91,18 @@ class ScaleOpConverter : public OpConverter { power_weights.get(), scale_weights.get(), power_weights.get()); } + PADDLE_ENFORCE_EQ(layer != nullptr, true, + platform::errors::Fatal("Create scale layer failed.")); + + if (input_dim.nbDims == 3) { + // TensorRT scale layer is not supporting input dims < 4 when using + // explicit batch + squeeze_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); + nvinfer1::Dims3 target_shape(0, 0, 0); // expand 1 dims + squeeze_layer->setReshapeDimensions(target_shape); + layer = static_cast(squeeze_layer); + } RreplenishLayerAndOutput(layer, "scale", {out_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/stack_op.cc b/paddle/fluid/inference/tensorrt/convert/stack_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f35024529c61a253f314e5eca985713227d3f343 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Stack converter from fluid to tensorRT. 
+ */ +class StackOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert fluid stack op to tensorrt stack layer"; + + framework::OpDesc op_desc(op, nullptr); + auto input = op_desc.Input("X"); + int input_num = input.size(); + nvinfer1::ITensor** inputs = + (nvinfer1::ITensor**)malloc(input_num * sizeof(nvinfer1::ITensor*)); + + for (int i = 0; i < input_num; ++i) { + inputs[i] = engine_->GetITensor(input[i]); + } + + int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); + if (axis < 0) { + axis = axis + inputs[0]->getDimensions().nbDims + 1; + } + + nvinfer1::ILayer* layer = nullptr; + if (engine_->with_dynamic_shape()) { +#if IS_TRT_VERSION_GE(6000) + plugin::StackPluginDynamic* plugin = + new plugin::StackPluginDynamic(axis, input_num); + layer = engine_->AddPluginV2(inputs, input_num, plugin); + assert(layer != nullptr); +#else + PADDLE_THROW(platform::errors::Fatal( + "You are running the TRT Dynamic Shape mode, need to confirm that " + "your TRT version is no less than 6.0")); +#endif + } else { + PADDLE_THROW(platform::errors::Fatal( + "You are running the Ernie(Bert) model in static" + "shape mode, which is not supported for the time being.\n" + "You can use the config.SetTRTDynamicShapeInfo(...) interface" + " to set the shape information to run the dynamic shape mode.")); + } + auto output_name = op_desc.Output("Y").front(); + RreplenishLayerAndOutput(layer, "stack", {output_name}, test_mode); + free(inputs); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(stack, StackOpConverter); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 03f5a751511adba7b508db9944c30d17866bad2d..22be877493272cd393538fd4f04184e77d38e2db 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -186,6 +186,14 @@ void TensorRTEngine::FreezeNetwork() { Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true)); } infer_builder_config_->addOptimizationProfile(optim_profile_); + infer_builder_config_->setMaxWorkspaceSize(max_workspace_); + if (enable_int8) { + // Due to a bug of TRT, we must set precision BuilderFlag to kFP16 before + // kINT8 here to perform INT8 inference. 
+ infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); + infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8); + infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES); + } if (WithFp16()) { infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); if (disable_trt_plugin_fp16()) { diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index f5d22b982de2b41474d97565a44dedc67c8a85d7..a5b71356d0eca43555f4190b8cac2055a3eb679c 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -51,6 +51,7 @@ struct SimpleOpTypeSetTeller : public Teller { "relu", "depthwise_conv2d", "softmax", + "sigmoid", "batch_norm", "elementwise_add", "leaky_relu", @@ -87,6 +88,7 @@ struct SimpleOpTypeSetTeller : public Teller { "gelu", "layer_norm", "scale", + "stack", }; }; diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index e417fcbb2ce9267ad491996063e5725799815f55..98afdbe254a4b0a086d4a4aa88096a06c40138d1 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1,7 +1,8 @@ nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu - prelu_op_plugin.cu trt_plugin_factory.cc gelu_op_plugin.cu + prelu_op_plugin.cu trt_plugin_factory.cc gelu_op_plugin.cu pool_op_plugin.cu swish_op_plugin.cu layer_norm_op_plugin.cu -instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu -qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu hard_swish_op_plugin.cu - DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) + instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu + qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu + hard_swish_op_plugin.cu stack_op_plugin.cu + DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index 48afcfce347d681fbbb291e478ead1fa28475a22..1fa5b3228e1158fe0423c457d974e0bbf970a30f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -104,32 +104,51 @@ nvinfer1::DimsExprs PoolPluginDynamic::getOutputDimensions( auto stri_0 = expr_builder.constant(strides_[0]); auto stri_1 = expr_builder.constant(strides_[1]); + auto one_value = expr_builder.constant(1); - auto tmp1_0 = - expr_builder.constant((-ksize_[0] + 2 * paddings_[0]) / strides_[0] + 1); - auto tmp1_1 = - expr_builder.constant((-ksize_[1] + 2 * paddings_[1]) / strides_[1] + 1); + auto v0_tmp = expr_builder.constant(-ksize_[0] + 2 * paddings_[0]); + auto v1_tmp = expr_builder.constant(-ksize_[1] + 2 * paddings_[1]); - auto tmp2_0 = expr_builder.constant( - (-ksize_[0] + 2 * paddings_[0] + strides_[0] - 1) / strides_[0] + 1); - auto tmp2_1 = expr_builder.constant( - (-ksize_[1] + 2 * paddings_[1] + strides_[1] - 1) / strides_[1] + 1); - - auto *a_d = expr_builder.operation(nvinfer1::DimensionOperation::kCEIL_DIV, - *inputs[0].d[2], *stri_0); - auto *b_d = expr_builder.operation(nvinfer1::DimensionOperation::kCEIL_DIV, - *inputs[0].d[3], *stri_1); + auto ceil_tmp = + expr_builder.constant(-ksize_[0] + 2 * paddings_[0] + strides_[0] - 1); + auto ceil1_tmp = + expr_builder.constant(-ksize_[1] + 2 * paddings_[1] + strides_[1] - 1); if (!ceil_mode_) { - output.d[2] = 
expr_builder.operation(nvinfer1::DimensionOperation::kSUM, - *a_d, *tmp1_0); - output.d[3] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM, - *b_d, *tmp1_1); + output.d[2] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[2], *v0_tmp), + *stri_0), + *one_value); + output.d[3] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[3], *v1_tmp), + *stri_1), + *one_value); + } else { - output.d[2] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM, - *a_d, *tmp2_0); - output.d[3] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM, - *b_d, *tmp2_1); + output.d[2] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[2], *ceil_tmp), + *stri_0), + *one_value); + output.d[3] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[3], *ceil1_tmp), + *stri_1), + *one_value); } return output; diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu new file mode 100644 index 0000000000000000000000000000000000000000..1ecbf4be154f01059ef33e2d510d8329d6726314 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu @@ -0,0 +1,247 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include "paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) +StackPluginDynamic::StackPluginDynamic(int axis, int num_stack) + : axis_(axis), num_stack_(num_stack) {} + +StackPluginDynamic::StackPluginDynamic(void const* serial_data, + size_t serial_length) { + DeserializeValue(&serial_data, &serial_length, &axis_); + DeserializeValue(&serial_data, &serial_length, &num_stack_); +} + +StackPluginDynamic::~StackPluginDynamic() {} + +nvinfer1::IPluginV2DynamicExt* StackPluginDynamic::clone() const { + return new StackPluginDynamic(axis_, num_stack_); +} + +const char* StackPluginDynamic::getPluginType() const { return "stack_plugin"; } + +int StackPluginDynamic::getNbOutputs() const { return 1; } + +int StackPluginDynamic::initialize() { return 0; } + +size_t StackPluginDynamic::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(axis_); + serialize_size += SerializedSize(num_stack_); + return serialize_size; +} + +void StackPluginDynamic::serialize(void* buffer) const { + SerializeValue(&buffer, axis_); + SerializeValue(&buffer, num_stack_); +} + +nvinfer1::DimsExprs StackPluginDynamic::getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) { + nvinfer1::DimsExprs output(inputs[0]); + output.nbDims = inputs[0].nbDims + 1; + + for (int i = inputs[0].nbDims; i > axis_; --i) { + output.d[i] = inputs[0].d[i - 1]; + } + output.d[axis_] = expr_builder.constant(nb_inputs); + return output; +} + +void StackPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {} + +size_t StackPluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { + return num_stack_ * sizeof(uintptr_t); +} + +void StackPluginDynamic::destroy() { delete this; } + +void StackPluginDynamic::terminate() {} + +bool StackPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs, + int nb_outputs) { + PADDLE_ENFORCE_NOT_NULL( + in_out, platform::errors::InvalidArgument( + "The input of stack plugin should not be nullptr.")); + + PADDLE_ENFORCE_LT( + pos, nb_inputs + nb_outputs, + platform::errors::InvalidArgument("The pos(%d) should be less than the " + "num(%d) of the input and the output.", + pos, nb_inputs + nb_outputs)); + + const nvinfer1::PluginTensorDesc& in = in_out[pos]; + if (pos == 0) { +#ifdef SUPPORTS_CUDA_FP16 + return (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && + (in.format == nvinfer1::TensorFormat::kLINEAR); +#else + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); +#endif + } + const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1]; + // output + return in.type == prev.type && in.format == prev.format; +} + +nvinfer1::DataType StackPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* input_types, int nb_inputs) const { + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The index should be equal to 0")); + return input_types[0]; +} + +template +__global__ void StackKernel(const T* const* input, T* 
output, int num_stack, + int base_unit) { + int stack_id = blockIdx.x; + int lead_id = blockIdx.y; + + for (int i = threadIdx.x; i < base_unit; i += blockDim.x) { + output[lead_id * num_stack * base_unit + stack_id * base_unit + i] = + input[stack_id][lead_id * base_unit + i]; + } +} + +int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, void* const* outputs, + void* workspace, cudaStream_t stream) { + auto input_dims = input_desc[0].dims; // (batch, seq, seq) + auto out_dims = output_desc[0].dims; // (batch, num_head, seq, seq) + auto out_num_dims = out_dims.nbDims; + + int base_unit = 1; + for (int i = axis_ + 1; i < out_num_dims; ++i) { + PADDLE_ENFORCE_GT(out_dims.d[i], 0, + platform::errors::InvalidArgument( + "Input dimensions should be greater than 0")); + base_unit *= out_dims.d[i]; + } + + int lead_unit = 1; + for (int i = 0; i < axis_; ++i) { + PADDLE_ENFORCE_GT(out_dims.d[i], 0, + platform::errors::InvalidArgument( + "Input dimensions should be greater than 0")); + lead_unit *= out_dims.d[i]; + } + + PADDLE_ENFORCE_EQ( + out_dims.d[axis_], num_stack_, + platform::errors::InvalidArgument("number of stack axis should be same")); + + cudaMemcpyAsync(workspace, reinterpret_cast(inputs), + sizeof(void*) * out_dims.d[axis_], cudaMemcpyHostToDevice, + stream); + + const int num_stacks = out_dims.d[axis_]; + dim3 num_blocks(num_stacks, lead_unit); + const int num_threads = 256; + auto infer_type = input_desc[0].type; + + if (infer_type == nvinfer1::DataType::kFLOAT) { + float* output = static_cast(outputs[0]); + StackKernel<<>>( + reinterpret_cast(workspace), output, num_stacks, + base_unit); + } else if (infer_type == nvinfer1::DataType::kHALF) { +#ifdef SUPPORTS_CUDA_FP16 + __half* output = static_cast<__half*>(outputs[0]); + StackKernel<__half><<>>( + reinterpret_cast(workspace), output, num_stacks, + base_unit); +#else + PADDLE_THROW(platform::errors::Fatal( + "The cuda archs you specific should greater than 600.")); +#endif + } else { + PADDLE_THROW( + platform::errors::Fatal("The Stack TRT Plugin's input type only " + "support float or half currently.")); + } + return cudaGetLastError() != cudaSuccess; +} + +StackPluginDynamicCreator::StackPluginDynamicCreator() {} + +const char* StackPluginDynamicCreator::getPluginName() const { + return "stack_plugin"; +} + +const char* StackPluginDynamicCreator::getPluginVersion() const { return "1"; } + +const nvinfer1::PluginFieldCollection* +StackPluginDynamicCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2* StackPluginDynamicCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + int axis = -1; + int num_stack = -1; + + for (int i = 0; i < fc->nbFields; ++i) { + const std::string name(fc->fields[i].name); + if (name == "axis") { + axis = static_cast(fc->fields[i].data)[0]; + } else if (name == "num_stack") { + num_stack = static_cast(fc->fields[i].data)[0]; + } else { + PADDLE_THROW(platform::errors::Fatal("Meet an unknown plugin field '" + + name + + "' when creating stack op plugin.")); + } + } + return new StackPluginDynamic(axis, num_stack); +} + +nvinfer1::IPluginV2* StackPluginDynamicCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new StackPluginDynamic(serial_data, serial_length); + return plugin; +} + +void StackPluginDynamicCreator::setPluginNamespace(const char* lib_namespace) { + plugin_namespace_ 
= lib_namespace; +} + +const char* StackPluginDynamicCreator::getPluginNamespace() const { + return plugin_namespace_.c_str(); +} + +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h new file mode 100644 index 0000000000000000000000000000000000000000..f4f6cde6f87ea97c514e68bc2862bb163b0aa448 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h @@ -0,0 +1,96 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) +class StackPluginDynamic : public DynamicPluginTensorRT { + public: + explicit StackPluginDynamic(int axis, int num_stack); + StackPluginDynamic(void const* serial_data, size_t serial_length); + ~StackPluginDynamic(); + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + + const char* getPluginType() const override; + int getNbOutputs() const override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + + private: + int axis_; + int num_stack_; +}; + +class StackPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + StackPluginDynamicCreator(); + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + void setPluginNamespace(const 
char* lib_namespace) override; + const char* getPluginNamespace() const override; + + private: + std::string plugin_namespace_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; +REGISTER_TRT_PLUGIN_V2(StackPluginDynamicCreator); +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 2bd30bc05179e2881c4ecb321d76d5506233cc0e..ac05b08b8f2a038234e7192f47a37b3ef3bcf461 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -132,7 +132,9 @@ if(NOT APPLE AND WITH_MKLML) set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool") download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc) - set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150) + if(NOT WIN32) + set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150) + endif() else() # TODO: fix this test on MACOS and OPENBLAS, the reason is that # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS @@ -192,8 +194,9 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz") inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true) - -set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY") +if(NOT WIN32 AND NOT APPLE) + set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY") +endif() # text_classification set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") @@ -215,7 +218,7 @@ inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_test # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") -if (NOT EXISTS ${OCR_INSTALL_DIR}) +if (NOT EXISTS ${OCR_INSTALL_DIR}/ocr.tar.gz) inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Focr.tar.gz") endif() inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) @@ -231,7 +234,7 @@ set_property(TEST test_analyzer_detect PROPERTY ENVIRONMENT GLOG_vmodule=analysi # mobilenet with transpose op set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet") -if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) +if (NOT EXISTS ${MOBILENET_INSTALL_DIR}/mobilenet.tar.gz) inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz") endif() inference_analysis_api_test(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) @@ -395,15 +398,15 @@ inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert if(WITH_GPU AND TENSORRT_FOUND) set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_models") - if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) + if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models.tar.gz) inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_inference_test_models.tar.gz") endif() 
set(TEST_SPLIT_CONVERTER_MODEL "${TRT_MODEL_INSTALL_DIR}/trt_split_op_converter_test") - if (NOT EXISTS ${TEST_SPLIT_CONVERTER_MODEL}) + if (NOT EXISTS ${TEST_SPLIT_CONVERTER_MODEL}/split_converter.tgz) inference_download_and_uncompress(${TEST_SPLIT_CONVERTER_MODEL} ${INFERENCE_URL}/tensorrt_test "split_converter.tgz") endif() set(TEST_INSTANCE_NORM_MODEL "${TRT_MODEL_INSTALL_DIR}/trt_instance_norm_test") - if (NOT EXISTS ${TEST_INSTANCE_NORM_MODEL}) + if (NOT EXISTS ${TEST_INSTANCE_NORM_MODEL}/instance_norm.tgz) inference_download_and_uncompress(${TEST_INSTANCE_NORM_MODEL} ${INFERENCE_URL}/tensorrt_test "instance_norm.tgz") endif() inference_analysis_test(trt_mobilenet_test SRCS trt_mobilenet_test.cc @@ -431,16 +434,16 @@ if(WITH_GPU AND TENSORRT_FOUND) EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/quant_small_model") - if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}) - inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "quant_small_model.tar.gz") + set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model") + if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}/small_quant_model.tgz) + inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz") endif() inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR}) set(TRT_MODEL_QUANT_YOLOV3_DIR "${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware") - if (NOT EXISTS ${TRT_MODEL_QUANT_YOLOV3_DIR}) + if (NOT EXISTS ${TRT_MODEL_QUANT_YOLOV3_DIR}/yolov3_r50_quant_aware.tgz) inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "yolov3_r50_quant_aware.tgz") endif() inference_analysis_test(trt_quant_int8_yolov3_r50_test SRCS trt_quant_int8_yolov3_r50_test.cc @@ -448,12 +451,12 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_QUANT_YOLOV3_DIR}) set(TEST_TRT_DYNAMIC_MODEL2 "${TRT_MODEL_INSTALL_DIR}/complex_model_dynamic") - if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL2}) + if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL2}/complex_model_dynamic2.tar.gz) inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL2} ${INFERENCE_URL}/tensorrt_test "complex_model_dynamic2.tar.gz") endif() set(TEST_TRT_DYNAMIC_MODEL "${TRT_MODEL_INSTALL_DIR}/conv_bn_swish_split_gelu") - if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL}) + if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL}/conv_bn_swish_split_gelu.tar.gz) inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL} ${INFERENCE_URL}/tensorrt_test "conv_bn_swish_split_gelu.tar.gz") endif() inference_analysis_test(trt_dynamic_shape_test SRCS trt_dynamic_shape_test.cc @@ -461,7 +464,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}) set(TEST_TRT_ERNIE_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test") - if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}) + if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4.tar.gz) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4.tar.gz") endif() @@ -470,7 +473,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4) set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/") - if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}) + if (NOT EXISTS 
${TEST_TRT_ERNIE_UNSER_MODEL}/ernie_model_4_unserialized.tgz) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc index da0c93d21b7852e06b6805230078540063c2b243..c60e0a25f28c01c453276a8ef04eb79b35b7dda2 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_gpu_tester.cc @@ -54,9 +54,6 @@ TEST(PD_AnalysisConfig, use_gpu) { PD_SwitchIrOptim(config, true); bool ir_optim = PD_IrOptim(config); CHECK(ir_optim) << "NO"; - PD_EnableMkldnnBfloat16(config); - bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config); - CHECK(!bfloat16_enable) << "NO"; PD_EnableTensorRtEngine(config, 1 << 20, 1, 3, Precision::kFloat32, false, false); bool trt_enable = PD_TensorrtEngineEnabled(config); diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc index 7e5dfa2424dbca4fb3a8a08e3d7fa7fbc3060d3d..524e08891f4e90d8a322822e26d75689526d30f5 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc @@ -90,7 +90,6 @@ void trt_ernie(bool with_fp16, std::vector result) { config.SwitchUseFeedFetchOps(false); - int head_number = 12; int batch = 1; int min_seq_len = 1; int max_seq_len = 128; @@ -104,17 +103,17 @@ void trt_ernie(bool with_fp16, std::vector result) { {"read_file_0.tmp_0", min_shape}, {"read_file_0.tmp_1", min_shape}, {"read_file_0.tmp_2", min_shape}, - {"stack_0.tmp_0", {batch, head_number, min_seq_len, min_seq_len}}}; + {"matmul_0.tmp_0", {batch, min_seq_len, min_seq_len}}}; std::map> max_input_shape = { {"read_file_0.tmp_0", max_shape}, {"read_file_0.tmp_1", max_shape}, {"read_file_0.tmp_2", max_shape}, - {"stack_0.tmp_0", {batch, head_number, max_seq_len, max_seq_len}}}; + {"matmul_0.tmp_0", {batch, max_seq_len, max_seq_len}}}; std::map> opt_input_shape = { {"read_file_0.tmp_0", opt_shape}, {"read_file_0.tmp_1", opt_shape}, {"read_file_0.tmp_2", opt_shape}, - {"stack_0.tmp_0", {batch, head_number, opt_seq_len, opt_seq_len}}}; + {"matmul_0.tmp_0", {batch, opt_seq_len, opt_seq_len}}}; auto precision = AnalysisConfig::Precision::kFloat32; if (with_fp16) { diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc index c99ebcdcb5f319f73b7fd931d13f27684db39cad..17fedc3d3b8bb8451fac76f6c7dec4ac057fd1d2 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc @@ -90,7 +90,6 @@ void trt_ernie(bool with_fp16, std::vector result) { config.SwitchUseFeedFetchOps(false); - int head_number = 12; int batch = 1; int min_seq_len = 1; int max_seq_len = 128; @@ -104,17 +103,17 @@ void trt_ernie(bool with_fp16, std::vector result) { {"read_file_0.tmp_0", min_shape}, {"read_file_0.tmp_1", min_shape}, {"read_file_0.tmp_2", min_shape}, - {"stack_0.tmp_0", {batch, head_number, min_seq_len, min_seq_len}}}; + {"matmul_0.tmp_0", {batch, min_seq_len, min_seq_len}}}; std::map> max_input_shape = { {"read_file_0.tmp_0", max_shape}, {"read_file_0.tmp_1", max_shape}, {"read_file_0.tmp_2", max_shape}, - {"stack_0.tmp_0", {batch, head_number, max_seq_len, 
max_seq_len}}}; + {"matmul_0.tmp_0", {batch, max_seq_len, max_seq_len}}}; std::map> opt_input_shape = { {"read_file_0.tmp_0", opt_shape}, {"read_file_0.tmp_1", opt_shape}, {"read_file_0.tmp_2", opt_shape}, - {"stack_0.tmp_0", {batch, head_number, opt_seq_len, opt_seq_len}}}; + {"matmul_0.tmp_0", {batch, opt_seq_len, opt_seq_len}}}; auto precision = AnalysisConfig::Precision::kFloat32; if (with_fp16) { diff --git a/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc b/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc index ca5cdbbcb26c81d028d87a7ec186c73d6e2b7fde..6adf3cf743b0e3c303b28c1e4cdc3b4be376bf32 100644 --- a/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc +++ b/paddle/fluid/inference/tests/api/trt_quant_int8_test.cc @@ -25,12 +25,20 @@ namespace inference { TEST(quant_int8, resnet50) { std::string model_dir = FLAGS_infer_model; AnalysisConfig config; - config.EnableUseGpu(100, 0); + config.EnableUseGpu(1000, 0); config.SetModel(model_dir); config.SwitchUseFeedFetchOps(false); config.EnableTensorRtEngine(1 << 30, 1, 1, AnalysisConfig::Precision::kInt8, false, false); + std::map> min_input_shape = { + {"image", {1, 1, 3, 3}}}; + std::map> max_input_shape = { + {"image", {1, 1, 10, 10}}}; + std::map> opt_input_shape = { + {"image", {1, 1, 3, 3}}}; + config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, + opt_input_shape); auto predictor = CreatePaddlePredictor(config); auto input_names = predictor->GetInputNames(); int channels = 1; diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index b9f979f96d4b106642795151fb8e34b025b2caef..8bc10f2147fa29102b242ce22e78a88453d6cee4 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -45,7 +45,7 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") -if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}) +if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}/word2vec.inference.model.tar.gz) inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") endif() set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 81bb6881fae69b7af494449164f4fed35ade24da..5a3660cee85762f3d76129dfb694eeb6d87bb52c 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -244,7 +244,6 @@ UNUSED constexpr char CosDoc[] = R"DOC( Cosine Operator. Computes cosine of x element-wise. Input range is `(-inf, inf)` and output range is `[-1,1]`. -Return `nan` if input is out of boundary. $$out = cos(x)$$ @@ -342,7 +341,9 @@ $$out = \cos^{-1}(x)$$ class AsinOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "Input of asin operator"); + AddInput("X", + "Input of asin operator, an N-D Tensor, with data type float32, " + "float64 or float16."); AddOutput("Out", "Output of asin operator"); AddComment(R"DOC( Arcsine Operator. @@ -356,7 +357,9 @@ $$out = \sin^{-1}(x)$$ class AtanOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "Input of atan operator"); + AddInput("X", + "Input of atan operator, an N-D Tensor, with data type float32, " + "float64 or float16."); AddOutput("Out", "Output of atan operator"); AddComment(R"DOC( Arctangent Operator. 
@@ -1252,4 +1255,14 @@ REGISTER_OP_VERSION(hard_shrink) "((x < -threshold) + (x > threshold)); after checkpoint: out = " "x * (((x < -threshold) + (x > threshold)) > 0)")); +REGISTER_OP_VERSION(softplus) + .AddCheckpoint( + R"ROC(add new attributes [beta] and [threshold], and the formula is changed to " + " softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\ \\text{For numerical" + " stability, the implementation reverts to the linear function when: beta * x > threshold.})ROC", + paddle::framework::compatible::OpVersionDesc() + .NewAttr("beta", "The beta value of the new formula", 1.0f) + .NewAttr("threshold", "The threshold value of the new formula", + 20.0f)); + /* ========================================================================== */ diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index fd7fa17ac9ae5e540176bb583cf87fa3d00d2945..a82134921ef64f89151eb9c521ea3cbb6f83ee7b 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" REGISTER_OPERATOR( @@ -31,3 +32,20 @@ REGISTER_OP_CPU_KERNEL( int16_t>, paddle::operators::ArgMaxKernel); +REGISTER_OP_VERSION(arg_max) + .AddCheckpoint( + R"ROC( + Upgrade argmax add a new attribute [flatten] and modify the attribute of dtype)ROC", + paddle::framework::compatible::OpVersionDesc() + .NewAttr("flatten", + "In order to compute the argmax over the flattened array " + "when the " + "argument `axis` in python API is None.", + false) + .ModifyAttr( + "dtype", + "change the default value of dtype, the older version " + "is -1, means return the int64 indices." + "The new version is 3, return the int64 indices directly." 
+ "And supporting the dtype of -1 in new version.", + 3)); diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h index ae3637f6f99783d70bd57a3935a979b0387692de..69365357084b660b7c2f90149fe250854ea6a014 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.h +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -70,6 +70,8 @@ struct VisitDataArgMinMaxFunctor { auto axis = ctx.Attr("axis"); auto keepdims = ctx.Attr("keepdims"); const bool& flatten = ctx.Attr("flatten"); + // paddle do not have the scalar tensor, just return the shape [1] tensor + if (flatten) keepdims = true; // if flatten, will construct the new dims for the cacluate framework::DDim x_dims; @@ -164,15 +166,30 @@ class ArgMinMaxOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size())); + auto x_rank = x_dims.size(); + if (axis < 0) axis += x_rank; + if (ctx->IsRuntime()) { + const int& dtype = ctx->Attrs().Get("dtype"); + if (dtype == framework::proto::VarType::INT32) { + int64_t all_element_num = 0; + if (flatten) { + all_element_num = framework::product(x_dims); + + } else { + all_element_num = x_dims[axis]; + } + PADDLE_ENFORCE_LE( + all_element_num, INT_MAX, + "The element num of the argmin/argmax input at axis is " + "%d, is larger than int32 maximum value:%d, you must " + "set the dtype of argmin/argmax to 'int64'.", + all_element_num, INT_MAX); + } + } std::vector vec; if (flatten) { - // if is flatten, will return the only on element - if (keepdims) { - vec.emplace_back(static_cast(1)); - } + vec.emplace_back(static_cast(1)); } else { - auto x_rank = x_dims.size(); - if (axis < 0) axis += x_rank; for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); if (keepdims) { vec.emplace_back(static_cast(1)); @@ -194,10 +211,14 @@ class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "Output tensor."); AddAttr("axis", "The axis in which to compute the arg indics."); AddAttr("keepdims", "Keep the dim that to reduce.").SetDefault(false); - AddAttr("dtype", "Keep the dim that to reduce.").SetDefault(-1); AddAttr("flatten", "Flatten the input value, and search the min or max indices") .SetDefault(false); + AddAttr("dtype", + "(int, 3), the dtype of indices, the indices dtype must be " + "int32, int64." + "default dtype is int64, and proto value is 3.") + .SetDefault(3); AddComment(string::Sprintf(R"DOC( %s Operator. diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc index 74fc3292746d26a983fa81ed8cac67b30e23d476..23ed7d727c536225a98a1ea9e6e3af723b4352c3 100644 --- a/paddle/fluid/operators/arg_min_op.cc +++ b/paddle/fluid/operators/arg_min_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" REGISTER_OPERATOR( @@ -31,3 +32,20 @@ REGISTER_OP_CPU_KERNEL( int16_t>, paddle::operators::ArgMinKernel); +REGISTER_OP_VERSION(arg_min) + .AddCheckpoint( + R"ROC( + Upgrade argmin add a new attribute [flatten] and modify the attribute of dtype)ROC", + paddle::framework::compatible::OpVersionDesc() + .NewAttr("flatten", + "In order to compute the argmin over the flattened array " + "when the " + "argument `axis` in python API is None.", + false) + .ModifyAttr( + "dtype", + "change the default value of dtype, the older version " + "is -1, means return the int64 indices." + "The new version is 3, return the int64 indices directly." + "And supporting the dtype of -1 in new version.", + 3)); diff --git a/paddle/fluid/operators/bernoulli_op.cu b/paddle/fluid/operators/bernoulli_op.cu index d0837071d456068f64ebc74b115f1a7904eba41c..f665d2dd0e991847de2ad35bf6b18741fb3a6e26 100644 --- a/paddle/fluid/operators/bernoulli_op.cu +++ b/paddle/fluid/operators/bernoulli_op.cu @@ -31,6 +31,10 @@ struct BernoulliCudaFunctor { __host__ __device__ BernoulliCudaFunctor(int seed) : seed_(seed) {} __host__ __device__ T operator()(const unsigned int n, const T p) const { + // NOTE(zhiqiu): currently, PADDLE_ENFORCE in cuda kernel may print several + // lines of error messages if, and it should be refined. + PADDLE_ENFORCE(p >= 0.0 && p <= 1.0, + "The probability should be >=0 and <= 1, but got %f", p); thrust::minstd_rand rng; rng.seed(seed_); thrust::uniform_real_distribution dist(0.0, 1.0); diff --git a/paddle/fluid/operators/bernoulli_op.h b/paddle/fluid/operators/bernoulli_op.h index 06a83ada17bb926d6f7d4eef10750986d00f048c..40f285d11f194057d950f45798bea07439398ab0 100644 --- a/paddle/fluid/operators/bernoulli_op.h +++ b/paddle/fluid/operators/bernoulli_op.h @@ -25,10 +25,12 @@ namespace operators { template inline HOSTDEVICE T BernoulliFunctor(T p, T rand) { - PADDLE_ENFORCE_LE(p, 1, platform::errors::OutOfRange( - "The probability should be <= 1, but got %f", p)); - PADDLE_ENFORCE_GE(p, 0, platform::errors::OutOfRange( - "The probability should be >= 1, but got %f", p)); + PADDLE_ENFORCE_LE(p, 1.0, + platform::errors::OutOfRange( + "The probability should be <= 1, but got %f", p)); + PADDLE_ENFORCE_GE(p, 0.0, + platform::errors::OutOfRange( + "The probability should be >= 0, but got %f", p)); return static_cast(rand < p); } diff --git a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc index 8c093d12585981ee681ae13f0d2e493197c6b9b3..6dfa2670c140fcfb4c409c0f9e9cef49c02a7064 100644 --- a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc +++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc @@ -25,25 +25,32 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInputs("Ids"), - "Input(Ids) of LookupTableOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("W"), - "Input(W) of LookupTableOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutputs("Outputs"), - "Output(Outs) of LookupTableOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInputs("Ids"), true, + platform::errors::InvalidArgument( + "Input(Ids) of LookupTableOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true, 
+ platform::errors::InvalidArgument( + "Input(W) of LookupTableOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutputs("Outputs"), true, + platform::errors::InvalidArgument( + "Output(Outs) of LookupTableOp should not be null.")); auto ids_dims = ctx->GetInputsDim("Ids"); auto table_dims = ctx->GetInputDim("W"); - PADDLE_ENFORCE_EQ(table_dims.size(), 2, - "Only 2 dimensions of the 'Embedding' is supported."); + PADDLE_ENFORCE_EQ( + table_dims.size(), 2, + platform::errors::InvalidArgument( + "Only 2 dimensions of the 'Embedding' is supported.")); for (auto &ids_dim : ids_dims) { PADDLE_ENFORCE_EQ(ids_dim.size(), 2, - "The dimension of the 'Ids' tensor must be 2."); + platform::errors::InvalidArgument( + "The dimension of the 'Ids' tensor must be 2.")); } auto endpoints = ctx->Attrs().Get>("endpoints"); + // for fluid.embedding auto lookup_table_version = ctx->Attrs().Get("lookup_table_version"); diff --git a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h index a71451c78a870b71c05b41bdcfb34a85b3e2213b..6387120bc87fc94f40574a3ab7f0aabc98f41e95 100644 --- a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h +++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h @@ -35,9 +35,30 @@ class DistributedLookupTableKernel : public framework::OpKernel { auto endpoints = context.Attr>("endpoints"); auto is_distributed = context.Attr("is_distributed"); + auto lookup_table_version = + context.Attr("lookup_table_version"); + operators::distributed::prefetchs(id_names, out_names, embedding_name, is_distributed, lookup_tables, endpoints, context, context.scope()); + + if (lookup_table_version == "lookup_table_v2") { + auto &scope = context.scope(); + auto emb_dim = + scope.FindVar(embedding_name)->Get().dims()[1]; + + for (size_t i = 0; i < id_names.size(); ++i) { + auto *id_var = scope.FindVar(id_names[i]); + auto *out_var = scope.FindVar(out_names[i]); + auto *id_tensor = id_var->GetMutable(); + auto *out_tensor = out_var->GetMutable(); + + auto id_dims = id_tensor->dims(); + out_tensor->Resize(framework::make_ddim( + {static_cast(id_dims[0]), static_cast(id_dims[1]), + static_cast(emb_dim)})); + } + } } }; diff --git a/paddle/fluid/operators/load_combine_op.h b/paddle/fluid/operators/load_combine_op.h index 1b4db94b298c53382ee4c657e24b1b6fe6b7f62b..589df8821b3e7fc034df7504fd8d4ce802cc4ecb 100644 --- a/paddle/fluid/operators/load_combine_op.h +++ b/paddle/fluid/operators/load_combine_op.h @@ -70,6 +70,7 @@ class LoadCombineOpKernel : public framework::OpKernel { auto out_vars = context.MultiOutputVar("Out"); for (size_t i = 0; i < out_var_names.size(); i++) { + VLOG(4) << "loading tensor: " << out_var_names[i]; PADDLE_ENFORCE_NOT_NULL( out_vars[i], platform::errors::InvalidArgument( "The variable %s to be loaded cannot be found.", diff --git a/paddle/fluid/operators/lookup_table_v2_op.cc b/paddle/fluid/operators/lookup_table_v2_op.cc index 122e01f146ccddbdc8e72aba67d47855ad30b0eb..4a6680d76c4de7f7f47445b593b1cf50cd6e1311 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cc +++ b/paddle/fluid/operators/lookup_table_v2_op.cc @@ -15,8 +15,8 @@ limitations under the License. 
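The bernoulli_op hunks a little further up tighten the probability check to 0 <= p <= 1 in both the CUDA functor and the host functor (and fix the old ">= 1" wording in the error message). A host-side sketch of the same sampling rule, using the standard library rather than thrust:

#include <random>
#include <stdexcept>

// Validate p, draw u ~ Uniform(0, 1), and return 1 when u < p, matching the
// contract the BernoulliFunctor above now enforces.
int BernoulliSample(float p, std::mt19937* rng) {
  if (p < 0.0f || p > 1.0f) {
    throw std::out_of_range("The probability should be >= 0 and <= 1");
  }
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
  return dist(*rng) < p ? 1 : 0;
}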
*/ #include "paddle/fluid/operators/lookup_table_v2_op.h" #include - #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/var_type_inference.h" namespace paddle { @@ -196,3 +196,14 @@ REGISTER_OP_CPU_KERNEL(lookup_table_v2, ops::LookupTableV2Kernel, REGISTER_OP_CPU_KERNEL(lookup_table_v2_grad, ops::LookupTableV2GradKernel, ops::LookupTableV2GradKernel); + +/* ========================== register checkpoint ===========================*/ +REGISTER_OP_VERSION(lookup_table_v2) + .AddCheckpoint( + R"ROC(fix lookup_table_v2, add input type `int32`)ROC", + paddle::framework::compatible::OpVersionDesc() + .BugfixWithBehaviorChanged("lookup_table_v2 support input type " + "`int64`; after support input type " + "`int32/int64`")); + +/* ========================================================================== */ diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index b3b0f8f1960901226a2f4d5e59e7aac47907a5bf..551f0d3c6412e46deb311fac58e5b9638feb30a6 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -85,6 +85,14 @@ __global__ void LookupTableV2Grad(T *table, const T *output, const int64_t *ids, } } +template +__global__ void InputTypeCovert(const T *in_ids, const int64_t K, + int64_t *out_ids) { + for (int i = 0; i < K; i++) { + out_ids[i] = (int64_t)(in_ids[i]); + } +} + template class LookupTableV2CUDAKernel : public framework::OpKernel { public: @@ -101,23 +109,37 @@ class LookupTableV2CUDAKernel : public framework::OpKernel { size_t D = table_t->dims()[1]; size_t K = ids_t->numel(); - auto *ids = ids_t->data(); - auto *table = table_t->data(); - auto *output = output_t->mutable_data(context.GetPlace()); - dim3 threads(256, 4); dim3 grids(80, 1); + // copy GPU memory to CPU pinned memory + framework::Vector ids; + ids.resize(K); + + const int64_t *ids_p = nullptr; + + if (ids_t->type() == framework::proto::VarType::INT32) { + InputTypeCovert< + int><<>>( + ids_t->data(), K, ids.MutableData(context.GetPlace())); + ids_p = ids.MutableData(context.GetPlace()); + } else { + ids_p = ids_t->data(); + } + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + if (padding_idx == -1) LookupTableV2< T, 256, 4, 80, false><<>>( - output, table, ids, N, K, D, padding_idx); + output, table, ids_p, N, K, D, padding_idx); else LookupTableV2< T, 256, 4, 80, true><<>>( - output, table, ids, N, K, D, padding_idx); + output, table, ids_p, N, K, D, padding_idx); } }; @@ -139,16 +161,24 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel { auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); - + dim3 threads(128, 8); + dim3 grids(8, 1); auto stream = dev_ctx.stream(); // copy GPU memory to CPU pinned memory framework::Vector new_rows; new_rows.resize(ids_num); auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); - // TODO(yuyang18): Strange code here. 
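The lookup_table_v2 checkpoint above records that the op now accepts int32 ids in addition to int64, and the CUDA kernels gain a small element-wise conversion (InputTypeCovert in the hunk) that widens int32 ids before the table lookup. A host-side sketch of the same widening step in plain C++ (the CUDA path does this in a kernel instead):

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <vector>

// Widen 32-bit ids to 64-bit before indexing the embedding table, matching
// the branch taken above when the ids tensor holds INT32 data.
std::vector<int64_t> WidenIds(const int* ids, int64_t n) {
  std::vector<int64_t> out;
  out.reserve(static_cast<size_t>(n));
  std::transform(ids, ids + n, std::back_inserter(out),
                 [](int id) { return static_cast<int64_t>(id); });
  return out;
}

The CPU kernel further down does the same with std::transform into a std::vector, and falls back to TensorToVector when the ids are already int64.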
- memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()), - gpu_place, ids_data, ids_num * sizeof(int64_t), stream); + if (ids->type() == framework::proto::VarType::INT32) { + InputTypeCovert< + int><<>>( + ids->data(), ids_num, + new_rows.MutableData(context.GetPlace())); + } else { + memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()), + gpu_place, ids_data, ids_num * sizeof(int64_t), stream); + } + d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); @@ -177,17 +207,32 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel { int N = d_table_t->dims()[0]; int D = d_table_t->dims()[1]; int K = ids_t->numel(); - const int64_t *ids = ids_t->data(); + + dim3 threads(128, 8); + dim3 grids(8, 1); + // copy GPU memory to CPU pinned memory + framework::Vector ids; + ids.resize(K); + + const int64_t *ids_p = nullptr; + + if (ids_t->type() == framework::proto::VarType::INT32) { + InputTypeCovert< + int><<>>( + ids_t->data(), K, ids.MutableData(context.GetPlace())); + ids_p = ids.MutableData(context.GetPlace()); + } else { + ids_p = ids_t->data(); + } + const T *d_output = d_output_t->data(); T *d_table = d_table_t->mutable_data(context.GetPlace()); auto t = framework::EigenVector::Flatten(*d_table_t); t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); - dim3 threads(128, 8); - dim3 grids(8, 1); LookupTableV2Grad<<>>( - d_table, d_output, ids, N, K, D); + d_table, d_output, ids_p, N, K, D); } } }; diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h index 9aab90d84796ca5c7f37a818595ce87fb3a554b5..092c5f3b03305608f96fcc2834ad74a3388ed7ed 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.h +++ b/paddle/fluid/operators/lookup_table_v2_op.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include #include @@ -45,84 +46,70 @@ class LookupTableV2Kernel : public framework::OpKernel { auto *output_t = context.Output("Out"); // float tensor auto *table_var = context.InputVar("W"); - auto id_name = context.InputNames("Ids").front(); - auto embedding_name = context.InputNames("W").front(); - auto out_name = context.OutputNames("Out").front(); - - // for remote prefetch - auto epmap = context.Attr>("epmap"); - auto remote_prefetch = context.Attr("remote_prefetch"); - auto table_names = context.Attr>("table_names"); + int64_t padding_idx = context.Attr("padding_idx"); + int64_t ids_numel = ids_t->numel(); - if (remote_prefetch && !epmap.empty()) { -// if epmap is not empty, then the parameter will be fetched from remote -// parameter server + std::vector ids; + ids.reserve(ids_numel); -#ifdef PADDLE_WITH_DISTRIBUTE - operators::distributed::prefetch(id_name, out_name, embedding_name, false, - table_names, epmap, context, - context.scope()); -#else - PADDLE_THROW( - "paddle is not compiled with distribute support, can not do " - "parameter prefetch!"); -#endif + if (ids_t->type() == framework::proto::VarType::INT32) { + std::transform(ids_t->data(), ids_t->data() + ids_numel, + std::back_inserter(ids), + [&](int id) { return static_cast(id); }); } else { - int64_t padding_idx = context.Attr("padding_idx"); - int64_t *ids = const_cast(ids_t->data()); - int64_t ids_numel = ids_t->numel(); - - if (table_var->IsType()) { - auto *table_t = context.Input("W"); - int64_t row_number = table_t->dims()[0]; - int64_t row_width = table_t->dims()[1]; - - auto *table = table_t->data(); - auto *output = output_t->mutable_data(context.GetPlace()); - - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx != kNoPadding && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(T)); - } else { - PADDLE_ENFORCE_LT( - ids[i], row_number, - "Variable value (input) of OP(fluid.layers.embedding) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - row_number, ids[i]); - PADDLE_ENFORCE_GE( - ids[i], 0, - "Variable value (input) of OP(fluid.layers.embedding) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - row_number, ids[i]); - memcpy(output + i * row_width, table + ids[i] * row_width, - row_width * sizeof(T)); - } + framework::TensorToVector(*ids_t, &ids); + } + + if (table_var->IsType()) { + auto *table_t = context.Input("W"); + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_LT( + ids[i], row_number, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + row_number, ids[i]); + PADDLE_ENFORCE_GE( + ids[i], 0, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. 
Please check input " + "value.", + row_number, ids[i]); + memcpy(output + i * row_width, table + ids[i] * row_width, + row_width * sizeof(T)); } - } else if (table_var->IsType()) { - const auto &table_t = table_var->Get(); - int64_t row_width = table_t.value().dims()[1]; - const auto *table = table_t.value().data(); - auto *output = output_t->mutable_data(context.GetPlace()); - - auto blas = math::GetBlas(context); - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx != kNoPadding && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(T)); - } else { - PADDLE_ENFORCE_GE( - ids[i], 0, - "Variable value (input) of OP(fluid.layers.embedding) " - "expected >= 0. But received %ld", - ids[i]); - auto id_index = table_t.Index(ids[i]); - PADDLE_ENFORCE_GE( - id_index, 0, "the input key should be exists. But received %d.", - id_index); - blas.VCOPY(row_width, table + id_index * row_width, - output + i * row_width); - } + } + } else if (table_var->IsType()) { + const auto &table_t = table_var->Get(); + int64_t row_width = table_t.value().dims()[1]; + const auto *table = table_t.value().data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + auto blas = math::GetBlas(context); + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_GE( + ids[i], 0, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0. But received %ld", + ids[i]); + auto id_index = table_t.Index(ids[i]); + PADDLE_ENFORCE_GE(id_index, 0, + "the input key should be exists. But received %d.", + id_index); + blas.VCOPY(row_width, table + id_index * row_width, + output + i * row_width); } } } @@ -151,17 +138,23 @@ class LookupTableV2GradKernel : public framework::OpKernel { // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. 
if (is_sparse) { - auto *ids = context.Input("Ids"); + auto *ids_t = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); auto *d_table = context.Output(framework::GradVarName("W")); + int64_t ids_num = ids_t->numel(); + + std::vector ids; + ids.reserve(ids_num); - auto *ids_data = ids->data(); - int64_t ids_num = ids->numel(); + if (ids_t->type() == framework::proto::VarType::INT32) { + std::transform(ids_t->data(), ids_t->data() + ids_num, + std::back_inserter(ids), + [&](int id) { return static_cast(id); }); + } else { + framework::TensorToVector(*ids_t, &ids); + } - std::vector new_rows; - new_rows.resize(ids_num); - std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t)); - d_table->set_rows(new_rows); + d_table->set_rows(ids); auto *d_table_value = d_table->mutable_value(); d_table_value->Resize({ids_num, table_dim[1]}); @@ -185,11 +178,23 @@ class LookupTableV2GradKernel : public framework::OpKernel { memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); } else { - auto *ids = context.Input("Ids"); + auto *ids_t = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); auto *d_table = context.Output(framework::GradVarName("W")); + int64_t ids_num = ids_t->numel(); + + std::vector ids; + ids.reserve(ids_num); + + if (ids_t->type() == framework::proto::VarType::INT32) { + std::transform(ids_t->data(), ids_t->data() + ids_num, + std::back_inserter(ids), + [&](int id) { return static_cast(id); }); + } else { + framework::TensorToVector(*ids_t, &ids); + } - auto *ids_data = ids->data(); + auto *ids_data = ids.data(); int64_t N = table_dim[0]; int64_t D = table_dim[1]; @@ -199,7 +204,7 @@ class LookupTableV2GradKernel : public framework::OpKernel { memset(d_table_data, 0, d_table->numel() * sizeof(T)); - for (int64_t i = 0; i < ids->numel(); ++i) { + for (int64_t i = 0; i < ids_num; ++i) { if (padding_idx != kNoPadding && ids_data[i] == padding_idx) { // the gradient of padding_idx should be 0, already done by memset, so // do nothing. 
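Taken together, the rewritten CPU forward kernel above widens the ids, then copies one table row per id and zero-fills the rows whose id equals padding_idx, with bounds checks on the id range. A self-contained sketch of that lookup for a dense, row-major table; representing "no padding" as -1 is an assumption made here for illustration, not the framework's kNoPadding constant:

#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <vector>

// output row i = table row ids[i], or all zeros when ids[i] == padding_idx,
// mirroring the memcpy/memset branches in the kernel above.
std::vector<float> EmbeddingLookup(const std::vector<float>& table,
                                   int64_t row_number, int64_t row_width,
                                   const std::vector<int64_t>& ids,
                                   int64_t padding_idx /* -1: no padding */) {
  std::vector<float> output(ids.size() * row_width);
  for (size_t i = 0; i < ids.size(); ++i) {
    if (padding_idx != -1 && ids[i] == padding_idx) {
      std::memset(output.data() + i * row_width, 0, row_width * sizeof(float));
      continue;
    }
    if (ids[i] < 0 || ids[i] >= row_number) {
      throw std::out_of_range("embedding id must be in [0, row_number)");
    }
    std::memcpy(output.data() + i * row_width,
                table.data() + ids[i] * row_width, row_width * sizeof(float));
  }
  return output;
}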
diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 5ca9216d0c8d6b3f773a1eb1a0cec216ca6ed4f3..487deb11b48687a91174c8d9baf072a5ca929de8 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -33,10 +33,12 @@ class MKLDNNActivationKernel public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); - PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN, - "Wrong layout set for X tensor"); - PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for X tensor"); + PADDLE_ENFORCE_EQ( + x->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for X tensor")); + PADDLE_ENFORCE_NE( + x->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for X tensor")); Functor functor; functor(ctx); @@ -50,9 +52,11 @@ class MKLDNNActivationGradKernel void Compute(const framework::ExecutionContext &ctx) const override { const auto *diff_y = ctx.Input(framework::GradVarName("Out")); PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN, - "Wrong layout set for Input OutGrad tensor"); + platform::errors::InvalidArgument( + "Wrong layout set for Input OutGrad tensor")); PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for Input OutGrad tensor"); + platform::errors::InvalidArgument( + "Wrong format set for Input OutGrad tensor")); Functor functor; functor(ctx); @@ -82,7 +86,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx, PADDLE_ENFORCE( x->dims().size() == 2 || x->dims().size() == 3 || x->dims().size() == 4, - "Input dim must be with 2, 3 or 4"); + platform::errors::Unimplemented("Input dim must be with 2, 3 or 4")); auto src_tz = framework::vectorize(x->dims()); diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index 33cf00b2c01da8e346e4c7e6be81fce3fd47f54f..8a02a697cbb21b28e14f19c6202ae0777b5102de 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -262,9 +262,11 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto *diff_shift = ctx.Output(framework::GradVarName("Bias")); PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN, - "Wrong layout set for Input diff_y tensor"); + platform::errors::InvalidArgument( + "Wrong layout set for Input diff_y tensor")); PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for Input diff_y tensor"); + platform::errors::InvalidArgument( + "Wrong format set for Input diff_y tensor")); auto src_tz = paddle::framework::vectorize(x->dims()); auto scale_tz = paddle::framework::vectorize(scale->dims()); diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 40f64800a0b81a161805857cb3e0a3855f386720..3cafb0e9fc6147626f066bbeba1b10d074a37b87 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -30,10 +30,12 @@ using platform::to_void_cast; static void EnforceLayouts(const std::vector inputs) { for (auto* input : inputs) { - PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, - "Wrong layout set for Input tensor"); - PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for Input 
tensor"); + PADDLE_ENFORCE_EQ( + input->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for Input tensor")); + PADDLE_ENFORCE_NE( + input->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for Input tensor")); } } @@ -49,7 +51,7 @@ static platform::CPUPlace GetCpuPlace( const paddle::framework::ExecutionContext& ctx) { auto place = ctx.GetPlace(); PADDLE_ENFORCE(paddle::platform::is_cpu_place(place), - "It must use CPUPlace."); + platform::errors::InvalidArgument("It must use CPUPlace.")); return BOOST_GET_CONST(platform::CPUPlace, place); } diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 7d99bb7d2b7a7049c67788df4c507afc14880815..aba32c6ec92854f3ea9248d92f7e435551ae83b0 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -561,7 +561,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ( !fuse_residual_conn || !force_fp32_output, true, - "residual fusion does not support force output with fp32"); + platform::errors::Unimplemented( + "residual fusion does not support force output with fp32")); auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; @@ -625,7 +626,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 && dilations[2] == 1 : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, - true, "dilation in convolution is not implemented yet"); + true, platform::errors::Unimplemented( + "dilation in convolution is not implemented yet")); const K* filter_data = filter->data(); auto scale_in_data = ctx.Attr("Scale_in"); @@ -887,7 +889,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { "The output_grad tensor's layout should be %d, but got %d.", DataLayout::kMKLDNN, output_grad->layout())); PADDLE_ENFORCE_NE(output_grad->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for output_grad tensor"); + platform::errors::InvalidArgument( + "Wrong format set for output_grad tensor")); PADDLE_ENFORCE_EQ( ctx.Attr("is_test"), false, diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 48279658c80e93428f940c40e61d7b9af23f4ee3..56537900216a8a4e4e96791123c7d50da621ab62 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -117,7 +117,8 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE( dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, - "dilation in convolution is not implemented yet"); + platform::errors::Unimplemented( + "dilation in convolution is not implemented yet")); const T* input_data = input->data(); const T* filter_data = filter->data(); diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index 2a8b332521804ccebdbd4e6914b2763abfb5dbdc..9df30b3295c00e69a956ee84770dfeb19a83487c 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -83,19 +83,24 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); Tensor* in_x_grad = ctx.Output(framework::GradVarName("X")); - PADDLE_ENFORCE_EQ(in_x->layout(), 
DataLayout::kMKLDNN, - "Wrong layout set for Input tensor"); - PADDLE_ENFORCE_NE(in_x->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for Input tensor"); + PADDLE_ENFORCE_EQ( + in_x->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for Input tensor")); + PADDLE_ENFORCE_NE( + in_x->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for Input tensor")); PADDLE_ENFORCE_EQ(out_grad->layout(), DataLayout::kMKLDNN, - "Wrong layout set for Input output_grad tensor"); + platform::errors::InvalidArgument( + "Wrong layout set for Input output_grad tensor")); PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for Input output_grad tensor"); + platform::errors::InvalidArgument( + "Wrong format set for Input output_grad tensor")); PADDLE_ENFORCE_EQ( ctx.Attr("is_test"), false, - "is_test attribute should be set to False in training phase."); + platform::errors::InvalidArgument( + "is_test attribute should be set to False in training phase.")); std::string pooling_type = ctx.Attr("pooling_type"); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 4d825e4ee279bc2c505cfabff1917d1a5319d1dd..5014381a4e215917883f45288de4482db5cbf79c 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -140,7 +140,8 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ( dout->dims(), dx->dims(), - "The shape of softmax_grad's input and output must be identical."); + platform::errors::InvalidArgument( + "The shape of softmax_grad's input and output must be identical.")); auto dims = dout->dims(); // input and output share the same shape const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc index ad3e5543f10ae05865565110ba2231c897c205b8..94e54266f0f922efef5ea4a1b23338b6ce02d131 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc @@ -12,60 +12,90 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { namespace operators { +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using DataLayout = platform::DataLayout; using Tensor = framework::Tensor; +static inline int SizeOutAxis(const int axis, DDim dims) { + int size = 1; + for (int i = axis + 1; i < dims.size(); i++) { + size *= dims[i]; + } + return size; +} + template class SoftmaxCUDNNKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Output("Out"); - - // allocate memory on device. 
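The cuDNN softmax rewrite that starts above drops the old flatten-to-2D path: it computes N as the product of the dimensions before axis, D (SizeOutAxis) as the product after, views the tensor as [N, dim, D, 1], and selects INSTANCE vs CHANNEL mode depending on whether axis is the last dimension. A self-contained CPU sketch of softmax over an arbitrary axis using the same N/dim/D factorization; the max-subtraction is a standard stability trick added here for illustration, not copied from the kernel:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Softmax along `axis` of a row-major tensor with shape `dims`, treated as
// [N, dim, D] with N = prod(dims[:axis]), dim = dims[axis], D = prod(dims[axis+1:]).
std::vector<float> SoftmaxAlongAxis(const std::vector<float>& x,
                                    const std::vector<int64_t>& dims, int axis) {
  int64_t N = 1, D = 1;
  for (int i = 0; i < axis; ++i) N *= dims[i];
  const int64_t dim = dims[axis];
  for (size_t i = axis + 1; i < dims.size(); ++i) D *= dims[i];

  std::vector<float> out(x.size());
  for (int64_t n = 0; n < N; ++n) {
    for (int64_t d = 0; d < D; ++d) {
      const int64_t base = n * dim * D + d;  // elements of one group are strided by D
      float max_v = x[base];
      for (int64_t k = 1; k < dim; ++k) max_v = std::max(max_v, x[base + k * D]);
      float sum = 0.f;
      for (int64_t k = 0; k < dim; ++k) {
        out[base + k * D] = std::exp(x[base + k * D] - max_v);
        sum += out[base + k * D];
      }
      for (int64_t k = 0; k < dim; ++k) out[base + k * D] /= sum;
    }
  }
  return out;
}

Removing the use_cudnn/axis restriction in softmax_op.cc (further down in this patch) is what makes the per-axis handling necessary: the cuDNN kernel can no longer assume axis == rank - 1.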
- Out->mutable_data(context.GetPlace()); - - auto dims = X->dims(); - auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); - framework::LoDTensor flattened_x; - framework::LoDTensor flattened_out; - flattened_x.ShareDataWith(*X).Resize(flattened_dims); - flattened_out.ShareDataWith(*Out).Resize(flattened_dims); - - math::SoftmaxCUDNNFunctor()( - context.template device_context(), - &flattened_x, &flattened_out); + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + auto* out_data = out->data(); + + auto dims = x->dims(); + const int rank = dims.size(); + const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int dim = dims[axis]; + const int N = SizeToAxis(axis, dims); + const int D = SizeOutAxis(axis, dims); + + ScopedTensorDescriptor desc; + std::vector tensor_dims = {N, dim, D, 1}; + DataLayout layout = DataLayout::kNCHW; + cudnnTensorDescriptor_t desc_ = desc.descriptor(layout, tensor_dims); + + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE + : CUDNN_SOFTMAX_MODE_CHANNEL; + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( + handle, CUDNN_SOFTMAX_ACCURATE, mode, + platform::CudnnDataType::kOne(), desc_, x->data(), + platform::CudnnDataType::kZero(), desc_, out_data)); } }; template class SoftmaxGradCUDNNKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* Out = context.Input("Out"); - auto* dOut = context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); - - // allocate memory on device. - dX->mutable_data(context.GetPlace()); - - auto dims = Out->dims(); - auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); - framework::LoDTensor flattened_out; - framework::LoDTensor flattened_d_out; - framework::LoDTensor flattened_d_x; - flattened_out.ShareDataWith(*Out).Resize(flattened_dims); - flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims); - flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims); - - math::SoftmaxGradCUDNNFunctor()( - context.template device_context(), - &flattened_out, &flattened_d_out, &flattened_d_x); + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + auto* dx_data = dx->data(); + + auto dims = out->dims(); + const int rank = dims.size(); + const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int dim = dims[axis]; + const int N = SizeToAxis(axis, dims); + const int D = SizeOutAxis(axis, dims); + + ScopedTensorDescriptor desc; + std::vector tensor_dims = {N, dim, D, 1}; + DataLayout layout = DataLayout::kNCHW; + cudnnTensorDescriptor_t desc_ = desc.descriptor(layout, tensor_dims); + + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto mode = axis == rank - 1 ? 
CUDNN_SOFTMAX_MODE_INSTANCE + : CUDNN_SOFTMAX_MODE_CHANNEL; + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxBackward( + handle, CUDNN_SOFTMAX_ACCURATE, mode, + platform::CudnnDataType::kOne(), desc_, out->data(), desc_, + dout->data(), platform::CudnnDataType::kZero(), desc_, dx_data)); } }; diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 2a6ca7975f0c591701400e71feab0be36300480b..cf46b4fc3bdad486f65afa5ac994d506c20344cb 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -53,13 +53,6 @@ class SoftmaxOp : public framework::OperatorWithKernel { "Attr(axis) value should be in range [-R, R-1], " "R is the rank of Input(X).")); - auto use_cudnn = ctx->Attrs().Get("use_cudnn"); - if (axis != rank_x - 1 && axis != -1) { - PADDLE_ENFORCE_EQ(use_cudnn, false, - platform::errors::InvalidArgument( - "CUDNN kernel only support axis as -1.")); - } - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ "Out"); } diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index c33e7c6068648d019a38450a92fec79032411598..ee1361e3618302816200efc759ebd18ee05c9274 100644 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/unsqueeze_op.h" + #include #include #include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -327,6 +329,7 @@ REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp, REGISTER_OP_CPU_KERNEL( unsqueeze, ops::UnsqueezeKernel, ops::UnsqueezeKernel, + ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel); @@ -334,12 +337,14 @@ REGISTER_OP_CPU_KERNEL( unsqueeze_grad, ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel); REGISTER_OP_CPU_KERNEL( unsqueeze2, ops::UnsqueezeKernel, ops::UnsqueezeKernel, + ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel); @@ -347,6 +352,7 @@ REGISTER_OP_CPU_KERNEL( unsqueeze2_grad, ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel); diff --git a/paddle/fluid/operators/unsqueeze_op.cu.cc b/paddle/fluid/operators/unsqueeze_op.cu.cc index 3258de53b8b7cda994c9555bf6a62502f3c04c23..0e8f47a692380cc96a371bb7a5319af89a3d28c4 100644 --- a/paddle/fluid/operators/unsqueeze_op.cu.cc +++ b/paddle/fluid/operators/unsqueeze_op.cu.cc @@ -21,6 +21,7 @@ REGISTER_OP_CUDA_KERNEL( unsqueeze, ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel, + ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel); @@ -30,6 +31,7 @@ REGISTER_OP_CUDA_KERNEL( ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel, ops::UnsqueezeGradKernel); @@ -38,6 +40,7 @@ REGISTER_OP_CUDA_KERNEL( ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel, + ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel); @@ -47,6 +50,7 @@ REGISTER_OP_CUDA_KERNEL( ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel, ops::Unsqueeze2GradKernel, 
ops::Unsqueeze2GradKernel); diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc index 4b72b09adddf24f63814e8e4872af289b38bcb44..1e70bd9381b9d683af82f77959db9ad680f06bd3 100644 --- a/paddle/fluid/pybind/fleet_wrapper_py.cc +++ b/paddle/fluid/pybind/fleet_wrapper_py.cc @@ -57,7 +57,11 @@ void BindFleetWrapper(py::module* m) { .def("get_cache_threshold", &framework::FleetWrapper::GetCacheThreshold) .def("cache_shuffle", &framework::FleetWrapper::CacheShuffle) .def("save_cache", &framework::FleetWrapper::SaveCache) + .def("save_model_with_whitelist", + &framework::FleetWrapper::SaveWithWhitelist) .def("load_model", &framework::FleetWrapper::LoadModel) + .def("load_table_with_whitelist", + &framework::FleetWrapper::LoadWithWhitelist) .def("clear_model", &framework::FleetWrapper::ClearModel) .def("clear_one_table", &framework::FleetWrapper::ClearOneTable) .def("stop_server", &framework::FleetWrapper::StopServer) diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index f1084018d9c79e46c33098dafdb48dc395dac652..318178d5eb927e45fa6472a695ce57f4b2a058b8 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -334,8 +334,7 @@ void BindGlobalValueGetterSetter(pybind11::module *module) { } while (0) static void RegisterGlobalVarGetterSetter() { - REGISTER_PRIVATE_GLOBAL_VAR(/*is_writable=*/false, FLAGS_use_mkldnn, - FLAGS_free_idle_chunk, + REGISTER_PRIVATE_GLOBAL_VAR(/*is_writable=*/false, FLAGS_free_idle_chunk, FLAGS_free_when_no_cache_hit); REGISTER_PUBLIC_GLOBAL_VAR( @@ -349,7 +348,7 @@ static void RegisterGlobalVarGetterSetter() { FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb, FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory, FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname, - FLAGS_paddle_num_threads); + FLAGS_paddle_num_threads, FLAGS_use_mkldnn); #ifdef PADDLE_WITH_CUDA REGISTER_PUBLIC_GLOBAL_VAR( diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt index 1f88eb2109aa23b6b60104451908b0a70c41c898..7eab677fac1683fdc95c9e338b1099d78b5cabc3 100644 --- a/paddle/fluid/train/CMakeLists.txt +++ b/paddle/fluid/train/CMakeLists.txt @@ -29,8 +29,10 @@ function(train_test TARGET_NAME) PROPERTIES DEPENDS test_${TARGET_NAME}) set_tests_properties(test_train_${TARGET_NAME}${arg} PROPERTIES LABELS "RUN_TYPE=DIST") - set_tests_properties(test_train_${TARGET_NAME}${arg} - PROPERTIES TIMEOUT 150) + if(NOT WIN32 AND NOT APPLE) + set_tests_properties(test_train_${TARGET_NAME}${arg} + PROPERTIES TIMEOUT 150) + endif() endforeach() endfunction(train_test) diff --git a/paddle/scripts/conda_build.py b/paddle/scripts/conda_build.py index 648819c8cc3f6652ca48a95ba4fda0f3bbed8e80..0a0736f35a58dbcd2f47bd4c4c9849f7d146fbf4 100644 --- a/paddle/scripts/conda_build.py +++ b/paddle/scripts/conda_build.py @@ -42,7 +42,6 @@ requirements: - nltk - scipy - requests - - pyyaml - pillow - graphviz - protobuf @@ -62,7 +61,6 @@ requirements: - nltk - scipy - requests - - pyyaml - pillow - graphviz - protobuf @@ -89,13 +87,11 @@ about: pip install /package/objgraph-3.4.1.tar.gz pip install /package/prettytable-0.7.tar.gz pip install /package/rarfile-3.0.tar.gz --no-deps -pip install /package/funcsigs-1.0.2.tar.gz """ self.blt_const = r""" pip install C:\package\objgraph-3.4.1.tar.gz pip install C:\package\prettytable-0.7.tar.gz -pip install C:\package\funcsigs-1.0.2.tar.gz pip install 
C:\package\rarfile-3.0.tar.gz --no-deps git clone https://github.com/PaddlePaddle/recordio.git cd recordio\python diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 862ab2e8db1fdc353db826204d759d99951d5142..8050f07af03803c971c33c888b09df38d3d0a1f0 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -19,10 +19,14 @@ rem ================================================= rem Paddle CI Task On Windows Platform rem ================================================= +rem -------clean up environment----------- set work_dir=%cd% if exist build rmdir build /s/q mkdir build cd /d build +tree . +dir paddle\fluid\pybind\Release +taskkill /f /im op_function_generator.exe 2>NUL rem ------initialize the virtual environment------ if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 @@ -59,13 +63,12 @@ if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=OFF if not defined WITH_TPCACHE set WITH_TPCACHE=ON rem ------set cache third_party------ -set cache_dir=%work_dir%\..\cache +set cache_dir=%work_dir:Paddle=cache% dir %cache_dir% set INFERENCE_DEMO_INSTALL_DIR=%cache_dir:\=/%/inference_demo if not exist %cache_dir%\tools ( git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools - if %ERRORLEVEL% NEQ 0 exit /b %ERRORLEVEL% ) if "%WITH_TPCACHE%"=="OFF" ( @@ -125,6 +128,8 @@ echo ======================================== echo Step 1. Cmake ... echo ======================================== +for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# +set start=%start:~4,10% echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^ -DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ @@ -150,7 +155,7 @@ call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd6 set build_times=1 :build_tp -echo Build third_party for %build_times% time: +echo Build third_party the %build_times% time: msbuild /m /p:Configuration=Release /verbosity:quiet third_party.vcxproj if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 @@ -165,7 +170,7 @@ echo Build third_party successfully! set build_times=1 :build_paddle -echo Build Paddle for %build_times% time: +echo Build Paddle the %build_times% time: msbuild /m /p:Configuration=Release /verbosity:minimal paddle.sln if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 @@ -176,7 +181,9 @@ if %ERRORLEVEL% NEQ 0 ( goto :build_paddle ) ) + echo Build Paddle successfully! + goto:eof :build_error @@ -189,6 +196,17 @@ rem ---------------------------------------------------------------------------- echo ======================================== echo Step 3. Test pip install whl package ... 
echo ======================================== + +for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%# +set end=%end:~4,10% +call :timestamp "%start%" "%end%" "Build" +tree /F %cd%\fluid_inference_install_dir\paddle +%cache_dir%\tools\busybox64.exe du -h -d 0 %cd%\fluid_inference_install_dir\paddle\lib > lib_size.txt +set /p libsize=< lib_size.txt +for /F %%i in ("%libsize%") do echo "Windows FLuid_Inference Size: %%i" +%cache_dir%\tools\busybox64.exe du -h -d 0 %cd%\python\dist > whl_size.txt +set /p whlsize=< whl_size.txt +for /F %%i in ("%whlsize%") do echo "Windows PR whl Size: %%i" dir /s /b python\dist\*.whl > whl_file.txt set /p PADDLE_WHL_FILE_WIN=< whl_file.txt @@ -198,7 +216,7 @@ pip install -U %PADDLE_WHL_FILE_WIN% --user if %ERRORLEVEL% NEQ 0 ( call paddle_winci\Scripts\deactivate.bat 2>NUL echo pip install whl package failed! - exit /b 3 + exit /b 1 ) python %work_dir%\paddle\scripts\installation_validate.py @@ -207,7 +225,7 @@ goto:eof :test_whl_pacakage_error call paddle_winci\Scripts\deactivate.bat 2>NUL echo Test import paddle failed, will exit! -exit /b 3 +exit /b 1 rem --------------------------------------------------------------------------------------------- :unit_test @@ -215,6 +233,8 @@ echo ======================================== echo Step 4. Running unit tests ... echo ======================================== +for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# +set start=%start:~4,10% dir %THIRD_PARTY_PATH:/=\%\install\openblas\lib dir %THIRD_PARTY_PATH:/=\%\install\openblas\bin dir %THIRD_PARTY_PATH:/=\%\install\zlib\bin @@ -237,15 +257,18 @@ echo ======================================== echo Step 5. Testing fluid library for inference ... echo ======================================== -cd %work_dir%\paddle\fluid\inference\api\demo_ci +for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%# +set end=%end:~4,10% +call :timestamp "%start%" "%end%" "TestCases Total" +cd %work_dir%\paddle\fluid\inference\api\demo_ci %cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo goto:eof :test_inference_error call paddle_winci\Scripts\deactivate.bat 2>NUL echo Testing fluid library for inference failed! -exit /b 5 +exit /b 1 rem --------------------------------------------------------------------------------------------- :check_change_of_unittest @@ -253,7 +276,6 @@ echo ======================================== echo Step 6. Check whether deleting a unit test ... echo ======================================== -set PATH=%PYTHON_ROOT%;%PATH% cd /d %work_dir%\build echo set -ex> check_change_of_unittest.sh echo GITHUB_API_TOKEN=%GITHUB_API_TOKEN% >> check_change_of_unittest.sh @@ -325,6 +347,43 @@ call paddle_winci\Scripts\deactivate.bat 2>NUL exit /b 1 +:timestamp +echo on +setlocal enabledelayedexpansion +set start=%~1 +set dd=%start:~2,2% +set /a dd=100%dd%%%100 +set hh=%start:~4,2% +set /a hh=100%hh%%%100 +set nn=%start:~6,2% +set /a nn=100%nn%%%100 +set ss=%start:~8,2% +set /a ss=100%ss%%%100 +set /a start_sec=dd*86400+hh*3600+nn*60+ss +echo %start_sec% + +set end=%~2 +set dd=%end:~2,2% +set /a dd=100%dd%%%100 +if %start:~0,2% NEQ %end:~0,2% ( + set month_day=0 + for %%i in (01 03 05 07 08 10 12) DO if %%i EQU %start:~0,2% set month_day=31 + for %%i in (04 06 09 11) DO if %%i EQU %start:~0,2% set month_day=30 + for %%i in (02) DO if %%i EQU %start:~0,2% set month_day=28 + set /a dd=%dd%+!month_day! 
+) +set hh=%end:~4,2% +set /a hh=100%hh%%%100 +set nn=%end:~6,2% +set /a nn=100%nn%%%100 +set ss=%end:~8,2% +set /a ss=100%ss%%%100 +set /a end_secs=dd*86400+hh*3600+nn*60+ss +set /a cost_secs=end_secs-start_sec +echo "Windows %~3 Time: %cost_secs%s" +goto:eof + + rem --------------------------------------------------------------------------------------------- :success echo ======================================== @@ -340,7 +399,7 @@ taskkill /f /im git-remote-https.exe 2>NUL taskkill /f /im vctip.exe 2>NUL taskkill /f /im cvtres.exe 2>NUL taskkill /f /im rc.exe 2>NUL -taskkill /f /im %cd%\paddle\fluid\pybind\Release\op_function_generator.exe 2>NUL +taskkill /f /im op_function_generator.exe 2>NUL taskkill /f /im python.exe 2>NUL call paddle_winci\Scripts\deactivate.bat 2>NUL taskkill /f /im python.exe 2>NUL diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 926747ef6186e3b9439baf787572fe9d1988fb46..6414b78172bbb3848ea8444fbef5a81fb990a4a1 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -959,7 +959,7 @@ set +x retry_unittests_record="$retry_unittests_record$failed_test_lists" failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'` - read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(\w+\)" | sed 's/(.\+)//' | sed 's/- //' ) + read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) echo "=========================================" echo "This is the ${exec_time_array[$exec_times]} time to re-run" echo "=========================================" @@ -1395,6 +1395,26 @@ function example() { fi } +function summary_check_problems() { + set +x + local check_style_code=$1 + local example_code=$2 + if [ $check_style_code -ne 0 -o $example_code -ne 0 ];then + echo "========================================" + echo "summary problems:" + echo "========================================" + if [ $check_style_code -ne 0 ];then + echo "- Check code style failed! Please check the log and fix problems." + fi + if [ $example_code -ne 0 ];then + echo "- Check example code failed! Please check the log and fix problems." + fi + [ $check_style_code -ne 0 ] && exit $check_style_code + [ $example_code -ne 0 ] && exit $example_code + fi + set -x +} + function main() { local CMD=$1 local parallel_number=$2 @@ -1407,12 +1427,15 @@ function main() { cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} ;; build_and_check) - check_style + $(check_style >&2) + check_style_code=$? generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number} cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} check_sequence_op_unittest generate_api_spec ${PYTHON_ABI:-""} "PR" - example + $(example >&2) + example_code=$? 
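The :timestamp routine added to paddle_build.bat above converts two wmic localdatetime substrings of the form MMDDhhmmss into seconds (dd*86400 + hh*3600 + nn*60 + ss), adds one month's worth of days to the end stamp when the month rolls over, and reports the difference. A self-contained sketch of the same arithmetic; it shares the script's assumption that at most one month boundary is crossed:

#include <cstdio>
#include <string>

// Elapsed seconds between two MMDDhhmmss stamps.
long ElapsedSeconds(const std::string& start, const std::string& end) {
  auto field = [](const std::string& s, int pos) { return std::stol(s.substr(pos, 2)); };
  auto to_secs = [&](const std::string& s, long extra_days) {
    return (field(s, 2) + extra_days) * 86400L + field(s, 4) * 3600L +
           field(s, 6) * 60L + field(s, 8);
  };
  long extra_days = 0;
  if (start.substr(0, 2) != end.substr(0, 2)) {
    // Day count of the start month (the script simplifies February to 28 days).
    const long days_in_month[] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
    extra_days = days_in_month[field(start, 0) - 1];
  }
  return to_secs(end, extra_days) - to_secs(start, 0);
}

int main() {
  std::printf("%ld\n", ElapsedSeconds("0731235950", "0801000020"));  // prints 30
  return 0;
}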
+ summary_check_problems $check_style_code $example_code assert_api_spec_approvals ;; build) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 779a6842ebb03e2afcdb7718f77eb9b0d2c09a83..8244b91d32dd85e905c9df9f6015b29b633d1260 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -95,7 +95,6 @@ if (WITH_TESTING) add_subdirectory(paddle/fluid/tests) add_subdirectory(paddle/fluid/contrib/tests) add_subdirectory(paddle/fluid/contrib/slim/tests) - add_subdirectory(paddle/incubate/hapi/tests) endif() install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} DESTINATION opt/paddle/share/wheels diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 46b84697e5a61e164cbc826d5018db7a6d87f69f..5f1ccf3f858287066e36abf9412ba1114c526e61 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -256,12 +256,16 @@ from .device import get_device # from .tensor.tensor import LoDTensor #DEFINE_ALIAS # from .tensor.tensor import LoDTensorArray #DEFINE_ALIAS -from . import incubate -from .incubate import hapi from .fluid.dygraph.base import enable_dygraph as disable_static #DEFINE_ALIAS from .fluid.dygraph.base import disable_dygraph as enable_static #DEFINE_ALIAS from .fluid.framework import in_dygraph_mode as in_dynamic_mode #DEFINE_ALIAS -from .fluid.dygraph.base import no_grad #DEFINE_ALIAS +from .fluid.dygraph.base import no_grad_ as no_grad #DEFINE_ALIAS from . import jit from . import static + +# high-level api +from .hapi import Model +from .hapi import callbacks +import paddle.text +import paddle.vision diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py index 9060b8c0ddb433b9a21f4fc67161b46f139e4e13..5eba18776c9643077d79e2b6b3c9a239bebec637 100644 --- a/python/paddle/dataset/common.py +++ b/python/paddle/dataset/common.py @@ -196,3 +196,14 @@ def cluster_files_reader(files_pattern, yield line return reader + + +def _check_exists_and_download(path, url, md5, module_name, download=True): + if path and os.path.exists(path): + return path + + if download: + return paddle.dataset.common.download(url, module_name, md5) + else: + raise ValueError('{} not exists and auto download disabled'.format( + path)) diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py index 99fab7ffceb9278505ab7dc1bfee9bdcb4e188ba..251e305104edc72fe79da34286e98bc8cc81c3c7 100644 --- a/python/paddle/dataset/wmt16.py +++ b/python/paddle/dataset/wmt16.py @@ -36,7 +36,7 @@ import tarfile import gzip from collections import defaultdict -import paddle.dataset.common +import paddle import paddle.compat as cpt __all__ = [ diff --git a/python/paddle/device.py b/python/paddle/device.py index e2ef8e7092ad3f6af91c8d5d3c0b1deaed025514..de24fd875130e84d6532d033761f68a5c77a68c2 100644 --- a/python/paddle/device.py +++ b/python/paddle/device.py @@ -13,9 +13,11 @@ # limitations under the License. # TODO: define the functions to manipulate devices +import re + from paddle.fluid import core from paddle.fluid import framework -import re +from paddle.fluid.dygraph.parallel import ParallelEnv __all__ = [ 'get_cudnn_version', @@ -81,8 +83,8 @@ def set_device(device): .. 
code-block:: python import paddle - paddle.enable_imperative() - paddle.fluid.dygraph.set_device("gpu:0") + paddle.disable_static() + paddle.set_device("cpu") x1 = paddle.ones(name='x1', shape=[1, 2], dtype='int32') x2 = paddle.zeros(name='x2', shape=[1, 2], dtype='int32') data = paddle.stack([x1,x2], axis=1) @@ -90,18 +92,28 @@ def set_device(device): lower_device = device.lower() if lower_device == 'cpu': place = core.CPUPlace() - framework._set_expected_place(place) + elif lower_device == 'gpu': + if not core.is_compiled_with_cuda(): + raise ValueError( + "The device should not be 'gpu', " \ + "since PaddlePaddle is not compiled with CUDA") + place = core.CUDAPlace(ParallelEnv().dev_id) else: - avaliable_device = ((lower_device == 'cpu') or - re.match(r'gpu:\d+', lower_device)) + avaliable_device = re.match(r'gpu:\d+', lower_device) if not avaliable_device: raise ValueError( - "The device must be a string which is like 'cpu' or 'gpu:0'") + "The device must be a string which is like 'cpu', 'gpu' or 'gpu:0'" + ) + if not core.is_compiled_with_cuda(): + raise ValueError( + "The device should not be {}, since PaddlePaddle is " \ + "not compiled with CUDA".format(avaliable_device)) device_info_list = device.split(':', 1) device_id = device_info_list[1] device_id = int(device_id) place = core.CUDAPlace(device_id) - framework._set_expected_place(place) + framework._set_expected_place(place) + return place def get_device(): @@ -116,8 +128,8 @@ def get_device(): .. code-block:: python import paddle - paddle.enable_imperative() - device = paddle.fluid.dygraph.get_device() + paddle.disable_static() + device = paddle.get_device() """ device = '' diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index c40ae7179395a2fc16ece0d68546221ce53c2180..19df0ca91e103a0865f648daa5c173c2691307de 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -73,20 +73,21 @@ def broadcast(tensor, src, group=0): Examples: .. code-block:: python - import paddle - import paddle.prepare_context as prepare_context - - paddle.disable_static() - paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) - prepare_context() - if paddle.ParallelEnv().local_rank == 0: - np_data = np.array([[4, 5, 6], [4, 5, 6]]) - else: - np_data = np.array([[1, 2, 3], [1, 2, 3]]) - data = paddle.to_tensor(np_data) - paddle.distributed.broadcast(data, 1) - out = data.numpy() - # [[1, 2, 3], [1, 2, 3]] + import numpy as np + import paddle + from paddle.distributed import init_parallel_env + + paddle.disable_static() + paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) + init_parallel_env() + if paddle.distributed.ParallelEnv().local_rank == 0: + np_data = np.array([[4, 5, 6], [4, 5, 6]]) + else: + np_data = np.array([[1, 2, 3], [1, 2, 3]]) + data = paddle.to_tensor(np_data) + paddle.distributed.broadcast(data, 1) + out = data.numpy() + # [[1, 2, 3], [1, 2, 3]] """ if in_dygraph_mode(): return core.ops.c_broadcast(tensor, tensor, 'root', src, @@ -129,21 +130,22 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=0): Examples: .. 
code-block:: python - import paddle - from paddle.distributed import ReduceOp - import paddle.prepare_context as prepare_context - - paddle.disable_static() - paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) - prepare_context() - if paddle.ParallelEnv().local_rank == 0: - np_data = np.array([[4, 5, 6], [4, 5, 6]]) - else: - np_data = np.array([[1, 2, 3], [1, 2, 3]]) - data = paddle.to_tensor(np_data) - paddle.distributed.all_reduce(data) - out = data.numpy() - # [[5, 7, 9], [5, 7, 9]] + import numpy as np + import paddle + from paddle.distributed import ReduceOp + from paddle.distributed import init_parallel_env + + paddle.disable_static() + paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) + init_parallel_env() + if paddle.distributed.ParallelEnv().local_rank == 0: + np_data = np.array([[4, 5, 6], [4, 5, 6]]) + else: + np_data = np.array([[1, 2, 3], [1, 2, 3]]) + data = paddle.to_tensor(np_data) + paddle.distributed.all_reduce(data) + out = data.numpy() + # [[5, 7, 9], [5, 7, 9]] """ if in_dygraph_mode(): if op == ReduceOp.SUM: @@ -204,20 +206,21 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=0): Examples: .. code-block:: python - import paddle - import paddle.prepare_context as prepare_context - - paddle.disable_static() - paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) - prepare_context() - if paddle.ParallelEnv().local_rank == 0: - np_data = np.array([[4, 5, 6], [4, 5, 6]]) - else: - np_data = np.array([[1, 2, 3], [1, 2, 3]]) - data = paddle.to_tensor(np_data) - paddle.distributed.reduce(data, 0) - out = data.numpy() - # [[5, 7, 9], [5, 7, 9]] + import numpy as np + import paddle + from paddle.distributed import init_parallel_env + + paddle.disable_static() + paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) + init_parallel_env() + if paddle.distributed.ParallelEnv().local_rank == 0: + np_data = np.array([[4, 5, 6], [4, 5, 6]]) + else: + np_data = np.array([[1, 2, 3], [1, 2, 3]]) + data = paddle.to_tensor(np_data) + paddle.distributed.reduce(data, 0) + out = data.numpy() + # [[5, 7, 9], [5, 7, 9]] """ if in_dygraph_mode(): if op == ReduceOp.SUM: @@ -286,25 +289,26 @@ def all_gather(tensor_list, tensor, group=0): Examples: .. 
code-block:: python - import paddle - import paddle.prepare_context as prepare_context - - paddle.disable_static() - paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) - prepare_context() - tensor_list = [] - if paddle.ParallelEnv().local_rank == 0: - np_data1 = np.array([[4, 5, 6], [4, 5, 6]]) - np_data2 = np.array([[4, 5, 6], [4, 5, 6]]) - data1 = paddle.to_tensor(np_data1) - data2 = paddle.to_tensor(np_data2) - paddle.distributed.all_gather(tensor_list, data1) - else: - np_data1 = np.array([[1, 2, 3], [1, 2, 3]]) - np_data2 = np.array([[1, 2, 3], [1, 2, 3]]) - data1 = paddle.to_tensor(np_data1) - data2 = paddle.to_tensor(np_data2) - out = paddle.distributed.all_gather(tensor_list, data2) + import numpy as np + import paddle + from paddle.distributed import init_parallel_env + + paddle.disable_static() + paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) + init_parallel_env() + tensor_list = [] + if paddle.distributed.ParallelEnv().local_rank == 0: + np_data1 = np.array([[4, 5, 6], [4, 5, 6]]) + np_data2 = np.array([[4, 5, 6], [4, 5, 6]]) + data1 = paddle.to_tensor(np_data1) + data2 = paddle.to_tensor(np_data2) + paddle.distributed.all_gather(tensor_list, data1) + else: + np_data1 = np.array([[1, 2, 3], [1, 2, 3]]) + np_data2 = np.array([[1, 2, 3], [1, 2, 3]]) + data1 = paddle.to_tensor(np_data1) + data2 = paddle.to_tensor(np_data2) + paddle.distributed.all_gather(tensor_list, data2) """ op_type = 'c_allgather' helper = LayerHelper(op_type, **locals()) @@ -359,25 +363,26 @@ def scatter(tensor, tensor_list=None, src=0, group=0): Examples: .. code-block:: python - import paddle - import paddle.prepare_context as prepare_context - - paddle.disable_static() - paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) - prepare_context() - if paddle.ParallelEnv().local_rank == 0: - np_data1 = np.array([7, 8, 9]) - np_data2 = np.array([10, 11, 12]) - else: - np_data1 = np.array([1, 2, 3]) - np_data2 = np.array([4, 5, 6]) - data1 = paddle.to_tensor(np_data1) - data2 = paddle.to_tensor(np_data2) - if paddle.ParallelEnv().local_rank == 0: - paddle.distributed.scatter(data1, src=1) - else: - paddle.distributed.scatter(data1, tensor_list=[data1, data2], src=1) - out = data1.numpy() + import numpy as np + import paddle + from paddle.distributed import init_parallel_env + + paddle.disable_static() + paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) + init_parallel_env() + if paddle.distributed.ParallelEnv().local_rank == 0: + np_data1 = np.array([7, 8, 9]) + np_data2 = np.array([10, 11, 12]) + else: + np_data1 = np.array([1, 2, 3]) + np_data2 = np.array([4, 5, 6]) + data1 = paddle.to_tensor(np_data1) + data2 = paddle.to_tensor(np_data2) + if paddle.distributed.ParallelEnv().local_rank == 0: + paddle.distributed.scatter(data1, src=1) + else: + paddle.distributed.scatter(data1, tensor_list=[data1, data2], src=1) + out = data1.numpy() """ op_type = 'c_scatter' global _default_group @@ -425,13 +430,13 @@ def barrier(group=0): Examples: .. 
code-block:: python - import paddle - import paddle.prepare_context as prepare_context + import paddle + from paddle.distributed import init_parallel_env - paddle.disable_static() - paddle.set_device('gpu:%d'%paddle.ParallelEnv().dev_id) - prepare_context() - paddle.distributed.barrier() + paddle.disable_static() + paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) + init_parallel_env() + paddle.distributed.barrier() """ op_type = 'barrier' temp = paddle.fill_constant([1], dtype="int32", value="1") diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index f4a16d0de177f8a63271ef43f6716aa31443f06f..282ac29d6f9dafb4eb3b83471157464620326348 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -22,8 +22,6 @@ from .runtime_factory import RuntimeFactory from .util_factory import UtilFactory from paddle.fluid.wrapped_decorator import wrap_decorator -#__all__ = ['Fleet'] - def _inited_runtime_handler_(func): def __impl__(*args, **kwargs): @@ -43,65 +41,123 @@ inited_runtime_handler = wrap_decorator(_inited_runtime_handler_) class Fleet(object): """ Unified API for distributed training of PaddlePaddle - Please reference the https://github.com/PaddlePaddle/Fleet for details + Please reference the https://github.com/PaddlePaddle/FleetX for details Returns: Fleet: A Fleet instance - Examples: + Example for collective training: .. code-block:: python import paddle.distributed.fleet as fleet - role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) + + fleet.init(is_collective=True) + strategy = fleet.DistributedStrategy() optimizer = paddle.optimizer.SGD(learning_rate=0.001) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + + # do distributed training + + + Example for parameter server training: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + + fleet.init() + + strategy = fleet.DistributedStrategy() + optimizer = paddle.optimizer.SGD(learning_rate=0.001) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + if fleet.is_first_worker(): print("this is first worker") + print("current node index: {}".format(fleet.worker_index())) print("total number of worker num: {}".format(fleet.worker_num())) + if fleet.is_worker(): print("this is worker") print("worker endpoints: {}".format(fleet.worker_endpoints(to_string=True))) + print("server num: {}".format(fleet.server_num())) print("server endpoints: {}".format(fleet.server_endpoints(to_string=True))) + if fleet.is_server(): print("this is server") fleet.stop_worker() + + """ def __init__(self): - self._runtime_handle = None - self._util = None self._role_maker = None + self.strategy_compiler = None self._is_collective = False + self._runtime_handle = None + self._util = None def init(self, role_maker=None, is_collective=False): """ Initialize role_maker in Fleet. - This function is responsible for the distributed architecture - what you want to run your code behind,such as Transpiler, - Collective in PaddleCloudRoleMaker or UserDefinedRoleMaker + This function is responsible for the distributed architecture + what you want to run your code behind. + + Args: + role_maker (RoleMakerBase, optional): A ``RoleMakerBase`` containing the configuration + of environment variables related to distributed training.If you did not initialize + the rolemaker by yourself, it will be automatically initialized to PaddleRoleMaker. 
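The rewritten ``init`` dispatches on whether a ``role_maker`` was passed explicitly. A standalone sketch of that validation logic follows; ``RoleMakerBase`` and ``PaddleCloudRoleMaker`` are local stand-ins rather than the real classes, so this only illustrates the dispatch rules introduced by this patch.

.. code-block:: python

    class RoleMakerBase(object):
        pass

    class PaddleCloudRoleMaker(RoleMakerBase):
        def __init__(self, is_collective=False):
            self.is_collective = is_collective

    def init(role_maker=None, is_collective=False):
        # no role maker given: build a PaddleCloudRoleMaker from is_collective
        if role_maker is None:
            if not isinstance(is_collective, bool):
                raise ValueError(
                    "`is_collective` should be instance of `bool`, but got {}"
                    .format(type(is_collective)))
            return PaddleCloudRoleMaker(is_collective=is_collective)
        # explicit role maker: it must be a RoleMakerBase subclass instance
        if not isinstance(role_maker, RoleMakerBase):
            raise ValueError(
                "`role_maker` should be subclass of `RoleMakerBase`, but got {}"
                .format(type(role_maker)))
        return role_maker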
+ The default value is None. + is_collective (Boolean, optional): A ``Boolean`` variable determines whether the program + runs on the CPU or GPU. False means set distributed training using CPU, and True means + GPU.The default value is False.The default value is False. + Returns: + None + + Examples1: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + + Examples2: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init(is_collective=True) + + Examples3: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + role = fleet.PaddleCloudRoleMaker + fleet.init(role) """ - if isinstance(role_maker, RoleMakerBase): - self._role_maker = role_maker - elif role_maker == None: + + if role_maker is None: if isinstance(is_collective, bool): self._is_collective = is_collective self._role_maker = PaddleCloudRoleMaker( is_collective=self._is_collective) else: raise ValueError( - "Something wrong occurred, please check whether is_collective is bool value" - ) + "`is_collective` should be instance of `bool`, but got {}". + format(type(is_collective))) else: - raise ValueError( - "Something wrong occurred, please check whether rolemaker is instance of RoleMakerBase" - ) + if isinstance(role_maker, RoleMakerBase): + self._role_maker = role_maker + else: + raise ValueError( + "`role_maker` should be subclass of `RoleMakerBase`, but got {}". + format(type(role_maker))) self.strategy_compiler = StrategyCompiler() return None @@ -113,6 +169,14 @@ class Fleet(object): bool: True if this is the first node of worker, False if not. + Examples: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + fleet.is_first_worker() + """ return self._role_maker.is_first_worker() @@ -122,6 +186,14 @@ class Fleet(object): Returns: int: node id + + Examples: + + .. code-block:: python + import paddle.distributed.fleet as fleet + fleet.init() + fleet.worker_index() + """ return self._role_maker.worker_index() @@ -131,6 +203,14 @@ class Fleet(object): Returns: int: worker numbers + + Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + fleet.worker_num() + """ return self._role_maker.worker_num() @@ -141,15 +221,31 @@ class Fleet(object): Returns: bool: True if this is a node of worker, False if not. + + Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + fleet.is_worker() + """ return self._role_maker.is_worker() def worker_endpoints(self, to_string=False): """ - Get current server endpoints, such as ["127.0.0.1:1001", "127.0.0.1:1002"]. + Get current worker endpoints, such as ["127.0.0.1:1001", "127.0.0.1:1002"]. Returns: list/string: server endpoints + + Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + fleet.worker_endpoints() + """ ''' if to_string: @@ -165,6 +261,12 @@ class Fleet(object): Returns: int: server number + + Examples: + .. code-block:: python + import paddle.distributed.fleet as fleet + fleet.init() + fleet.server_num() """ return len(self._role_maker.get_pserver_endpoints()) @@ -174,6 +276,14 @@ class Fleet(object): Returns: int: node id + + Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + fleet.server_index() + """ return self._role_maker.server_index() @@ -183,14 +293,20 @@ class Fleet(object): Returns: list/string: server endpoints + + Examples: + .. 
code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + fleet.server_endpoints() + """ - ''' + if to_string: return ",".join(self._role_maker.get_pserver_endpoints()) else: return self._role_maker.get_pserver_endpoints() - ''' - return ["127.0.0.1:1001", "127.0.0.1:1002"] def is_server(self): """ @@ -199,6 +315,14 @@ class Fleet(object): Returns: bool: True if this is a node of server, False if not. + + Examples: + + .. code-block:: python + import paddle.distributed.fleet as fleet + fleet.init() + fleet.is_server() + """ return self._role_maker.is_server( ) or self._role_maker._is_heter_worker() @@ -208,6 +332,19 @@ class Fleet(object): """ Utility functions that can be used under certain runtime return util + + Returns: + UtilBase: instance of UtilBase, can use distributed ops/tools easily. + + Examples: + + .. code-block:: python + import paddle.distributed.fleet as fleet + fleet.init() + util = fleet.util + files = ["1.log", "2.log", "3.log", "4.log"] + files = util.get_file_shard() + """ return self._util @@ -215,41 +352,114 @@ class Fleet(object): def util(self, util): """ Set Utility functions for userd-defined runtime - set util + + Returns: + None """ self._util = util def barrier_worker(self): """ - barrier between workers + barrier all workers + + Returns: + None """ self._role_maker.barrier_worker() @inited_runtime_handler def init_worker(self): """ - init worker + initialize `Communicator` for parameter server training. + + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + + # build net + # fleet.distributed_optimizer(...) + + fleet.init_worker() + """ self._runtime_handle._init_worker() @inited_runtime_handler def init_server(self, *args, **kwargs): """ - init server + init_server executor to initialize startup program, + if the `args` is not empty, it will run load_persistables for increment training. + + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + + # build net + # fleet.distributed_optimizer(...) + + fleet.init_server() + """ self._runtime_handle._init_server(*args, **kwargs) @inited_runtime_handler def run_server(self): """ - run server + run server will run pserver main program with executor. + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + + # build net + # fleet.distributed_optimizer(...) + + if fleet.is_server(): + fleet.init_server() + """ self._runtime_handle._run_server() @inited_runtime_handler def stop_worker(self): """ - stop worker + stop `Communicator` and give training complete notice to parameter server. + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + + # build net + # fleet.distributed_optimizer(...) + + fleet.init_server() + """ self._runtime_handle._stop_worker() @@ -260,27 +470,98 @@ class Fleet(object): target_vars, main_program=None, export_for_deployment=True): + """ + save inference model for inference. + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + + # build net + # fleet.distributed_optimizer(...) 
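Taken together, the worker/server entry points documented above follow a fixed lifecycle in a parameter-server job. A condensed sketch, assuming the process was started by a Paddle launcher so the role can be read from the environment:

.. code-block:: python

    import paddle.distributed.fleet as fleet

    fleet.init()  # role is read from the launch environment

    # build the network and wrap the optimizer with
    # fleet.distributed_optimizer(...) before this point

    if fleet.is_server():
        fleet.init_server()   # initialize (or load) parameters on the pserver
        fleet.run_server()    # block and serve parameter requests
    elif fleet.is_worker():
        fleet.init_worker()   # start the Communicator
        # run the training loop here
        fleet.stop_worker()   # tell the servers that training is complete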
+ + fleet.init_server() + + """ + self._runtime_handle._save_inference_model( executor, dirname, feeded_var_names, target_vars, main_program, export_for_deployment) def save_persistables(self, executor, dirname, main_program=None): + """ + + saves all persistable variables from :code:`main_program` to + the folder :code:`dirname`. You can refer to + + The :code:`dirname` is used to specify the folder where persistable variables + are going to be saved. If you would like to save variables in separate + files, set :code:`filename` None. + + Args: + executor(Executor): The executor to run for saving persistable variables. + You can refer to :ref:`api_guide_executor_en` for + more details. + + dirname(str, optional): The saving directory path. + When you need to save the parameter to the memory, set it to None. + main_program(Program, optional): The program whose persistbale variables will + be saved. Default: None. + + + Returns: + None + + Examples: + + .. code-block:: text + + import paddle.distributed.fleet as fleet + import paddle.fluid as fluid + + fleet.init() + + # build net + # fleet.distributed_optimizer(...) + + exe = fluid.Executor(fluid.CPUPlace()) + fleet.save_persistables(exe, "dirname", fluid.default_main_program()) + + """ + self._runtime_handle._save_persistables(executor, dirname, main_program) def distributed_optimizer(self, optimizer, strategy=None): """ - distirbuted_optimizer + Optimizer for distributed training. + + For the distributed training, this method would rebuild a new instance of DistributedOptimizer. + Which has basic Optimizer function and special features for distributed training. + + Args: + optimizer(Optimizer): The executor to run for init server. + strategy(DistributedStrategy): Extra properties for distributed optimizer. + Returns: - Fleet instance with minimize interface like optimizers + Fleet: instance of fleet. Examples: + .. code-block:: python - import paddle.distributed.fleet as fleet - role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - strategy = fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.001) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + + import paddle.distributed.fleet as fleet + role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + strategy = fleet.DistributedStrategy() + optimizer = paddle.optimizer.SGD(learning_rate=0.001) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + """ self.user_defined_optimizer = optimizer if strategy == None: @@ -317,23 +598,25 @@ class Fleet(object): ``fetch_list`` before run, see details in ``Executor``. Examples: - import paddle - import paddle.distributed.fleet as fleet + .. 
code-block:: python - fc_1 = paddle.layers.fc(input=input_x, size=hid_dim, act='tanh') - fc_2 = paddlen.layers.fc(input=fc_1, size=hid_dim, act='tanh') - prediction = paddle.layers.fc(input=[fc_2], size=label_dim, act='softmax') - cost = paddle.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.layers.mean(x=cost) + import paddle + import paddle.distributed.fleet as fleet - role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - strategy = fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.001) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + fc_1 = paddle.fluid.layers.fc(input=input_x, size=hid_dim, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=hid_dim, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=label_dim, act='softmax') + cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + strategy = fleet.DistributedStrategy() + optimizer = paddle.optimizer.SGD(learning_rate=0.001) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) - # for more examples, please reference https://github.com/PaddlePaddle/Fleet + # for more examples, please reference https://github.com/PaddlePaddle/FleetX """ context = {} diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py index 1741f10ccb1c28bfe6abaa63e754568fa08e21ce..870c3fe8be4c87b8c7bde9b690c3ee1eded39393 100644 --- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py +++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py @@ -154,15 +154,16 @@ class ParameterServerRuntime(RuntimeBase): kwargs["sparse_attrs"] = get_sparse_attrs() return kwargs - from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops + from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops, _has_global_step from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import \ SyncStrategy, GeoStrategy trainer_config = self.async_strategy.get_trainer_runtime_config() - lrs = _get_lr_ops(self.origin_main_program) - if len(lrs) > 0: + lrs = _has_global_step(_get_lr_ops(self.origin_main_program)) + + if lrs: kwargs = {"need_global_step": "1"} else: kwargs = {"need_global_step": "0"} diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 0c806747217add5022b0c6ea66e184b44ef56836..d35bc096343bc240786232882316606151d2fb46 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -29,13 +29,13 @@ __all__ = ["init_parallel_env"] ParallelStrategy = core.ParallelStrategy -def init_parallel_env(backend='nccl'): +def init_parallel_env(): """ - Initialize parallel training environments in dynamic mode. + Initialize parallel training environment in dynamic graph mode. - Args: - backend(str, optional): The backend to communication between multiple devices. - Now only support ``nccl`` . Default value is ``nccl`` . + .. note:: + Now only supports initializing the GPU parallel training + environment and using NCCL for communication. Returns: None @@ -89,14 +89,12 @@ def init_parallel_env(backend='nccl'): dist.spawn(train) """ - # 1. 
input check - if not isinstance(backend, six.string_types): - raise TypeError("input `backend` type error, expected type is str, " - "but received type is %s." % type(backend)) - if cpt.to_text(backend) != 'nccl': - raise ValueError( - "backend `%s` is not supported, now only supports `nccl` backend." % - backend) + # 1. gpu check + if not core.is_compiled_with_cuda(): + raise NotImplementedError( + "Cannot initialize parallel environment in CPU-only version, now only " + "supports initializing the GPU parallel environment. Please recompile " + "or reinstall paddle with GPU support.") # 2. check env def _check_var_exists(var_name): @@ -112,30 +110,28 @@ def init_parallel_env(backend='nccl'): _check_var_exists("PADDLE_TRAINERS_NUM") _check_var_exists("PADDLE_TRAINER_ENDPOINTS") - # 3. init ParallelStrategy + # 3. init NCCL ParallelStrategy strategy = ParallelStrategy() - if cpt.to_text(backend) == 'nccl': - if parallel_helper._is_parallel_ctx_initialized(): - warnings.warn("The parallel environment has been initialized.") - strategy.nranks = ParallelEnv().world_size - strategy.local_rank = ParallelEnv().rank - strategy.trainer_endpoints = ParallelEnv().trainer_endpoints - strategy.current_endpoint = ParallelEnv().current_endpoint - if strategy.nranks < 2: - return - # NOTE(chenweihang): [ why config global place here? ] - # the dygraph mode will be set to default mode, - # users will not call `dygraph.guard` or `enable_dygraph` - # directly, if they want to switch default place, - # they need to call a function to change default place, - # here just set correctly place to users - place = core.CUDAPlace(ParallelEnv().device_id) - _set_expected_place(place) - - # init nccl context - parallel_helper._set_parallel_ctx( - core.NCCLParallelContext(strategy, place)) - parallel_helper._init_parallel_ctx() + if parallel_helper._is_parallel_ctx_initialized(): + warnings.warn("The parallel environment has been initialized.") + strategy.nranks = ParallelEnv().world_size + strategy.local_rank = ParallelEnv().rank + strategy.trainer_endpoints = ParallelEnv().trainer_endpoints + strategy.current_endpoint = ParallelEnv().current_endpoint + if strategy.nranks < 2: + return + # NOTE(chenweihang): [ why config global place here? ] + # the dygraph mode will be set to default mode, + # users will not call `dygraph.guard` or `enable_dygraph` + # directly, if they want to switch default place, + # they need to call a function to change default place, + # here just set correctly place to users + place = core.CUDAPlace(ParallelEnv().device_id) + _set_expected_place(place) + + # init nccl context + parallel_helper._set_parallel_ctx(core.NCCLParallelContext(strategy, place)) + parallel_helper._init_parallel_ctx() def get_rank(): @@ -163,7 +159,7 @@ def get_rank(): def get_world_size(): """ - The number of trainers (number of processes participating in current job). + Returns the number of trainers (number of processes participating in current job). Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` . The default value is 1. diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index 1ca2ebaa8d4bd3e0f11e41cdcc35ab585a70b802..6f1dcd15df3bc4218a5e465cae813ff17ea66efd 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -236,8 +236,6 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): func (function): The target function is called by spawned process. 
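A minimal launch sketch for ``spawn`` based on the signature shown above (``spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options)``); the training body is illustrative, only the launch pattern matters here:

.. code-block:: python

    import paddle
    import paddle.distributed as dist

    def train(print_result=False):
        # each spawned process runs this function with its own rank
        paddle.disable_static()
        dist.init_parallel_env()
        if print_result:
            print("rank {} of {}".format(dist.get_rank(), dist.get_world_size()))

    if __name__ == '__main__':
        # nprocs=-1 (the default) uses every device visible through
        # CUDA_VISIBLE_DEVICES; nprocs=2 restricts the job to two processes.
        dist.spawn(train, args=(True, ), nprocs=2)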
This function need to be able to pickled, so it must be defined at the top level of a module. - This function should be called as ``func(i, *args)``, ``i`` is - the process index and ``args`` contains other arguments as tuple. args (tuple, optional): Arguments passed to ``func``. nprocs (int, optional): Number of processed to start. Default: -1. when nprocs is -1, the available device will be obtained from @@ -246,8 +244,8 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): variable CUDA_VISIBLE_DEVICES; If use CPU, the currently available CPU number is obtained from the environment variable CPU_NUM. For example, export CPU_NUM=4, if the environment variable is not set, - the executor will add the variable to the environment variable and - set its value to 1. + the spawn method will add default value to the environment variable + and set its value to 1. join (bool, optional): Perform a blocking join on all spawned processes. Default: True. daemon (bool, optional): The spawned processes' daemon flag. Default: False. @@ -266,8 +264,8 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): such as 6170. Default: None; (5) selected_gpus (string): The training process will run on the selected_gpus, such as "0,1,2,3". Default: None; - (6) print_config: Print current parallel training config. Default: False; - (7) use_paddlecloud: Whether to use paddlecloud platform to run your + (6) print_config (bool): Print current parallel training config. Default: False; + (7) use_paddlecloud (bool): Whether to use paddlecloud platform to run your multi-process job. Default: False. Returns: diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 5f6594a47213021c3a82dd4a0266f52240270e87..7b301ac19d1d3dc1f4aabb6cf3af2f0874faa677 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -129,7 +129,7 @@ class GradientClipBase(object): def __str__(self): raise NotImplementedError() - @imperative_base.no_grad() + @imperative_base.no_grad def _dygraph_clip(self, params_grads): raise NotImplementedError @@ -258,7 +258,7 @@ class GradientClipByValue(GradientClipBase): def __str__(self): return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max) - @imperative_base.no_grad() + @imperative_base.no_grad def _dygraph_clip(self, params_grads): params_and_grads = [] for p, g in params_grads: @@ -413,7 +413,7 @@ class GradientClipByNorm(GradientClipBase): def __str__(self): return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm - @imperative_base.no_grad() + @imperative_base.no_grad def _dygraph_clip(self, params_grads): params_and_grads = [] for p, g in params_grads: @@ -565,7 +565,7 @@ class GradientClipByGlobalNorm(GradientClipBase): def __str__(self): return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm) - @imperative_base.no_grad() + @imperative_base.no_grad def _dygraph_clip(self, params_grads): params_and_grads = [] sum_square_list = [] diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index e3755cbafea41e61352f67c3de040e700297b61a..5662284483bf529034e42178c8a431f6286e31b8 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -69,7 +69,7 @@ class ImperativeQuantAware(object): from paddle.fluid.contrib.slim.quantization \ import ImperativeQuantAware - from paddle.incubate.hapi.vision.models \ + from 
paddle.vision.models \ import resnet model = resnet.resnet50(pretrained=True) diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 007d701284dfc7ff2cafb128984414517579fce3..6ac005060e0b21d88f17619bbe88b7a56c23fdb8 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -270,7 +270,7 @@ foreach(src ${TEST_OPS}) endforeach() # setting timeout value for old unittests -if(NOT WIN32) +if(NOT WIN32 AND NOT APPLE) set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 250 LABELS "RUN_TYPE=NIGHTLY") - set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 200 LABELS "RUN_TYPE=NIGHTLY") + set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 200 LABELS "RUN_TYPE=NIGHTLY") endif() diff --git a/python/paddle/fluid/dataloader/batch_sampler.py b/python/paddle/fluid/dataloader/batch_sampler.py index 1d180329b72510de5e7e9362e4c002f4508ba1be..085dcf6592de5193d62e43e4e74e4527818071de 100644 --- a/python/paddle/fluid/dataloader/batch_sampler.py +++ b/python/paddle/fluid/dataloader/batch_sampler.py @@ -16,10 +16,12 @@ from __future__ import print_function from __future__ import division import numpy as np +import math + from .sampler import Sampler, SequenceSampler, RandomSampler from .dataset import Dataset, IterableDataset -__all__ = ["BatchSampler"] +__all__ = ["BatchSampler", "DistributedBatchSampler"] class BatchSampler(Sampler): @@ -158,3 +160,185 @@ class _InfiniteIterableSampler(object): def __iter__(self): while True: yield [None] * self.batch_size + + +class DistributedBatchSampler(BatchSampler): + """Sampler that restricts data loading to a subset of the dataset. + + In such case, each process can pass a DistributedBatchSampler instance + as a DataLoader sampler, and load a subset of the original dataset that + is exclusive to it. + + .. note:: + Dataset is assumed to be of constant size. + + Args: + dataset(paddle.io.Dataset): this could be a `paddle.io.Dataset` implement + or other python object which implemented + `__len__` for BatchSampler to get sample + number of data source. + batch_size(int): sample indice number in a mini-batch indices. + num_replicas(int, optional): porcess number in distributed training. + If :attr:`num_replicas` is None, :attr:`num_replicas` will be + retrieved from :code:`paddle.fluid.dygraph.parallel.ParallenEnv`. + Default None. + rank(int, optional): the rank of the current process among :attr:`num_replicas` + processes. If :attr:`rank` is None, :attr:`rank` is retrieved from + :code:`paddle.fluid.dygraph.parallel.ParallenEnv`. Default None. + shuffle(bool): whther to shuffle indices order before genrating + batch indices. Default False. + drop_last(bool): whether drop the last incomplete batch dataset size + is not divisible by the batch size. Default False + + Examples: + .. 
code-block:: python + + import numpy as np + + from paddle.io import Dataset, DistributedBatchSampler + + # init with dataset + class RandomDataset(Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples + + def __getitem__(self, idx): + image = np.random.random([784]).astype('float32') + label = np.random.randint(0, 9, (1, )).astype('int64') + return image, label + + def __len__(self): + return self.num_samples + + dataset = RandomDataset(100) + sampler = DistributedBatchSampler(dataset, batch_size=64) + + for data in sampler: + # do something + break + """ + + def __init__(self, + dataset, + batch_size, + num_replicas=None, + rank=None, + shuffle=False, + drop_last=False): + self.dataset = dataset + + assert isinstance(batch_size, int) and batch_size > 0, \ + "batch_size should be a positive integer" + self.batch_size = batch_size + assert isinstance(shuffle, bool), \ + "shuffle should be a boolean value" + self.shuffle = shuffle + assert isinstance(drop_last, bool), \ + "drop_last should be a boolean number" + + from paddle.fluid.dygraph.parallel import ParallelEnv + + if num_replicas is not None: + assert isinstance(num_replicas, int) and num_replicas > 0, \ + "num_replicas should be a positive integer" + self.nranks = num_replicas + else: + self.nranks = ParallelEnv().nranks + + if rank is not None: + assert isinstance(rank, int) and rank >= 0, \ + "rank should be a non-negative integer" + self.local_rank = rank + else: + self.local_rank = ParallelEnv().local_rank + + self.drop_last = drop_last + self.epoch = 0 + self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks)) + self.total_size = self.num_samples * self.nranks + + def __iter__(self): + num_samples = len(self.dataset) + indices = np.arange(num_samples).tolist() + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + if self.shuffle: + np.random.RandomState(self.epoch).shuffle(indices) + self.epoch += 1 + + # subsample + def _get_indices_by_batch_size(indices): + subsampled_indices = [] + last_batch_size = self.total_size % (self.batch_size * self.nranks) + assert last_batch_size % self.nranks == 0 + last_local_batch_size = last_batch_size // self.nranks + + for i in range(self.local_rank * self.batch_size, + len(indices) - last_batch_size, + self.batch_size * self.nranks): + subsampled_indices.extend(indices[i:i + self.batch_size]) + + indices = indices[len(indices) - last_batch_size:] + subsampled_indices.extend(indices[ + self.local_rank * last_local_batch_size:( + self.local_rank + 1) * last_local_batch_size]) + return subsampled_indices + + if self.nranks > 1: + indices = _get_indices_by_batch_size(indices) + + assert len(indices) == self.num_samples + _sample_iter = iter(indices) + + batch_indices = [] + for idx in _sample_iter: + batch_indices.append(idx) + if len(batch_indices) == self.batch_size: + yield batch_indices + batch_indices = [] + if not self.drop_last and len(batch_indices) > 0: + yield batch_indices + + def __len__(self): + num_samples = self.num_samples + num_samples += int(not self.drop_last) * (self.batch_size - 1) + return num_samples // self.batch_size + + def set_epoch(self, epoch): + """ + Sets the epoch number. When :attr:`shuffle=True`, this number is used + as seeds of random numbers. By default, users may not set this, all + replicas (workers) use a different random ordering for each epoch. + If set same number at each epoch, this sampler will yield the same + ordering at all epoches. 
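The index bookkeeping in ``__iter__`` above is easiest to see with small numbers. A standalone sketch (no Paddle required) of the padding and per-rank subsampling, using dataset_size=10, batch_size=4 and two ranks purely for illustration:

.. code-block:: python

    import math
    import numpy as np

    dataset_size, batch_size, nranks = 10, 4, 2
    num_samples = int(math.ceil(dataset_size * 1.0 / nranks))  # 5 per rank
    total_size = num_samples * nranks                          # 10

    indices = np.arange(dataset_size).tolist()
    indices += indices[:(total_size - len(indices))]           # pad by repeating the head
    assert len(indices) == total_size

    last_batch = total_size % (batch_size * nranks)            # 10 % 8 = 2
    last_local = last_batch // nranks                          # 1 tail index per rank

    for rank in range(nranks):
        shard = []
        # full strides: each rank takes its own batch_size slice per stride
        for i in range(rank * batch_size, total_size - last_batch,
                       batch_size * nranks):
            shard.extend(indices[i:i + batch_size])
        # the incomplete tail is split evenly across ranks
        tail = indices[total_size - last_batch:]
        shard.extend(tail[rank * last_local:(rank + 1) * last_local])
        print(rank, shard)  # rank 0 -> [0, 1, 2, 3, 8]; rank 1 -> [4, 5, 6, 7, 9]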
+ + Arguments: + epoch (int): Epoch number. + + Examples: + .. code-block:: python + + import numpy as np + + from paddle.io import Dataset, DistributedBatchSampler + + # init with dataset + class RandomDataset(Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples + + def __getitem__(self, idx): + image = np.random.random([784]).astype('float32') + label = np.random.randint(0, 9, (1, )).astype('int64') + return image, label + + def __len__(self): + return self.num_samples + + dataset = RandomDataset(100) + sampler = DistributedBatchSampler(dataset, batch_size=64) + + for epoch in range(10): + sampler.set_epoch(epoch) + """ + self.epoch = epoch diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 2174dbd31b8fb1ae97894699e03e25e809085cc8..c548bdfeba19510b26c0f80d356fa6a6b7bbaed7 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator -import inspect import decorator import contextlib +import functools +import inspect import sys import numpy as np from paddle.fluid import core @@ -26,8 +27,8 @@ import objgraph from ..data_feeder import convert_dtype __all__ = [ - 'no_grad', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph', 'enabled', - 'to_variable' + 'no_grad', 'no_grad_', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph', + 'enabled', 'to_variable' ] @@ -167,7 +168,80 @@ def disable_dygraph(): _functional_dygraph_context_manager = None -class no_grad: +@signature_safe_contextmanager +def _switch_tracer_mode_guard_(is_train=True): + tracer = framework._dygraph_tracer() + if tracer: + mode = tracer._train_mode + tracer._train_mode = is_train + try: + yield + finally: + tracer._train_mode = mode + else: + yield + + +def no_grad(func=None): + """ + :api_attr: imperative + + Create a context which disables dygraph gradient calculation. + In this mode, the result of every computation will have `stop_gradient=True`. + + Also functions as a decorator. (Make sure to instantiate without parenthesis.) + + Examples: + + .. 
code-block:: python + + import numpy as np + import paddle.fluid as fluid + + # use as generator + + data = np.array([[2, 3], [4, 5]]).astype('float32') + with fluid.dygraph.guard(): + l0 = fluid.Linear(2, 2) # l0.weight.gradient() is None + l1 = fluid.Linear(2, 2) + with fluid.dygraph.no_grad(): + # l1.weight.stop_gradient is False + tmp = l1.weight * 2 # tmp.stop_gradient is True + x = fluid.dygraph.to_variable(data) + y = l0(x) + tmp + o = l1(y) + o.backward() + print(tmp.gradient() is None) # True + print(l0.weight.gradient() is None) # False + + # use as decorator + + @fluid.dygraph.no_grad + def test_layer(): + with fluid.dygraph.guard(): + inp = np.ones([3, 1024], dtype='float32') + t = fluid.dygraph.base.to_variable(inp) + linear1 = fluid.Linear(1024, 4, bias_attr=False) + linear2 = fluid.Linear(4, 4) + ret = linear1(t) + dy_ret = linear2(ret) + + test_layer() + + """ + if func is None: + return _switch_tracer_mode_guard_(is_train=False) + else: + + @decorator.decorator + def __impl__(func, *args, **kwargs): + with _switch_tracer_mode_guard_(is_train=False): + return func(*args, **kwargs) + + return __impl__(func) + + +class no_grad_: """ :api_attr: imperative diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index 82018132cc8b8600958e5cd52df5844e3d37638e..f4d68a798efa26d43702aa1c555f6046f0e6a6a5 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -207,6 +207,7 @@ def load_dygraph(model_path, keep_name_table=False): # NOTE: `jit.save` doesn't save optimizer state else: # Load state dict by `save_dygraph` save format + para_dict = {} if os.path.exists(params_file_path): with open(params_file_path, 'rb') as f: para_dict = pickle.load(f) if six.PY2 else pickle.load( diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py index 82f39ffd080ec803beca4e60695204b707f48210..9334c15f7bcbc0ca3782be1d4f7fc6826a59bdbc 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py @@ -16,9 +16,7 @@ import astor import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper -from paddle.fluid.dygraph.dygraph_to_static.utils import is_dygraph_api, is_to_variable -from paddle.fluid.dygraph.dygraph_to_static.utils import to_assign_node, to_static_ast, update_args_of_func -from paddle.fluid.dygraph.dygraph_to_static.utils import dygraph_class_to_static_api +from paddle.fluid.dygraph.dygraph_to_static import utils class BasicApiTransformer(gast.NodeTransformer): @@ -56,7 +54,7 @@ class BasicApiTransformer(gast.NodeTransformer): if isinstance(child_node, gast.Call): # TODO(liym27): # Considers that a dygraph api which modifies the input or has a output. 
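The new ``no_grad`` relies on one small trick: called with no argument it returns a context manager, and applied directly to a function it wraps that function. A simplified, self-contained sketch of that pattern (the tracer is replaced by a plain flag here, so this is not the real implementation):

.. code-block:: python

    from contextlib import contextmanager

    import decorator

    _state = {"train_mode": True}

    @contextmanager
    def _switch_mode(is_train=True):
        old = _state["train_mode"]
        _state["train_mode"] = is_train
        try:
            yield
        finally:
            _state["train_mode"] = old

    def no_grad(func=None):
        if func is None:  # used as: with no_grad(): ...
            return _switch_mode(is_train=False)

        @decorator.decorator  # used as: @no_grad (no parentheses)
        def __impl__(func, *args, **kwargs):
            with _switch_mode(is_train=False):
                return func(*args, **kwargs)

        return __impl__(func)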
- if is_dygraph_api(child_node): + if utils.is_dygraph_api(child_node): return else: self._visit_Call(child_node) @@ -73,7 +71,7 @@ class BasicApiTransformer(gast.NodeTransformer): if self._is_dygraph_forward(func_name): class_node = self._get_class_node(func_name) - static_node = to_static_ast(node, class_node) + static_node = utils.to_static_ast(node, class_node) return static_node else: return node @@ -91,14 +89,51 @@ class BasicApiTransformer(gast.NodeTransformer): if is_to_variable(node_value): return False - if is_dygraph_api(node_value): + if utils.is_dygraph_api(node_value): dygraph_api = node_value.func.attr - if not dygraph_class_to_static_api.get(dygraph_api): + if not utils.dygraph_class_to_static_api.get(dygraph_api): return False - update_args_of_func(node_value, node_value, "__init__") + utils.update_args_of_func(node_value, node_value, "__init__") target_str = astor.to_source(gast.gast_to_ast(node.targets[0])) self.class_node_dict[target_str] = node_value return True # TODO: node.value is not dygraph class return False + + +def is_to_variable(node): + assert isinstance(node, gast.Call) + api_name = utils.ast_to_source_code(node.func).strip() + + if utils.is_dygraph_api(node): + return api_name.endswith("to_variable") + + if utils.is_paddle_api(node): + return api_name.endswith("to_tensor") + + return False + + +def to_assign_node(node): + # Transform dygraph api `fluid.dygraph.to_variable` alias `paddle.to_tensor` to static api `fluid.layers.assign`. + # NOTE: + # 1. Api `to_variable` supports data type {float16, float32, float64, int16, int32, int64, uint8, uint16}, + # but api `assign` only supports {float32, float64, int32, int64, bool}; + # 2. If the input of api `assign` is numpy.ndarray, its size cannot be greater than 1024 * 1024. 
+ + assert isinstance(node, gast.Call) + assign_api = gast.parse('fluid.layers.assign').body[0].value + node.func = assign_api + + if node.args: + node.args = [node.args[0]] + node.keywords = [] + else: + for idx, kw in enumerate(node.keywords): + if kw.arg == 'value' or kw.arg == 'data': + node.keywords[idx].arg = 'input' + node.keywords = [node.keywords[idx]] + node.args = [] + break + return node diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py index 5540c63a85bd7f8db760f0c3e25be9eefa2aace7..90e38bd98863ff62174bd569a483b11984480b5a 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py @@ -296,7 +296,7 @@ def convert_to_input_spec(inputs, input_spec): elif isinstance(input_spec, dict): input_with_spec = {} check_type_and_len(inputs, input_spec, True) - for name, input in inputs.items(): + for name, input in six.iteritems(inputs): if name in input_spec: input_with_spec[name] = convert_to_input_spec(input, input_spec[name]) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py index 28073f157ddb858da4fdf0e49026f5286d00411b..5b8e6d2a9bdf36d45582fcabba5a740f353f78ac 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py @@ -14,6 +14,7 @@ from __future__ import print_function +import six import copy from collections import defaultdict @@ -230,7 +231,7 @@ class NameVisitor(gast.NodeVisitor): return False def _update_name_ids(self, new_name_ids): - for name_id, ctxs in new_name_ids.items(): + for name_id, ctxs in six.iteritems(new_name_ids): self.name_ids[name_id] = ctxs + self.name_ids[name_id] @@ -250,7 +251,7 @@ def parse_cond_args(var_ids_dict, return_ids=None, ctx=gast.Load): """ name_ids = [ - var_id for var_id, var_ctx in var_ids_dict.items() + var_id for var_id, var_ctx in six.iteritems(var_ids_dict) if isinstance(var_ctx[0], ctx) ] if return_ids: @@ -341,7 +342,7 @@ def parse_cond_return(parent_vars_dict, if_vars_dict, else_vars_dict, def _vars_with_store(ids_dict): vars = [] - for k, ctxs in ids_dict.items(): + for k, ctxs in six.iteritems(ids_dict): if _is_return_var(ctxs): vars.append(k) return vars @@ -353,7 +354,7 @@ def parse_cond_return(parent_vars_dict, if_vars_dict, else_vars_dict, def _vars_loaded_before_store(ids_dict): new_dict = defaultdict(list) - for k, ctxs in ids_dict.items(): + for k, ctxs in six.iteritems(ids_dict): for ctx in ctxs: if isinstance(ctx, gast.Load): new_dict[k].append(ctx) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py index 75cb65085846d672d2488c98bf6ad625ac12e78b..c52872b15016169504359b54ad5a40360e244ce0 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py @@ -98,8 +98,15 @@ class TranslatorLogger(object): return level == self.transformed_code_level def has_verbosity(self, level): + """ + Checks whether the verbosity level set by the user is greater than or equal to the log level. + Args: + level(int): The level of log. + Returns: + True if the verbosity level set by the user is greater than or equal to the log level, otherwise False. 
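With the corrected comparison below, ``has_verbosity(level)`` is true exactly when the configured verbosity is at least ``level``. A quick standalone illustration of the intended semantics, with the verbosity held in a plain variable for the sake of the example:

.. code-block:: python

    verbosity_level = 3  # the verbosity configured by the user

    def has_verbosity(level):
        # a message at `level` is emitted only if the configured
        # verbosity is greater than or equal to that level
        return verbosity_level >= level

    print(has_verbosity(1))  # True
    print(has_verbosity(3))  # True
    print(has_verbosity(4))  # False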
+ """ level = self.check_level(level) - return level >= self.verbosity_level + return self.verbosity_level >= level def error(self, msg, *args, **kwargs): self.logger.error(msg, *args, **kwargs) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index 7d2a767dd8f86fbf7e0908720d4d8a81a4885685..59cb5fb144eb50f4616c94ed78348d56a4029834 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -15,6 +15,7 @@ from __future__ import print_function import numpy as np import logging +import six from paddle.fluid import log_helper from paddle.fluid import framework, backward, core @@ -334,7 +335,7 @@ class PartialProgramLayer(layers.Layer): param_and_buffer_names_set.add(var.name) for block in main_program.blocks: - for name, var in block.vars.items(): + for name, var in six.iteritems(block.vars): if isinstance(var, framework.Parameter): if name not in param_and_buffer_names_set: raise ValueError( diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 698d989343a23015529a3b37b285640466d1c30d..cb489af44d0adc7da377f73a3205c3c264769b4d 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -24,6 +24,7 @@ import warnings import gast from paddle.fluid import framework +from paddle.fluid import in_dygraph_mode from paddle.fluid.dygraph import layers from paddle.fluid.data_feeder import check_type from paddle.fluid.layers.utils import flatten @@ -32,6 +33,7 @@ from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.dygraph.dygraph_to_static import DygraphToStaticAst from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data +from paddle.fluid.dygraph.dygraph_to_static import logging_utils from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info from paddle.fluid.dygraph.dygraph_to_static.origin_info import create_and_update_origin_info_map from paddle.fluid.dygraph.dygraph_to_static.origin_info import update_op_callstack_with_origin_info @@ -283,13 +285,21 @@ class StaticLayer(object): Return: Outputs of decorated function. """ + # 1. call dygraph function directly if not enable `declarative` if not self._program_trans.enable_declarative: - warnings.warn( - "The decorator '@paddle.jit.to_static' doesn't work when setting ProgramTranslator.enable=False. " + logging_utils.warn( + "The decorator '@paddle.jit.to_static' does NOT work when setting ProgramTranslator.enable=False. " "We will just return dygraph output.") return self._call_dygraph_function(*args, **kwargs) + if not in_dygraph_mode() and self._program_trans.enable_declarative: + raise RuntimeError( + "Failed to run the callable object {} decorated by '@paddle.jit.to_static', " + "because it does NOT in dynamic mode. Please disable the static mode to enter dynamic mode with the " + "following API: paddle.disable_static().".format( + self.dygraph_function)) + # 2. trace ops from dygraph layers and cache the generated program. 
args, kwargs = self._function_spec.unified_args_and_kwargs(args, kwargs) try: @@ -393,19 +403,43 @@ class StaticLayer(object): def concrete_program(self): """ Returns recent ConcreteProgram instance of decorated function. + + Examples: + .. code-block:: python + + import paddle + from paddle.jit import to_static + from paddle.static import InputSpec + + paddle.disable_static() + + def foo(x, y): + z = x + y + return z + + # usage 1: + decorated_foo = to_static(foo, input_spec=[InputSpec([10], name='x'), InputSpec([10], name='y')]) + print(decorated_foo.concrete_program) + + # usage 2: + decorated_foo = to_static(foo) + out_foo = decorated_foo(paddle.rand([10]), paddle.rand([10])) + print(decorated_foo.concrete_program) """ # if specific the `input_spec`, the length of program_cache will always 1, # else, return the last one. cached_program_len = len(self._program_cache) # If specific `input_spec`, apply convertion from dygraph layers into static Program. if cached_program_len == 0: - if len(self._function_spec.flat_input_spec) > 0: - input_spec = self._function_spec.input_spec + input_spec = self._function_spec.input_spec + has_input_spec = (input_spec is not None and len(input_spec) > 0) + if has_input_spec: concrete_program, _ = self.get_concrete_program(*input_spec) return concrete_program else: - raise ValueError("No valid transformed program for {}".format( - self._function_spec)) + raise ValueError( + "No valid transformed program for {}.\n\t Please specific `input_spec` in `@paddle.jit.to_static` or feed input tensor to call the decorated function at once.\n". + format(self._function_spec)) # If more than one programs have been cached, return the recent converted program by default. elif cached_program_len > 1: logging.warning( @@ -617,7 +651,7 @@ class ProgramCache(object): return len(self._caches) def concrete_programs(self): - return [cp for key, (cp, _) in self._caches.iteritems()] + return [cp for key, (cp, _) in six.iteritems(self._caches)] def synchronized(func): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index ba02a983f8e641079d8a60b166a6f098e6f725a8..86593dc24aa8bda7906aab2001e8bd285f64288a 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -136,9 +136,12 @@ def is_api_in_module(node, module_prefix): # import_str = "".join(import_statements) import paddle import paddle.fluid as fluid + import paddle.fluid.dygraph as dygraph import paddle.fluid.layers as layers + from paddle.fluid.dygraph import to_variable - import paddle.fluid.dygraph as dygraph + from paddle import to_tensor + return eval("_is_api_in_module_helper({}, '{}')".format(func_str, module_prefix)) except NameError: @@ -146,15 +149,18 @@ def is_api_in_module(node, module_prefix): def is_dygraph_api(node): + # Note: A api in module dygraph_to_static is not a real dygraph api. if is_api_in_module(node, "paddle.fluid.dygraph.dygraph_to_static"): return False + # TODO(liym27): A better way to determine whether it is a dygraph api. 
+ # Consider the decorator @dygraph_only return is_api_in_module(node, "paddle.fluid.dygraph") def is_paddle_api(node): - return is_api_in_module(node, "paddle.fluid") + return is_api_in_module(node, "paddle") # Is numpy_api cannot reuse is_api_in_module because of numpy module problem @@ -233,14 +239,6 @@ def _add_keywords_to(node, dygraph_api_name): return -def is_to_variable(node): - assert isinstance(node, gast.Call) - if is_dygraph_api(node): - api_name = ast_to_source_code(node.func).strip() - return api_name.endswith("to_variable") - return False - - def to_static_ast(node, class_node): assert isinstance(node, gast.Call) assert isinstance(class_node, gast.Call) @@ -268,29 +266,6 @@ def to_static_ast(node, class_node): return node -def to_assign_node(node): - # Transform dygraph api `fluid.dygraph.to_variable` to static api `fluid.layers.assign`. - # NOTE: - # 1. Api `to_variable` supports data type {float16, float32, float64, int16, int32, int64, uint8, uint16}, - # but api `assign` only supports {float32, float64, int32, int64, bool}; - # 2. If the input of api `assign` is numpy.ndarray, its size cannot be greater than 1024 * 1024. - assert isinstance(node, gast.Call) - assign_api = gast.parse('fluid.layers.assign').body[0].value - node.func = assign_api - - if node.args: - node.args = [node.args[0]] - node.keywords = [] - else: - for idx, kw in enumerate(node.keywords): - if kw.arg == 'value': - node.keywords[idx].arg = 'input' - node.keywords = [node.keywords[idx]] - node.args = [] - break - return node - - def update_args_of_func(node, dygraph_node, method_name): assert isinstance(node, gast.Call) if method_name not in ["__init__", "forward"]: @@ -493,7 +468,7 @@ def recover_globals_attribute(src_obj, dst_obj): src_globals = getattr(src_obj, attr_name, {}) dst_globals = getattr(dst_obj, attr_name, {}) - for k, v in src_globals.items(): + for k, v in six.iteritems(src_globals): # ignore builtin attribute. if not (k.startswith('__') and k.endswith('__')): dst_globals[k] = v diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 853c16a5d0f7129f097f7fca860ab260f9dc7fd5..f67b79b91f7da235697d920cf0dfe376e88ab93e 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -754,7 +754,7 @@ def save(layer, model_path, input_spec=None, configs=None): # saved to inference program may not need by dygraph Layer, # we only record the state_dict variable's structured name state_names_dict = dict() - for structured_name, var in layer.state_dict().items(): + for structured_name, var in six.iteritems(layer.state_dict()): state_names_dict[var.name] = structured_name # 3. share parameters from Layer to scope & record var info diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index bb55c6725e6a62f2cef393fd34b249c217be0c54..8c4109674200bf97354444f92f00b13e053152a0 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -41,7 +41,7 @@ def monkey_patch_math_varbase(): The difference is, in dygraph mode, use auto-generated op functions for better performance. 
""" - @no_grad() + @no_grad def create_tensor(value, dtype, shape): out = _varbase_creator(dtype=dtype) out = core.ops.fill_constant(out, 'dtype', dtype, 'shape', shape, diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index bd578e6ba98a0f31a952bd5620b90e9464fe8666..5ecc713ddcace7a6bed05ffa4282d9f5c1041a44 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -349,38 +349,53 @@ class DataParallel(layers.Layer): Examples: .. code-block:: python - import numpy as np - import paddle.fluid as fluid - - place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id) - with fluid.dygraph.guard(place): - - # prepare the data parallel context - strategy = fluid.dygraph.prepare_context() - - linear = fluid.dygraph.Linear(1, 10, act="softmax") - adam = fluid.optimizer.AdamOptimizer( - learning_rate=0.001, parameter_list=linear.parameters()) - - # make the module become the data parallelism module - linear = fluid.dygraph.DataParallel(linear, strategy) - - x_data = np.random.random(size=[10, 1]).astype(np.float32) - data = fluid.dygraph.to_variable(x_data) - - hidden = linear(data) - avg_loss = fluid.layers.mean(hidden) - - # scale the loss according to the number of trainers. - avg_loss = linear.scale_loss(avg_loss) - - avg_loss.backward() - - # collect the gradients of trainers. - linear.apply_collective_grads() - - adam.minimize(avg_loss) - linear.clear_gradients() + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + import paddle.distributed as dist + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize parallel environment + dist.init_parallel_env() + + # 3. create data parallel layer & optimizer + layer = LinearNet() + dp_layer = paddle.DataParallel(layer) + + loss_fn = nn.MSELoss() + adam = opt.Adam( + learning_rate=0.001, parameters=dp_layer.parameters()) + + # 4. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + adam.step() + adam.clear_grad() + + if __name__ == '__main__': + # 1. start by ``paddle.distributed.spawn`` (default) + dist.spawn(train, nprocs=2) + # 2. start by ``paddle.distributed.launch`` + # train() """ if not self._is_data_parallel_mode(): return loss @@ -430,7 +445,7 @@ class DataParallel(layers.Layer): self._reshape_inplace(x=g_var, shape=g_shape) assert g_var.shape == g_shape - @no_grad() + @no_grad def apply_collective_grads(self): """ AllReduce the Parameters' gradient. @@ -438,38 +453,53 @@ class DataParallel(layers.Layer): Examples: .. 
code-block:: python - import numpy as np - import paddle.fluid as fluid - - place = fluid.CUDAPlace(fluid.dygraph.ParallelEnv().dev_id) - with fluid.dygraph.guard(place): - - # prepare the data parallel context - strategy = fluid.dygraph.prepare_context() - - linear = fluid.dygraph.Linear(1, 10, act="softmax") - adam = fluid.optimizer.AdamOptimizer( - learning_rate=0.001, parameter_list=linear.parameters()) - - # make the module become the data parallelism module - linear = fluid.dygraph.DataParallel(linear, strategy) - - x_data = np.random.random(size=[10, 1]).astype(np.float32) - data = fluid.dygraph.to_variable(x_data) - - hidden = linear(data) - avg_loss = fluid.layers.mean(hidden) - - # scale the loss according to the number of trainers. - avg_loss = linear.scale_loss(avg_loss) - - avg_loss.backward() - - # collect the gradients of trainers. - linear.apply_collective_grads() - - adam.minimize(avg_loss) - linear.clear_gradients() + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + import paddle.distributed as dist + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + def train(): + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize parallel environment + dist.init_parallel_env() + + # 3. create data parallel layer & optimizer + layer = LinearNet() + dp_layer = paddle.DataParallel(layer) + + loss_fn = nn.MSELoss() + adam = opt.Adam( + learning_rate=0.001, parameters=dp_layer.parameters()) + + # 4. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + adam.step() + adam.clear_grad() + + if __name__ == '__main__': + # 1. start by ``paddle.distributed.spawn`` (default) + dist.spawn(train, nprocs=2) + # 2. 
start by ``paddle.distributed.launch`` + # train() """ if not self._is_data_parallel_mode(): return diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py index 378c8fc23d7528766ca9eca062c87a4511e32b46..216478479a7cfdcffac5f21855d0974309842c89 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py @@ -42,6 +42,9 @@ op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize +SPARSE_OP_LIST = ["lookup_table", "lookup_table_v2"] +SPARSE_OP_TYPE_DICT = {"lookup_table": "W", "lookup_table_v2": "W"} + def _get_lr_ops(program): lr_ops = [] @@ -66,7 +69,7 @@ def _has_global_step(lr_ops): def is_sparse_op(op): - if op.type == "lookup_table" and op.attr('is_sparse') is True and op.attr( + if op.type in SPARSE_OP_LIST and op.attr('is_sparse') is True and op.attr( 'is_distributed') is False: return True @@ -78,7 +81,7 @@ def is_sparse_op(op): def is_distributed_sparse_op(op): - if op.type == "lookup_table" and op.attr('is_distributed') is True: + if op.type in SPARSE_OP_LIST and op.attr('is_distributed') is True: return True if op.type == "distributed_lookup_table" and op.attr( @@ -802,11 +805,10 @@ class CompileTimeStrategy(object): def _get_sparse_varnames(): varnames = [] - op_types = {"lookup_table": "W"} for op in origin_program.global_block().ops: - if op.type in op_types.keys() \ + if op.type in SPARSE_OP_TYPE_DICT.keys() \ and op.attr('remote_prefetch') is True: - param_name = op.input(op_types[op.type])[0] + param_name = op.input(SPARSE_OP_TYPE_DICT[op.type])[0] varnames.append(param_name) return list(set(varnames)) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index 201b3863a4b6d6d5fed036d85b2103f5defe61f0..5e6b8ca639952730f60e55718b0a8534404ca94c 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -40,6 +40,8 @@ LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() +SPARSE_OP_TYPE_DICT = {"lookup_table": "W", "lookup_table_v2": "W"} + DEVICE_LIST = ["cpu", "gpu", "xpu"] COMMUNICATE_OPS_TYPE = ["send", "recv", "fetch_barrier", "send_barrier"] DEFAULT_DEVICE = 'cpu' @@ -81,11 +83,10 @@ def distributed_ops_pass(program, config): def _get_pull_sparse_ops(_program): pull_sparse_ops = {} - op_types = {"lookup_table": "W"} for op in _program.global_block().ops: - if op.type in op_types.keys() \ + if op.type in SPARSE_OP_TYPE_DICT.keys() \ and op.attr('remote_prefetch') is True: - param_name = op.input(op_types[op.type])[0] + param_name = op.input(SPARSE_OP_TYPE_DICT[op.type])[0] ops = pull_sparse_ops.get(param_name, []) ops.append(op) pull_sparse_ops[param_name] = ops @@ -101,6 +102,7 @@ def distributed_ops_pass(program, config): w = program.global_block().vars[ops[0].input("W")[0]] padding_idx = ops[0].attr("padding_idx") is_distributed = ops[0].attr("is_distributed") + op_type = ops[0].type outputs = [ program.global_block().vars[op.output("Out")[0]] for op in ops @@ -149,7 
+151,8 @@ def distributed_ops_pass(program, config): "is_distributed": is_distributed, "pserver_num": len(pserver_endpoints), "padding_idx": padding_idx, - "trainer_id": trainer_id + "trainer_id": trainer_id, + "lookup_table_version": op_type }) else: raise ValueError( diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index 2a1945532e654605d2e2d45206daa3cd8306737f..f3563808d235b634c92671dfb654918001b6c8bc 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -348,6 +348,41 @@ class PSLib(Fleet): self._fleet_ptr.save_model(dirname, mode) self._role_maker._barrier_worker() + def save_model_with_whitelist(self, + executor, + dirname, + whitelist_path, + main_program=None, + **kwargs): + """ + save the model with a feature whitelist; the mode argument is consistent with + fleet.save_persistables, and when using fleet it will save sparse and dense features + + Args: + executor(Executor): fluid executor + dirname(str): save path. It can be hdfs/afs path or local path + whitelist_path(str): path of the feature whitelist file + main_program(Program): fluid program, default None + kwargs: user defined properties, currently supports the following: + mode(int): 0 means save all pserver model, + 1 means save delta pserver model (save diff), + 2 means save xbox base, + 3 means save batch model. + + Example: + .. code-block:: python + + fleet.save_model_with_whitelist(executor=exe, dirname="/you/path/to/model", + whitelist_path="/you/path/to/whitelist", mode = 0) + + """ + mode = kwargs.get("mode", 0) + table_id = kwargs.get("table_id", 0) + self._fleet_ptr.client_flush() + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.save_model_with_whitelist(table_id, dirname, mode, + whitelist_path) + self._role_maker._barrier_worker() + def save_cache_model(self, executor, dirname, main_program=None, **kwargs): """ save sparse cache table, @@ -480,6 +515,51 @@ class PSLib(Fleet): self._fleet_ptr.clear_model() self._role_maker._barrier_worker() + def load_pslib_whitelist(self, table_id, model_path, **kwargs): + """ + load pslib model for one table with a whitelist + + Args: + table_id(int): load table id + model_path(str): load model path, can be local or hdfs/afs path + kwargs(dict): user defined params, currently supports the following: + only for load pslib model for one table: + mode(int): load model mode. 0 is for load whole model, 1 is + for load delta model (load diff), default is 0. + only for load params from paddle model: + scope(Scope): Scope object + model_proto_file(str): path of program desc proto binary + file, can be local or hdfs/afs file + var_names(list): var name list + load_combine(bool): load from a file or split param files + default False. + + Examples: + ..
code-block:: python + + # load pslib model for one table + fleet.load_pslib_whitelist(0, "hdfs:/my_fleet_model/20190714/0/") + fleet.load_pslib_whitelist(1, "hdfs:/xx/xxx", mode = 0) + + # load params from paddle model + fleet.load_pslib_whitelist(2, "hdfs:/my_paddle_model/", + scope = my_scope, + model_proto_file = "./my_program.bin", + load_combine = False) + + # below is how to save proto binary file + with open("my_program.bin", "wb") as fout: + my_program = fluid.default_main_program() + fout.write(my_program.desc.serialize_to_string()) + + """ + self._role_maker._barrier_worker() + mode = kwargs.get("mode", 0) + if self._role_maker.is_first_worker(): + self._fleet_ptr.load_table_with_whitelist(table_id, model_path, + mode) + self._role_maker._barrier_worker() + def load_one_table(self, table_id, model_path, **kwargs): """ load pslib model for one table or load params from paddle model diff --git a/python/paddle/fluid/input.py b/python/paddle/fluid/input.py index 15a3022f932f4a702bf7f94ed936468b6a06e94e..529588c0846b5a90a842c398bbb4409a04f35d53 100644 --- a/python/paddle/fluid/input.py +++ b/python/paddle/fluid/input.py @@ -129,6 +129,7 @@ def one_hot(input, depth, allow_out_of_range=False): return one_hot_out +@deprecated(since='2.0.0', update_to='paddle.nn.functional.embedding') def embedding(input, size, is_sparse=False, diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 19822e410c71aa993e2d90a92c57c3522023ad81..db556913384785e1f11ba05dcc524ef1f1de92ab 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -147,8 +147,10 @@ class LayerHelper(LayerHelperBase): if 'use_cudnn' in self.kwargs and self.kwargs.get('use_cudnn'): act['use_cudnn'] = self.kwargs.get('use_cudnn') - if 'use_mkldnn' in self.kwargs: - act['use_mkldnn'] = self.kwargs.get('use_mkldnn') + use_mkldnn = self.kwargs.get( + 'use_mkldnn', core.globals().get("FLAGS_use_mkldnn", False)) + if use_mkldnn: + act['use_mkldnn'] = use_mkldnn act_type = act.pop('type') tmp = self.create_variable_for_type_inference(dtype=input_var.dtype) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 39c4df00657daccb88ae1ad95781891c4c6ec11e..e77f58d31f7c49ebb315523e6a071136090dece2 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -367,6 +367,7 @@ def fc(input, return helper.append_activation(pre_activation) +@deprecated(since="2.0.0", update_to="paddle.nn.functional.embedding") def embedding(input, size, is_sparse=False, @@ -15013,6 +15014,7 @@ def gather_tree(ids, parents): return out +@deprecated(since="2.0.0", update_to="paddle.uniform") @templatedoc() def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0, name=None): diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 3891952d5b95f2dec4fc503239758d1370d4f72f..2dd654c35c35b2b8212e7285d7830f82e9b53a66 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -62,7 +62,7 @@ class Optimizer(object): but need to use one of it's implementation.
""" - @imperative_base.no_grad() + @imperative_base.no_grad def __init__(self, learning_rate, parameter_list=None, @@ -898,7 +898,7 @@ class Optimizer(object): if p.trainable: p.clear_gradient() - @imperative_base.no_grad() + @imperative_base.no_grad def minimize(self, loss, startup_program=None, @@ -1016,7 +1016,7 @@ class SGDOptimizer(Optimizer): name=name) self.type = "sgd" - @no_grad() + @no_grad def _append_optimize_op(self, block, param_and_grad): lr = self._create_param_lr(param_and_grad) if framework.in_dygraph_mode(): @@ -1553,7 +1553,7 @@ class DGCMomentumOptimizer(Optimizer): dgc_op._set_attr(op_maker.kOpRoleVarAttrName(), [param_var.name, grad_var.name]) - @imperative_base.no_grad() + @imperative_base.no_grad def apply_gradients(self, params_grads): params_grads = sorted(params_grads, key=lambda x: x[0].name) params_grads, table_param_and_grad, table_optimize_op = \ diff --git a/python/paddle/fluid/tests/demo/executor_train_dataset.py b/python/paddle/fluid/tests/demo/executor_train_dataset.py deleted file mode 100644 index 6938982de725c296aae29e70d0640749d0876353..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/demo/executor_train_dataset.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import tarfile -import paddle.fluid as fluid -import paddle -from paddle.fluid import core - -URL = 'http://paddle-unittest-data.gz.bcebos.com/python_paddle_fluid_tests_demo_async-executor/train_data.tar.gz' -MD5 = '2a405a31508969b3ab823f42c0f522ca' - - -def bow_net(data, - label, - dict_dim=89528, - emb_dim=128, - hid_dim=128, - hid_dim2=96, - class_dim=2): - """ - BOW net - This model is from https://github.com/PaddlePaddle/models: - models/fluid/PaddleNLP/text_classification/nets.py - """ - # embedding - emb = fluid.layers.embedding( - input=data, size=[dict_dim, emb_dim], is_sparse=True) - bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') - bowh = fluid.layers.tanh(bow) - # fc layer after conv - fc_1 = fluid.layers.fc(input=bowh, size=hid_dim, act="tanh") - fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") - # probability of each class - prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") - # cross entropy loss - cost = fluid.layers.cross_entropy(input=prediction, label=label) - # mean loss - avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - return avg_cost, acc, prediction - - -def train(): - # Download data - with tarfile.open(paddle.dataset.common.download(URL, "imdb", MD5)) as tarf: - tarf.extractall(path='./') - tarf.close() - - # Initialize dataset description - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_batch_size(128) # See API doc for how to change other fields - - # define network - # input text data - data = fluid.layers.data( - name="words", shape=[1], dtype="int64", lod_level=1) - # label data - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - dataset.set_use_var([data, label]) - avg_cost, acc, prediction = bow_net(data, label) - sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002) - opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost) - - # Run startup program - startup_program = fluid.default_startup_program() - place = fluid.CPUPlace() - executor = fluid.Executor(place) - executor.run(startup_program) - - main_program = fluid.default_main_program() - epochs = 10 - filelist = ["train_data/part-%d" % i for i in range(12)] - dataset.set_filelist(filelist) - for i in range(epochs): - dataset.set_thread(4) - executor.train_from_dataset( - main_program, # This can be changed during iteration - dataset, # This can be changed during iteration - debug=False) - fluid.io.save_inference_model('imdb/epoch%d.model' % i, - [data.name, label.name], [acc], executor) - - -if __name__ == "__main__": - train() diff --git a/python/paddle/fluid/tests/demo/fc_gan.py b/python/paddle/fluid/tests/demo/fc_gan.py deleted file mode 100644 index bd77779ce6ab5cf19e3e5ace3e51e39734b27c10..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/demo/fc_gan.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import errno -import math -import os - -import matplotlib -import numpy - -import paddle -import paddle.fluid as fluid - -matplotlib.use('Agg') -import matplotlib.pyplot as plt -import matplotlib.gridspec as gridspec - -NOISE_SIZE = 100 -NUM_PASS = 1000 -NUM_REAL_IMGS_IN_BATCH = 121 -NUM_TRAIN_TIMES_OF_DG = 3 -LEARNING_RATE = 2e-5 - - -def D(x): - hidden = fluid.layers.fc(input=x, - size=200, - act='relu', - param_attr='D.w1', - bias_attr='D.b1') - logits = fluid.layers.fc(input=hidden, - size=1, - act=None, - param_attr='D.w2', - bias_attr='D.b2') - return logits - - -def G(x): - hidden = fluid.layers.fc(input=x, - size=200, - act='relu', - param_attr='G.w1', - bias_attr='G.b1') - img = fluid.layers.fc(input=hidden, - size=28 * 28, - act='tanh', - param_attr='G.w2', - bias_attr='G.b2') - return img - - -def plot(gen_data): - gen_data.resize(gen_data.shape[0], 28, 28) - n = int(math.ceil(math.sqrt(gen_data.shape[0]))) - fig = plt.figure(figsize=(n, n)) - gs = gridspec.GridSpec(n, n) - gs.update(wspace=0.05, hspace=0.05) - - for i, sample in enumerate(gen_data): - ax = plt.subplot(gs[i]) - plt.axis('off') - ax.set_xticklabels([]) - ax.set_yticklabels([]) - ax.set_aspect('equal') - plt.imshow(sample.reshape(28, 28), cmap='Greys_r') - - return fig - - -def main(): - try: - os.makedirs("./out") - except OSError as e: - if e.errno != errno.EEXIST: - raise - - startup_program = fluid.Program() - d_program = fluid.Program() - dg_program = fluid.Program() - - with fluid.program_guard(d_program, startup_program): - img = fluid.layers.data(name='img', shape=[784], dtype='float32') - d_loss = fluid.layers.sigmoid_cross_entropy_with_logits( - x=D(img), - label=fluid.layers.data( - name='label', shape=[1], dtype='float32')) - d_loss = fluid.layers.mean(d_loss) - - with fluid.program_guard(dg_program, startup_program): - noise = fluid.layers.data( - name='noise', shape=[NOISE_SIZE], dtype='float32') - g_img = G(x=noise) - g_program = dg_program.clone() - dg_loss = fluid.layers.sigmoid_cross_entropy_with_logits( - x=D(g_img), - label=fluid.layers.fill_constant_batch_size_like( - input=noise, dtype='float32', shape=[-1, 1], value=1.0)) - dg_loss = fluid.layers.mean(dg_loss) - - opt = fluid.optimizer.Adam(learning_rate=LEARNING_RATE) - - opt.minimize(loss=d_loss, startup_program=startup_program) - opt.minimize( - loss=dg_loss, - startup_program=startup_program, - parameter_list=[ - p.name for p in g_program.global_block().all_parameters() - ]) - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(startup_program) - - num_true = NUM_REAL_IMGS_IN_BATCH - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=60000), - batch_size=num_true) - - for pass_id in range(NUM_PASS): - for batch_id, data in enumerate(train_reader()): - num_true = len(data) - n = numpy.random.uniform( - low=-1.0, high=1.0, - size=[num_true * NOISE_SIZE]).astype('float32').reshape( - [num_true, NOISE_SIZE]) - generated_img = exe.run(g_program, - feed={'noise': n}, - fetch_list={g_img})[0] - real_data = numpy.array([x[0] for x in data]).astype('float32') - real_data = real_data.reshape(num_true, 784) - total_data = numpy.concatenate([real_data, generated_img]) - total_label = numpy.concatenate([ - numpy.ones( - shape=[real_data.shape[0], 1], dtype='float32'), - numpy.zeros( - shape=[real_data.shape[0], 1], dtype='float32') - ]) - d_loss_np = exe.run(d_program, - feed={'img': total_data, - 'label': total_label}, - fetch_list={d_loss})[0] - for _ in 
range(NUM_TRAIN_TIMES_OF_DG): - n = numpy.random.uniform( - low=-1.0, high=1.0, - size=[2 * num_true * NOISE_SIZE]).astype('float32').reshape( - [2 * num_true, NOISE_SIZE, 1, 1]) - dg_loss_np = exe.run(dg_program, - feed={'noise': n}, - fetch_list={dg_loss})[0] - print("Pass ID={0}, Batch ID={1}, D-Loss={2}, DG-Loss={3}".format( - pass_id, batch_id, d_loss_np, dg_loss_np)) - # generate image each batch - fig = plot(generated_img) - plt.savefig( - 'out/{0}.png'.format(str(pass_id).zfill(3)), bbox_inches='tight') - plt.close(fig) - - -if __name__ == '__main__': - main() diff --git a/python/paddle/fluid/tests/demo/pipeline_train.py b/python/paddle/fluid/tests/demo/pipeline_train.py deleted file mode 100644 index 2f75908a160fd3c61c743dc407095d645737a534..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/demo/pipeline_train.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. - -import numpy as np -import copy -import pickle -import os -from functools import partial -import logging -import time -import paddle -import paddle.fluid as fluid -import paddle.fluid.layers as layers -import argparse -import random -import sys -import math - -logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger("fluid") -logger.setLevel(logging.INFO) - -is_profile = False - - -def parse_args(): - parser = argparse.ArgumentParser("Resnet with pipelie parallel.") - parser.add_argument( - '--batch_size', type=int, default=100, help='input batch size') - parser.add_argument('--lr', type=float, default=0.001, help='learning rate') - return parser.parse_args() - - -def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1, - act=None): - conv = fluid.layers.conv2d( - input=input, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=groups, - act=None, - bias_attr=False) - return fluid.layers.batch_norm( - input=conv, - act=act, ) - - -def shortcut(input, ch_out, stride, is_first): - ch_in = input.shape[1] - if ch_in != ch_out or stride != 1 or is_first == True: - return conv_bn_layer(input, ch_out, 1, stride) - else: - return input - - -def bottleneck_block(input, num_filters, stride): - conv0 = conv_bn_layer( - input=input, num_filters=num_filters, filter_size=1, act='relu') - conv1 = conv_bn_layer( - input=conv0, - num_filters=num_filters, - filter_size=3, - stride=stride, - act='relu') - conv2 = conv_bn_layer( - input=conv1, num_filters=num_filters * 4, filter_size=1, act=None) - - short = shortcut(input, num_filters * 4, stride, is_first=False) - - return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') - - -def basic_block(input, num_filters, stride, is_first): - conv0 = conv_bn_layer( - input=input, - num_filters=num_filters, - filter_size=3, - act='relu', - stride=stride) - conv1 = conv_bn_layer( - input=conv0, num_filters=num_filters, filter_size=3, act=None) - short = shortcut(input, 
num_filters, stride, is_first) - return fluid.layers.elementwise_add(x=short, y=conv1, act='relu') - - -def network(input, layers=50, class_dim=1000): - supported_layers = [18, 34, 50, 101, 152] - assert layers in supported_layers - depth = None - if layers == 18: - depth = [2, 2, 2, 2] - elif layers == 34 or layers == 50: - depth = [3, 4, 6, 3] - elif layers == 101: - depth = [3, 4, 23, 3] - elif layers == 152: - depth = [3, 8, 36, 3] - num_filters = [64, 128, 256, 512] - with fluid.device_guard("gpu:0"): - conv = conv_bn_layer( - input=input, num_filters=64, filter_size=7, stride=2, act='relu') - conv = fluid.layers.pool2d( - input=conv, - pool_size=3, - pool_stride=2, - pool_padding=1, - pool_type='max') - if layers >= 50: - for block in range(len(depth)): - with fluid.device_guard("gpu:1"): - for i in range(depth[block]): - conv = bottleneck_block( - input=conv, - num_filters=num_filters[block], - stride=2 if i == 0 and block != 0 else 1) - - with fluid.device_guard("gpu:2"): - pool = fluid.layers.pool2d( - input=conv, pool_size=7, pool_type='avg', global_pooling=True) - stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) - out = fluid.layers.fc( - input=pool, - size=class_dim, - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv))) - else: - for block in range(len(depth)): - with fluid.device_guard("gpu:1"): - for i in range(depth[block]): - conv = basic_block( - input=conv, - num_filters=num_filters[block], - stride=2 if i == 0 and block != 0 else 1, - is_first=block == i == 0) - with fluid.device_guard("gpu:2"): - pool = fluid.layers.pool2d( - input=conv, pool_size=7, pool_type='avg', global_pooling=True) - stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) - out = fluid.layers.fc( - input=pool, - size=class_dim, - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv))) - return out - - -def train(): - args = parse_args() - lr = args.lr - - with fluid.device_guard("gpu:0"): - image = fluid.layers.data( - name="image", shape=[3, 224, 224], dtype="float32") - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - data_loader = fluid.io.DataLoader.from_generator( - feed_list=[image, label], - capacity=64, - use_double_buffer=True, - iterable=False) - fc = build_network(image, layers=50) - - with fluid.device_guard("gpu:3"): - out, prob = fluid.layers.softmax_with_cross_entropy( - logits=fc, label=label, return_softmax=True) - loss = fluid.layers.mean(out) - acc_top1 = fluid.layers.accuracy(input=prob, label=label, k=1) - acc_top5 = fluid.layers.accuracy(input=prob, label=label, k=5) - - optimizer = fluid.optimizer.SGD(lr) - optimizer = fluid.optimizer.PipelineOptimizer(optimizer, num_microbatches=2) - optimizer.minimize(loss) - - def train_reader(): - for _ in range(4000): - img = np.random.random(size=[3, 224, 224]).astype('float32') - label = np.random.random(size=[1]).astype('int64') - yield img, label - - data_loader.set_sample_generator(train_reader, batch_size=args.batch_size) - - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - exe.run(fluid.default_startup_program()) - - data_loader.start() - logger.info("begin training...") - exe.train_from_dataset(fluid.default_main_program(), debug=is_profile) - - -if __name__ == "__main__": - train() diff --git a/python/paddle/fluid/tests/demo/pyreader.py b/python/paddle/fluid/tests/demo/pyreader.py deleted file mode 100644 index 6995346ffa61ea65119930296be2fba5a10c5451..0000000000000000000000000000000000000000 --- 
a/python/paddle/fluid/tests/demo/pyreader.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import numpy -import six - -import paddle -import paddle.dataset.mnist as mnist -import paddle.fluid as fluid - - -def network(is_train): - reader = fluid.layers.py_reader( - capacity=10, - shapes=((-1, 784), (-1, 1)), - dtypes=('float32', 'int64'), - name="train_reader" if is_train else "test_reader", - use_double_buffer=True) - img, label = fluid.layers.read_file(reader) - - hidden = img - - for i in six.moves.xrange(2): - hidden = fluid.layers.fc(input=hidden, size=100, act='tanh') - hidden = fluid.layers.dropout( - hidden, dropout_prob=0.5, is_test=not is_train) - - prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - return fluid.layers.mean(loss), reader - - -def main(): - train_prog = fluid.Program() - startup_prog = fluid.Program() - - with fluid.program_guard(train_prog, startup_prog): - with fluid.unique_name.guard(): - loss, train_reader = network(True) - adam = fluid.optimizer.Adam(learning_rate=0.01) - adam.minimize(loss) - - test_prog = fluid.Program() - test_startup = fluid.Program() - with fluid.program_guard(test_prog, test_startup): - with fluid.unique_name.guard(): - test_loss, test_reader = network(False) - - use_cuda = fluid.core.is_compiled_with_cuda() - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - fluid.Executor(place).run(startup_prog) - fluid.Executor(place).run(test_startup) - - trainer = fluid.ParallelExecutor( - use_cuda=use_cuda, loss_name=loss.name, main_program=train_prog) - - tester = fluid.ParallelExecutor( - use_cuda=use_cuda, share_vars_from=trainer, main_program=test_prog) - - train_reader.decorate_paddle_reader( - paddle.reader.shuffle( - paddle.batch(mnist.train(), 512), buf_size=8192)) - - test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512)) - - for epoch_id in six.moves.xrange(10): - train_reader.start() - try: - while True: - print( - 'train_loss', - numpy.array(trainer.run(fetch_list=[loss.name]))) - except fluid.core.EOFException: - print('End of epoch', epoch_id) - train_reader.reset() - - test_reader.start() - try: - while True: - print( - 'test loss', - numpy.array(tester.run(fetch_list=[test_loss.name]))) - except fluid.core.EOFException: - print('End of testing') - test_reader.reset() - - -if __name__ == '__main__': - main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6220bf62c79c30737f923e744d5670818f54ff6e..a25cba029dd8bac81d6b00c1d9fb710f421ce9d0 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -432,8 +432,6 @@ if(WITH_DISTRIBUTE) list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_lars") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_train") 
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_save_load") - list(REMOVE_ITEM DIST_TEST_OPS "test_dist_simnet_bow") - list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ctr") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_text_classification") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_train") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_word2vec") @@ -587,8 +585,10 @@ endif() # setting timeout value for old unittests # set_tests_properties(test_dist_fleet_sparse_embedding_ctr PROPERTIES TIMEOUT 200) -set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 150) -set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200) -set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150) -set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150) -set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150) +if(NOT WIN32 AND NOT APPLE) + set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 150) + set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200) + set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150) + set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150) + set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150) +endif() diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py index 73b546b95cfeb8032c6e99eabe24c883d1f5f66c..dc39472d7aed8f52ee3bb0f85a5e503db9093070 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py @@ -196,8 +196,7 @@ class TestDistCTR2x2(FleetDistRunnerBase): fleet.stop_worker() def do_dataset_training(self, fleet): - dnn_input_dim, lr_input_dim, train_file_path = ctr_dataset_reader.prepare_data( - ) + train_file_list = ctr_dataset_reader.prepare_fake_data() exe = fluid.Executor(fluid.CPUPlace()) @@ -206,9 +205,7 @@ class TestDistCTR2x2(FleetDistRunnerBase): thread_num = 2 batch_size = 128 - filelist = [] - for _ in range(thread_num): - filelist.append(train_file_path) + filelist = train_file_list # config dataset dataset = paddle.distributed.fleet.DatasetFactory().create_dataset() diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py similarity index 55% rename from python/paddle/fluid/tests/unittests/dist_simnet_bow.py rename to python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py index 9fcba2aede1cea3c78108e7daa8eb34a1ab80048..7d5ca4fc6e3916eab29942c85e88664f60cbf032 100644 --- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py @@ -19,6 +19,8 @@ import argparse import time import math import random +import shutil +import tempfile import paddle import paddle.fluid as fluid @@ -29,7 +31,8 @@ from multiprocessing import Process import os import signal from functools import reduce -from test_dist_base import TestDistRunnerBase, runtime_main +from test_dist_fleet_base import runtime_main, FleetDistRunnerBase +from paddle.distributed.fleet.base.util_factory import fleet_util DTYPE = "int64" DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/simnet.train.1000' @@ -49,6 +52,18 @@ fluid.default_startup_program().random_seed = 1 fluid.default_main_program().random_seed = 1 +def fake_simnet_reader(): + def reader(): + for _ in range(1000): + q = np.random.random_integers(0, 1500 - 1, size=1).tolist() + label = np.random.random_integers(0, 1, size=1).tolist() + pt = np.random.random_integers(0, 1500 - 1, size=1).tolist() + nt = 
np.random.random_integers(0, 1500 - 1, size=1).tolist() + yield [q, label, pt, nt] + + return reader + + def get_acc(cos_q_nt, cos_q_pt, batch_size): cond = fluid.layers.less_than(cos_q_nt, cos_q_pt) cond = fluid.layers.cast(cond, dtype='float64') @@ -75,34 +90,40 @@ def get_loss(cos_q_pt, cos_q_nt): return avg_cost -def get_optimizer(op="sgd"): - if op.upper() == "sgd".upper(): - optimizer = fluid.optimizer.SGD(learning_rate=base_lr) - elif op.upper() == "adam".upper(): - optimizer = fluid.optimizer.Adam(learning_rate=base_lr) - else: - optimizer = fluid.optimizer.SGD(learning_rate=base_lr) - return optimizer - - def train_network(batch_size, is_distributed=False, is_sparse=False, - is_self_contained_lr=False): + is_self_contained_lr=False, + is_pyreader=False): # query q = fluid.layers.data( name="query_ids", shape=[1], dtype="int64", lod_level=1) + # label data + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + # pt + pt = fluid.layers.data( + name="pos_title_ids", shape=[1], dtype="int64", lod_level=1) + # nt + nt = fluid.layers.data( + name="neg_title_ids", shape=[1], dtype="int64", lod_level=1) + + datas = [q, label, pt, nt] + + reader = None + if is_pyreader: + reader = fluid.io.PyReader( + feed_list=datas, + capacity=64, + iterable=False, + use_double_buffer=False) + # embedding q_emb = fluid.embedding( input=q, is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), - name="__emb__", - learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), - name="__emb__"), + initializer=fluid.initializer.Constant(value=0.01), name="__emb__"), is_sparse=is_sparse) q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) # vsum @@ -115,12 +136,8 @@ def train_network(batch_size, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__q_fc__", - learning_rate=base_lr)) - # label data - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - # pt - pt = fluid.layers.data( - name="pos_title_ids", shape=[1], dtype="int64", lod_level=1) + learning_rate=base_lr), ) + # embedding pt_emb = fluid.embedding( input=pt, @@ -129,9 +146,7 @@ def train_network(batch_size, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0.01), name="__emb__", - learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), - name="__emb__"), + learning_rate=emb_lr), is_sparse=is_sparse) pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) # vsum @@ -142,24 +157,16 @@ def train_network(batch_size, input=pt_ss, size=hid_dim, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), - name="__fc__", - learning_rate=base_lr), + initializer=fluid.initializer.Constant(value=0.01), name="__fc__"), bias_attr=fluid.ParamAttr(name="__fc_b__")) - # nt - nt = fluid.layers.data( - name="neg_title_ids", shape=[1], dtype="int64", lod_level=1) + # embedding nt_emb = fluid.embedding( input=nt, is_distributed=is_distributed, size=[dict_dim, emb_dim], param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), - name="__emb__", - learning_rate=emb_lr) if is_self_contained_lr else fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), - name="__emb__"), + initializer=fluid.initializer.Constant(value=0.01), name="__emb__"), is_sparse=is_sparse) nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) # vsum 
@@ -170,9 +177,7 @@ def train_network(batch_size, input=nt_ss, size=hid_dim, param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01), - name="__fc__", - learning_rate=base_lr), + initializer=fluid.initializer.Constant(value=0.01), name="__fc__"), bias_attr=fluid.ParamAttr(name="__fc_b__")) cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc) cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc) @@ -180,79 +185,67 @@ def train_network(batch_size, avg_cost = get_loss(cos_q_pt, cos_q_nt) # acc acc = get_acc(cos_q_nt, cos_q_pt, batch_size) - return [avg_cost, acc, cos_q_pt] - - -def combination(x, y): - res = [[[xi, yi] for yi in y] for xi in x] - return res[0] - - -def get_one_data(file_list): - for file in file_list: - contents = [] - with open(file, "r") as fin: - for i in fin: - contents.append(i.strip()) - for index, q in enumerate(contents): - try: - one_data = [[int(j) for j in i.split(" ")] - for i in q.split(";")[:-1]] - if one_data[1][0] + one_data[1][1] != len(one_data) - 3: - q = fin.readline() - continue - tmp = combination(one_data[3:3 + one_data[1][0]], - one_data[3 + one_data[1][0]:]) - except Exception as e: - continue - - for each in tmp: - yield [one_data[2], 0, each[0], each[1]] - - -def get_batch_reader(file_list, batch_size): - def batch_reader(): - res = [] - for i in get_one_data(file_list): - if random.random() <= sample_rate: - res.append(i) - if len(res) >= batch_size: - yield res - res = [] - - return batch_reader - - -def get_train_reader(batch_size): - # The training data set. - train_file = os.path.join(paddle.dataset.common.DATA_HOME, "simnet", - "train") - train_reader = get_batch_reader([train_file], batch_size) - train_feed = ["query_ids", "pos_title_ids", "neg_title_ids", "label"] - return train_reader, train_feed - - -class TestDistSimnetBow2x2(TestDistRunnerBase): - def get_model(self, batch_size=2): - # Train program - avg_cost, acc, predict = \ - train_network(batch_size, - bool(int(os.environ["IS_DISTRIBUTED"])), - bool(int(os.environ["IS_SPARSE"])), - bool(int(os.environ["IS_SELF_CONTAINED_LR"]))) - - inference_program = fluid.default_main_program().clone() - - # Optimization - opt = os.getenv('OPTIMIZER', 'sgd') - opt = get_optimizer(opt) - opt.minimize(avg_cost) - - # Reader - train_reader, _ = get_train_reader(batch_size) - return inference_program, avg_cost, train_reader, train_reader, acc, predict + return avg_cost, acc, cos_q_pt, reader + + +class TestDistSimnetBow2x2(FleetDistRunnerBase): + """ + For test SimnetBow model, use Fleet api + """ + + def net(self, args, batch_size=4, lr=0.01): + avg_cost, _, predict, self.reader = \ + train_network(batch_size=batch_size, is_distributed=False, + is_sparse=True, is_self_contained_lr=False, is_pyreader=(args.reader == "pyreader")) + self.avg_cost = avg_cost + self.predict = predict + + return avg_cost + + def check_model_right(self, dirname): + model_filename = os.path.join(dirname, "__model__") + + with open(model_filename, "rb") as f: + program_desc_str = f.read() + + program = fluid.Program.parse_from_string(program_desc_str) + with open(os.path.join(dirname, "__model__.proto"), "w") as wn: + wn.write(str(program)) + + def do_pyreader_training(self, fleet): + """ + do training using dataset, using fetch handler to catch variable + Args: + fleet(Fleet api): the fleet object of Parameter Server, define distribute training role + """ + + exe = fluid.Executor(fluid.CPUPlace()) + fleet.init_worker() + exe.run(fluid.default_startup_program()) + batch_size = 4 + # reader + train_reader = 
paddle.batch(fake_simnet_reader(), batch_size=batch_size) + self.reader.decorate_sample_list_generator(train_reader) + for epoch_id in range(1): + self.reader.start() + try: + pass_start = time.time() + while True: + loss_val = exe.run(program=fluid.default_main_program(), + fetch_list=[self.avg_cost.name]) + loss_val = np.mean(loss_val) + message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id, + loss_val) + fleet_util.print_on_rank(message, 0) + + pass_time = time.time() - pass_start + except fluid.core.EOFException: + self.reader.reset() + fleet.stop_worker() + + def do_dataset_training(self, fleet): + pass if __name__ == "__main__": - paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train") runtime_main(TestDistSimnetBow2x2) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py index 3e6fe168b8eaf39286c518c8b4a2ad6d48b0e6bb..29b4f1b05f9c2911b849b323674b3a704a1da297 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py @@ -19,9 +19,11 @@ import unittest import inspect import gast +import paddle import paddle.fluid as fluid import paddle.fluid.dygraph as dygraph +from paddle import to_tensor from paddle.fluid.dygraph import to_variable from paddle.fluid.dygraph.jit import dygraph_to_static_func from paddle.fluid.dygraph.dygraph_to_static.utils import is_dygraph_api @@ -45,11 +47,19 @@ def dyfunc_to_variable_3(x): return res +def dyfunc_to_tensor(x): + res1 = paddle.to_tensor(x, dtype=None, place=None, stop_gradient=True) + res2 = paddle.tensor.to_tensor(data=res1) + res3 = to_tensor(data=res2) + return res3 + + class TestDygraphBasicApi_ToVariable(unittest.TestCase): def setUp(self): self.input = np.ones(5).astype("int32") self.test_funcs = [ - dyfunc_to_variable, dyfunc_to_variable_2, dyfunc_to_variable_3 + dyfunc_to_tensor, dyfunc_to_variable, dyfunc_to_variable_2, + dyfunc_to_variable_3 ] self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( ) else fluid.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py index 8e35dd78457bb59bb4882bc1deeb23539f47012a..b72149a29c73ff9e1fa1975c3caffebb6202e0b7 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py @@ -123,7 +123,7 @@ class TestConvertWithCache(unittest.TestCase): @declarative -def sum_even_util_limit(max_len, limit): +def sum_even_until_limit(max_len, limit): ret_sum = fluid.dygraph.to_variable(np.zeros((1)).astype('int32')) for i in range(max_len): if i % 2 > 0: @@ -147,7 +147,7 @@ def sum_under_while(limit): class TestToOutputWithCache(unittest.TestCase): def test_output(self): with fluid.dygraph.guard(): - ret = sum_even_util_limit(80, 10) + ret = sum_even_until_limit(80, 10) self.assertEqual(ret.numpy(), 30) ret = declarative(sum_under_while)(100) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py index 4a689354f56757ba754b76e3d407cb7083b95b3b..949286f63efb3357325f25b02f60e938eebd28e8 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py +++ 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py @@ -13,11 +13,15 @@ # limitations under the License. import numpy as np -from paddle.static import InputSpec +import unittest + +import paddle import paddle.fluid as fluid +from paddle.static import InputSpec from paddle.fluid.dygraph import to_variable, declarative, ProgramTranslator, Layer, jit +from paddle.fluid.dygraph.dygraph_to_static.program_translator import ConcreteProgram -import unittest +from test_basic_api_transformation import dyfunc_to_variable program_trans = ProgramTranslator() @@ -179,6 +183,9 @@ def foo_func(a, b, c=1, d=2): class TestDifferentInputSpecCacheProgram(unittest.TestCase): + def setUp(self): + program_trans.enable(True) + def test_with_different_input(self): with fluid.dygraph.guard(fluid.CPUPlace()): x_data = np.ones([16, 10]).astype('float32') @@ -191,6 +198,7 @@ class TestDifferentInputSpecCacheProgram(unittest.TestCase): out_1 = foo(to_variable(x_data), to_variable(y_data)) self.assertTrue(np.allclose(x_data + y_data, out_1.numpy())) self.assertTrue(len(foo.program_cache) == 1) + self.assertTrue(len(foo.program_cache.concrete_programs()) == 1) # [16, 10] + [10] (numpy) out_2 = foo(to_variable(x_data), y_data) @@ -245,6 +253,47 @@ class TestDifferentInputSpecCacheProgram(unittest.TestCase): concrete_program_5 = foo.get_concrete_program( InputSpec([10]), InputSpec([10]), e=4) + def test_concrete_program(self): + with fluid.dygraph.guard(fluid.CPUPlace()): + + # usage 1 + foo_1 = paddle.jit.to_static( + foo_func, + input_spec=[ + InputSpec( + [10], name='x'), InputSpec( + [10], name='y') + ]) + self.assertTrue(isinstance(foo_1.concrete_program, ConcreteProgram)) + + # usage 2 + foo_2 = paddle.jit.to_static(foo_func) + out = foo_2(paddle.rand([10]), paddle.rand([10])) + self.assertTrue(isinstance(foo_2.concrete_program, ConcreteProgram)) + + # raise error + foo_3 = paddle.jit.to_static(foo_func) + with self.assertRaises(ValueError): + foo_3.concrete_program + + +class TestDeclarativeAPI(unittest.TestCase): + def test_error(self): + func = declarative(dyfunc_to_variable) + + paddle.enable_static() + + # Failed to run the callable object decorated by '@paddle.jit.to_static' + # if it does NOT in dynamic mode. + with self.assertRaises(RuntimeError): + func(np.ones(5).astype("int32")) + + program_trans.enable(False) + with self.assertRaises(AssertionError): + # AssertionError: We Only support to_variable in imperative mode, + # please use fluid.dygraph.guard() as context to run it in imperative Mode + func(np.ones(5).astype("int32")) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py index 214cd95d3bc620b3bcadb88e57c7e54a593eaaf4..510b615654751500c33dc3311353ba7e2f8baf40 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -86,11 +86,11 @@ class TestLoggingUtils(unittest.TestCase): with mock.patch.object(sys, 'stdout', stream): logging_utils.warn(warn_msg) logging_utils.error(error_msg) - self.translator_logger.verbosity_level = 2 + self.translator_logger.verbosity_level = 1 logging_utils.log(1, log_msg_1) logging_utils.log(2, log_msg_2) - result_msg = '\n'.join([warn_msg, error_msg, log_msg_2, ""]) + result_msg = '\n'.join([warn_msg, error_msg, log_msg_1, ""]) self.assertEqual(result_msg, stream.getvalue()) def test_log_transformed_code(self): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index 1ef3bd1bf150056816283c83fa3ff6af1e589732..bd600d2f2dbd6341ff7a83d6636047d01cae7859 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -154,6 +154,18 @@ class TestMNISTWithToStatic(TestMNIST): msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss, static_loss)) + def test_mnist_declarative_cpu_vs_mkldnn(self): + dygraph_loss_cpu = self.train_dygraph() + fluid.set_flags({'FLAGS_use_mkldnn': True}) + try: + dygraph_loss_mkldnn = self.train_dygraph() + finally: + fluid.set_flags({'FLAGS_use_mkldnn': False}) + self.assertTrue( + np.allclose(dygraph_loss_cpu, dygraph_loss_mkldnn), + msg='cpu dygraph is {}\n mkldnn dygraph is \n{}'.format( + dygraph_loss_cpu, dygraph_loss_mkldnn)) + def train(self, to_static=False): prog_trans = ProgramTranslator() prog_trans.enable(to_static) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index 6556b2f03bd5304e290792d07d1d969ab255bfdc..203c8ddb3488c0fef9a0a590378505e5b61233cf 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -346,6 +346,13 @@ class TestResnet(unittest.TestCase): dygraph_loss)) self.verify_predict() + def test_in_static_mode_mkldnn(self): + fluid.set_flags({'FLAGS_use_mkldnn': True}) + try: + train(to_static=True) + finally: + fluid.set_flags({'FLAGS_use_mkldnn': False}) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/launch_function_helper.py b/python/paddle/fluid/tests/unittests/launch_function_helper.py index 13041827ffeabd3d6b79e4f34a67bd09624e54f6..046268444018799ca4d7f5530cbb6b1c707e062f 100644 --- a/python/paddle/fluid/tests/unittests/launch_function_helper.py +++ b/python/paddle/fluid/tests/unittests/launch_function_helper.py @@ -15,7 +15,8 @@ from multiprocessing import Pool, Process import os import socket from contextlib import closing -import psutil +import time +import sys def launch_func(func, env_dict): @@ -25,19 +26,36 @@ def launch_func(func, env_dict): return proc -def wait(procs, timeout=None): - # wait - decents = [] +def wait(procs, timeout=30): + error = False + begin = time.time() + while True: + alive = False + for p in procs: + p.join(timeout=10) + if p.exitcode is None: + alive = True + continue + elif p.exitcode != 0: + error = True + break + + if not alive: + break + + if error: + break + + if timeout is not None and time.time() - begin >= timeout: + error = True + break + for p in procs: - for child in psutil.Process(p.pid).children(recursive=True): - decents.append(child) - - gone, alive = psutil.wait_procs(decents, timeout=timeout) - for p in alive: - p.kill() - for p 
in gone: - if p.returncode != 0: - sys.exit(1) + if p.is_alive(): + p.terminate() + + if error: + sys.exit(1) def _find_free_port(port_set): diff --git a/python/paddle/fluid/tests/unittests/simnet_dataset_reader.py b/python/paddle/fluid/tests/unittests/simnet_dataset_reader.py new file mode 100644 index 0000000000000000000000000000000000000000..41eadc13a2ad26ac15b0623147dae5771f371a12 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/simnet_dataset_reader.py @@ -0,0 +1,33 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import logging +import tarfile + +import random + +import paddle +import paddle.fluid.incubate.data_generator as data_generator + +logging.basicConfig() +logger = logging.getLogger("paddle") +logger.setLevel(logging.INFO) + + +class DatasetSimnetReader(data_generator.MultiSlotDataGenerator): + def generate_sample(self, line): + pass diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index d4aafcd27a5aceb3c0b5fa9ddf8343d404bddbf5..14e83fccd655527d8f3012365e4757d23236a445 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -450,7 +450,7 @@ class TestAdamOpV2(unittest.TestCase): import paddle paddle.disable_static() - emb = paddle.nn.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) state_dict = adam.state_dict() @@ -504,6 +504,19 @@ class TestAdamOpV2(unittest.TestCase): shape=[1], value=lr, dtype='float32') adam.set_lr(lr_var) + def test_adam_op_invalid_input(self): + paddle.disable_static() + linear = paddle.nn.Linear(10, 10) + with self.assertRaises(ValueError): + adam = paddle.optimizer.Adam( + 0.1, beta1=-1, parameters=linear.parameters()) + with self.assertRaises(ValueError): + adam = paddle.optimizer.Adam( + 0.1, beta2=-1, parameters=linear.parameters()) + with self.assertRaises(ValueError): + adam = paddle.optimizer.Adam( + 0.1, epsilon=-1, parameters=linear.parameters()) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adamax_op.py b/python/paddle/fluid/tests/unittests/test_adamax_op.py index a6d1be7616c73019cd8f66dcf0c108cd58ec600b..8ce7656acfae77987b284e29cd85b35d264b20e2 100644 --- a/python/paddle/fluid/tests/unittests/test_adamax_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamax_op.py @@ -184,5 +184,21 @@ def adamax_step(inputs, attributes): return param_out, moment_out, inf_norm_out +class TestAdamaxOpV2(unittest.TestCase): + def test_adamax_op_invalid_input(self): + import paddle + paddle.disable_static() + linear = paddle.nn.Linear(10, 10) + with self.assertRaises(ValueError): + adam = paddle.optimizer.Adamax( + 0.1, beta1=-1, parameters=linear.parameters()) + with self.assertRaises(ValueError): + adam = paddle.optimizer.Adamax( + 0.1, beta2=-1, 
parameters=linear.parameters()) + with self.assertRaises(ValueError): + adam = paddle.optimizer.Adamax( + 0.1, epsilon=-1, parameters=linear.parameters()) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index 0a7cf54e2e0f15e51ba1b6f7526837f53c7cc2e0..cce24b57d2ca50e96e3ae0cf6d8912a8aea79a31 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -76,6 +76,19 @@ class TestAdamWOp(unittest.TestCase): rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss]) assert rets[0] is not None + def test_adamw_op_invalid_input(self): + paddle.disable_static() + linear = paddle.nn.Linear(10, 10) + with self.assertRaises(ValueError): + adam = paddle.optimizer.AdamW( + 0.1, beta1=-1, parameters=linear.parameters()) + with self.assertRaises(ValueError): + adam = paddle.optimizer.AdamW( + 0.1, beta2=-1, parameters=linear.parameters()) + with self.assertRaises(ValueError): + adam = paddle.optimizer.AdamW( + 0.1, epsilon=-1, parameters=linear.parameters()) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py index 7c1f9d802c31ac2c3b244541936ba25018e1487a..0fd9863948aedb64052e8fa0668f03600ae3197c 100644 --- a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py @@ -218,7 +218,7 @@ def create_test_case(op_type): self.assertTrue("test_arg_api" in result.name) def run_dygraph(self, place): - paddle.disable_static() + paddle.disable_static(place) op = eval("paddle.%s" % (op_type)) data_tensor = paddle.to_tensor(self.input_data) @@ -240,7 +240,7 @@ def create_test_case(op_type): #case 4 result_data = op(data_tensor, axis=-1, keepdim=True) excepted_data = self.numpy_op(self.input_data, axis=-1) - excepted_data = excepted_data.reshape((10)) + excepted_data = excepted_data.reshape((10, 1)) self.assertTrue((result_data.numpy() == excepted_data).all(), True) #case 5 @@ -299,14 +299,28 @@ class TestArgMinMaxOpError(unittest.TestCase): name="test_argmax", shape=[10], dtype="float32") output = paddle.argmax(x=data, dtype="float32") - self.assertRaises(ValueError, test_argmax_attr_type) + self.assertRaises(TypeError, test_argmax_attr_type) def test_argmin_attr_type(): data = paddle.static.data( name="test_argmax", shape=[10], dtype="float32") output = paddle.argmin(x=data, dtype="float32") - self.assertRaises(ValueError, test_argmin_attr_type) + self.assertRaises(TypeError, test_argmin_attr_type) + + def test_argmax_axis_type(): + data = paddle.static.data( + name="test_argmax", shape=[10], dtype="float32") + output = paddle.argmax(x=data, axis=1.2) + + self.assertRaises(TypeError, test_argmax_axis_type) + + def test_argmin_axis_type(): + data = paddle.static.data( + name="test_argmin", shape=[10], dtype="float32") + output = paddle.argmin(x=data, axis=1.2) + + self.assertRaises(TypeError, test_argmin_axis_type) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index 5c705378e515eec4c950f6996e2789df603fcda3..2af0b31d6fc26c59803f29dcdc54979491767dd2 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -43,6 +43,21 @@ 
class TestBatchNorm(unittest.TestCase): x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + def error1d_dataformat(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm1d = paddle.nn.BatchNorm1d(1, data_format='NCDHW') + batch_norm1d(fluid.dygraph.to_variable(x_data_4)) + + def error2d_dataformat(): + x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + batch_norm2d = paddle.nn.BatchNorm2d(1, data_format='NCDHW') + batch_norm2d(fluid.dygraph.to_variable(x_data_3)) + + def error3d_dataformat(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm3d = paddle.nn.BatchNorm3d(1, data_format='NCL') + batch_norm3d(fluid.dygraph.to_variable(x_data_4)) + def error1d(): x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') batch_norm1d = paddle.nn.BatchNorm1d(1) @@ -62,6 +77,9 @@ class TestBatchNorm(unittest.TestCase): self.assertRaises(ValueError, error1d) self.assertRaises(ValueError, error2d) self.assertRaises(ValueError, error3d) + self.assertRaises(ValueError, error1d_dataformat) + self.assertRaises(ValueError, error2d_dataformat) + self.assertRaises(ValueError, error3d_dataformat) def test_dygraph(self): places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_batch_sampler.py b/python/paddle/fluid/tests/unittests/test_batch_sampler.py index 6ec6fdb59f200ce1dc9b6418b7f11329f85ba5dd..4faef77dad40dd3a9c0a8e5cf1b0d4438c1b7a8a 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_sampler.py +++ b/python/paddle/fluid/tests/unittests/test_batch_sampler.py @@ -18,6 +18,7 @@ import unittest import paddle.fluid as fluid from paddle.io import BatchSampler, Dataset, Sampler, SequenceSampler, RandomSampler +from paddle.io import DistributedBatchSampler class RandomDataset(Dataset): @@ -194,5 +195,15 @@ class TestBatchSamplerWithSamplerShuffle(unittest.TestCase): pass +class TestDistributedBatchSamplerWithSampler(TestBatchSampler): + def init_batch_sampler(self): + dataset = RandomDataset(1000, 10) + bs = DistributedBatchSampler( + dataset=dataset, + batch_size=self.batch_size, + drop_last=self.drop_last) + return bs + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py index 74c01e1424885051faf3e263e6ca26c1269a838e..2e1f9d41747e3a99b4b4a0650a52973459b85c7b 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_op.py @@ -166,12 +166,16 @@ class TestClipAPI(unittest.TestCase): data_shape = [1, 9, 9, 4] data = np.random.random(data_shape).astype('float32') images = paddle.to_variable(data, dtype='float32') + v_min = paddle.to_variable(np.array([0.2], dtype=np.float32)) + v_max = paddle.to_variable(np.array([0.8], dtype=np.float32)) out_1 = paddle.clip(images, min=0.2, max=0.8) out_2 = paddle.clip(images, min=0.2, max=0.9) + out_3 = paddle.clip(images, min=v_min, max=v_max) self.assertTrue(np.allclose(out_1.numpy(), data.clip(0.2, 0.8))) self.assertTrue(np.allclose(out_2.numpy(), data.clip(0.2, 0.9))) + self.assertTrue(np.allclose(out_3.numpy(), data.clip(0.2, 0.8))) def test_errors(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py index b506f179143412e2bdb5d9eda511d90a0a3eea6d..e2336caac1c07f555280b82ba8fcfa7e5ec7f5b8 100644 --- 
a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py @@ -156,40 +156,5 @@ class TestDistCtrHalfAsync2x2(TestFleetBase): "dist_fleet_ctr.py", delta=1e-5, check_error_log=True) -class TestDistCtrPsGpuPyreaderAsync2x2(TestFleetBase): - def _setup_config(self): - self._mode = "async" - self._reader = "pyreader" - - def check_with_place(self, - model_file, - delta=1e-3, - check_error_log=False, - need_envs={}): - required_envs = { - "PATH": os.getenv("PATH", ""), - "PYTHONPATH": os.getenv("PYTHONPATH", ""), - "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), - "FLAGS_rpc_deadline": "30000", # 5sec to fail fast - "http_proxy": "", - "FLAGS_communicator_send_queue_size": "2", - "FLAGS_communicator_max_merge_var_num": "2", - "CPU_NUM": "2", - "SAVE_MODEL": "1" - } - - required_envs.update(need_envs) - - if check_error_log: - required_envs["GLOG_v"] = "3" - required_envs["GLOG_logtostderr"] = "1" - - tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) - - def test_dist_train(self): - self.check_with_place( - "dist_fleet_ctr_ps_gpu.py", delta=1e-5, check_error_log=True) - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py index 0fe7c386c1eeb751f34cf681778132310c304d51..7d18e935f58b6588adbef913c10d3ad497f07b53 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py @@ -21,7 +21,7 @@ import paddle.fluid.incubate.fleet.base.role_maker as role_maker from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory from test_dist_fleet_base import TestFleetBase -from dist_simnet_bow import train_network +from dist_fleet_simnet_bow import train_network class TestDistGeoCtr_2x2(TestFleetBase): @@ -72,7 +72,7 @@ class TestGeoSgdTranspiler(unittest.TestCase): strategy = StrategyFactory.create_geo_strategy(5) - avg_cost, _, _ = train_network(batch_size, is_distribute, is_sparse) + avg_cost, _, _, _ = train_network(batch_size, is_distribute, is_sparse) optimizer = fluid.optimizer.SGD(0.1) optimizer = fleet.distributed_optimizer(optimizer, strategy) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py index 46616f3dde486e61488d6852ca9efc37a066ab0b..3c68af474cf7cae96a9fa62688460f84123438f5 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py @@ -21,7 +21,7 @@ import paddle.fluid.incubate.fleet.base.role_maker as role_maker from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig from test_dist_fleet_base import TestFleetBase -from dist_simnet_bow import train_network +from dist_fleet_simnet_bow import train_network @unittest.skip(reason="Skip unstable ut, add it after PR 22957 merged") @@ -44,7 +44,7 @@ class TestDistGeoClipByGlobalNormTranspiler(unittest.TestCase): strategy.geo_sgd_mode = True strategy.geo_sgd_need_push_nums = 5 - avg_cost, _, _ = train_network(batch_size, is_distribute, is_sparse) + avg_cost, _, _, _ = train_network(batch_size, is_distribute, is_sparse) 
fluid.clip.set_gradient_clip( clip=fluid.clip.GradientClipByGlobalNorm(2.0)) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py new file mode 100644 index 0000000000000000000000000000000000000000..ec34993905e3cfc4603ac48987a690b7fa8a5439 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import unittest +import tempfile +from test_dist_fleet_base import TestFleetBase + + +class TestDistSimnetASync2x2(TestFleetBase): + def _setup_config(self): + self._mode = "async" + self._reader = "pyreader" + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "FLAGS_rpc_deadline": "5000", # 5sec to fail fast + "http_proxy": "", + "CPU_NUM": "2" + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + + tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) + + def test_dist_train(self): + self.check_with_place( + "dist_fleet_simnet_bow.py", delta=1e-5, check_error_log=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py deleted file mode 100644 index 3189f092413c1f6f1526a5ca66b27f91c95082b1..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from __future__ import print_function - -import os -import unittest - -from test_dist_base import TestDistBase - -import os -flag_name = os.path.splitext(__file__)[0] - - -class TestDistSimnetBowDense2x2(TestDistBase): - def _setup_config(self): - self._sync_mode = True - self._enforce_place = "CPU" - - def test_simnet_bow(self): - need_envs = { - "IS_DISTRIBUTED": '0', - "IS_SPARSE": '0', - 'IS_SELF_CONTAINED_LR': '1' - } - self.check_with_place( - "dist_simnet_bow.py", - delta=1e-5, - check_error_log=True, - need_envs=need_envs, - log_name=flag_name) - - -class TestDistSimnetBow2x2DenseAsync(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._enforce_place = "CPU" - - # FIXME(typhoonzero): fix async tests later - def notest_simnet_bow(self): - need_envs = { - "IS_DISTRIBUTED": '0', - "IS_SPARSE": '0', - 'IS_SELF_CONTAINED_LR': '1', - } - self.check_with_place( - "dist_simnet_bow.py", - delta=100, - check_error_log=True, - need_envs=need_envs, - log_name=flag_name) - - -class TestDistSimnetBowSparse2x2(TestDistBase): - def _setup_config(self): - self._sync_mode = True - self._enforce_place = "CPU" - - def test_simnet_bow(self): - need_envs = { - "IS_DISTRIBUTED": '0', - "IS_SPARSE": '1', - 'IS_SELF_CONTAINED_LR': '1' - } - self.check_with_place( - "dist_simnet_bow.py", - delta=1e-5, - check_error_log=True, - need_envs=need_envs, - log_name=flag_name) - - -class TestDistSimnetBow2x2SparseAsync(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._enforce_place = "CPU" - - def test_simnet_bow(self): - need_envs = { - "IS_DISTRIBUTED": '0', - "IS_SPARSE": '1', - 'IS_SELF_CONTAINED_LR': '1' - } - self.check_with_place( - "dist_simnet_bow.py", - delta=100, - check_error_log=True, - need_envs=need_envs, - log_name=flag_name) - - -# FIXME(tangwei): Learningrate variable is not created on pserver. 
-class TestDistSimnetBow2x2LookupTableSync(TestDistBase): - def _setup_config(self): - self._sync_mode = True - self._enforce_place = "CPU" - - def test_simnet_bow(self): - need_envs = { - "IS_DISTRIBUTED": '0', - "IS_SPARSE": '1', - 'IS_SELF_CONTAINED_LR': '1' - } - self.check_with_place( - "dist_simnet_bow.py", - delta=1e-5, - check_error_log=True, - need_envs=need_envs, - log_name=flag_name) - - -class TestDistSimnetBow2x2LookupTableAsync(TestDistBase): - def _setup_config(self): - self._sync_mode = False - self._enforce_place = "CPU" - - def test_simnet_bow(self): - need_envs = { - "IS_DISTRIBUTED": '0', - "IS_SPARSE": '1', - 'IS_SELF_CONTAINED_LR': '1' - } - self.check_with_place( - "dist_simnet_bow.py", - delta=100, - check_error_log=True, - need_envs=need_envs, - log_name=flag_name) - - -class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase): - def _setup_config(self): - self._sync_mode = True - self._enforce_place = "CPU" - - def test_simnet_bow(self): - need_envs = { - "IS_DISTRIBUTED": '0', - "IS_SPARSE": '1', - 'IS_SELF_CONTAINED_LR': '0' - } - self.check_with_place( - "dist_simnet_bow.py", - delta=1e-5, - check_error_log=True, - need_envs=need_envs, - log_name=flag_name) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index d18c8e25974441a6989b18a0fe13bac91251de9d..ceec1190279212fbe6f3f128bdd1397cdb9ea1a2 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -530,7 +530,7 @@ class TestDropout2dFAPIError(unittest.TestCase): self.assertRaises(ValueError, test_dataformat) -class TestDropout2DCAPI(unittest.TestCase): +class TestDropout2dCAPI(unittest.TestCase): def setUp(self): np.random.seed(123) self.places = [fluid.CPUPlace()] @@ -543,7 +543,7 @@ class TestDropout2DCAPI(unittest.TestCase): input_np = np.random.random([2, 3, 4, 5]).astype("float32") result_np = input_np input = fluid.dygraph.to_variable(input_np) - m = paddle.nn.Dropout2D(p=0.) + m = paddle.nn.Dropout2d(p=0.) m.eval() result = m(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -616,7 +616,7 @@ class TestDropout3dFAPIError(unittest.TestCase): self.assertRaises(ValueError, test_dataformat) -class TestDropout3DCAPI(unittest.TestCase): +class TestDropout3dCAPI(unittest.TestCase): def setUp(self): np.random.seed(123) self.places = [fluid.CPUPlace()] @@ -629,7 +629,7 @@ class TestDropout3DCAPI(unittest.TestCase): input_np = np.random.random([2, 3, 4, 5, 6]).astype("float32") result_np = input_np input = fluid.dygraph.to_variable(input_np) - m = paddle.nn.Dropout3D(p=0.) + m = paddle.nn.Dropout3d(p=0.) 
m.eval() result = m(input) self.assertTrue(np.allclose(result.numpy(), result_np)) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py index 9eec73116cc283b58d3ee39cefb9256e12d4ef15..927c155ff1116a821a13730a9d2a779a7c68b254 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py @@ -190,7 +190,7 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase): avg_cost = paddle.fluid.layers.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index 9ab84404073906a8a95f9eb562cbe220e7c6b455..fc668ce3493e96e0790af522a439367fe10455f3 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -22,6 +22,7 @@ import paddle.fluid.core as core from paddle.fluid.op import Operator from paddle.fluid.executor import Executor from op_test import OpTest +import paddle class TestGaussianRandomOp(OpTest): @@ -235,6 +236,56 @@ class TestGaussianRandomAPI(unittest.TestCase): self.assertAlmostEqual(np.mean(res_6), 0.0, delta=0.1) self.assertAlmostEqual(np.std(res_6), 1., delta=0.1) + def test_default_dtype(self): + paddle.disable_static() + + def test_default_fp_16(): + paddle.framework.set_default_dtype('float16') + paddle.tensor.random.gaussian_random([2, 3]) + + self.assertRaises(TypeError, test_default_fp_16) + + def test_default_fp_32(): + paddle.framework.set_default_dtype('float32') + out = paddle.tensor.random.gaussian_random([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) + + def test_default_fp_64(): + paddle.framework.set_default_dtype('float64') + out = paddle.tensor.random.gaussian_random([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) + + test_default_fp_64() + test_default_fp_32() + + paddle.enable_static() + + +class TestStandardNormalDtype(unittest.TestCase): + def test_default_dtype(self): + paddle.disable_static() + + def test_default_fp_16(): + paddle.framework.set_default_dtype('float16') + paddle.tensor.random.standard_normal([2, 3]) + + self.assertRaises(TypeError, test_default_fp_16) + + def test_default_fp_32(): + paddle.framework.set_default_dtype('float32') + out = paddle.tensor.random.standard_normal([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) + + def test_default_fp_64(): + paddle.framework.set_default_dtype('float64') + out = paddle.tensor.random.standard_normal([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) + + test_default_fp_64() + test_default_fp_32() + + paddle.enable_static() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_get_set_flags.py b/python/paddle/fluid/tests/unittests/test_get_set_flags.py index 2a5b8454e0350d8b3bf476214e060ca8b605ad89..e2761ff4358e3e5687bedb48aa374533e08bfb85 100644 --- a/python/paddle/fluid/tests/unittests/test_get_set_flags.py +++ b/python/paddle/fluid/tests/unittests/test_get_set_flags.py @@ -40,7 
+40,7 @@ class TestGetAndSetFlagsErrors(unittest.TestCase): def test_errors(self): flags_list = ['FLAGS_eager_delete_tensor_gb', 'FLAGS_check_nan_inf'] flag = 1 - flag_private = {'FLAGS_use_mkldnn': True} + flag_private = {'FLAGS_free_idle_chunk': True} # flags type of set_flags should be dict. def test_set_flags_input_type(): @@ -51,7 +51,7 @@ class TestGetAndSetFlagsErrors(unittest.TestCase): # flags in set_flags should be public flags. def test_set_private_flag(): - fluid.get_flags('FLAGS_use_mkldnn') + fluid.set_flags(flag_private) self.assertRaises(ValueError, test_set_private_flag) @@ -63,7 +63,7 @@ class TestGetAndSetFlagsErrors(unittest.TestCase): # flags in get_flags should be public flags. def test_get_private_flag(): - fluid.get_flags('FLAGS_use_mkldnn') + fluid.get_flags('FLAGS_free_idle_chunk') self.assertRaises(ValueError, test_get_private_flag) diff --git a/python/paddle/fluid/tests/unittests/test_global_var_getter_setter.py b/python/paddle/fluid/tests/unittests/test_global_var_getter_setter.py index 548b75831150012d5d1f2d2abfa55e6cf37c4983..3394a08de8b197c59745edeea8953fe3ec6a2488 100644 --- a/python/paddle/fluid/tests/unittests/test_global_var_getter_setter.py +++ b/python/paddle/fluid/tests/unittests/test_global_var_getter_setter.py @@ -26,7 +26,7 @@ class VarInfo(object): class TestGlobalVarGetterSetter(unittest.TestCase): def test_main(self): var_infos = [ - VarInfo("FLAGS_use_mkldnn", bool, False), + VarInfo("FLAGS_free_idle_chunk", bool, False), VarInfo("FLAGS_eager_delete_tensor_gb", float, True), ] diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py index 654e8d6f129e1ffe0dce59113ca88a16d348f210..a46b9b0ca78bf37e1c421a08a6fa8c5353c6d45d 100644 --- a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py @@ -35,24 +35,33 @@ class TestDygraphGroupNormv2(unittest.TestCase): def compute_v1(x): with fluid.dygraph.guard(p): - gn = fluid.dygraph.GroupNorm(channels=2, groups=2) + gn = fluid.dygraph.GroupNorm(channels=6, groups=2) y = gn(fluid.dygraph.to_variable(x)) return y.numpy() def compute_v2(x): with fluid.dygraph.guard(p): - gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2) + gn = paddle.nn.GroupNorm(num_channels=6, num_groups=2) y = gn(fluid.dygraph.to_variable(x)) return y.numpy() + def test_weight_bias_false(): + with fluid.dygraph.guard(p): + gn = paddle.nn.GroupNorm( + num_channels=6, + num_groups=2, + weight_attr=False, + bias_attr=False) + x = np.random.randn(*shape).astype("float32") y1 = compute_v1(x) y2 = compute_v2(x) self.assertTrue(np.allclose(y1, y2)) + test_weight_bias_false() def test_static(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"): + if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"): places.append(fluid.CUDAPlace(0)) for p in places: exe = fluid.Executor(p) @@ -60,7 +69,7 @@ class TestDygraphGroupNormv2(unittest.TestCase): def compute_v1(x_np): with program_guard(Program(), Program()): - gn = fluid.dygraph.GroupNorm(channels=2, groups=2) + gn = fluid.dygraph.GroupNorm(channels=6, groups=2) x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) y = gn(x) exe.run(fluid.default_startup_program()) @@ -69,7 +78,7 @@ class TestDygraphGroupNormv2(unittest.TestCase): def compute_v2(x_np): with program_guard(Program(), Program()): - gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2) + gn = 
paddle.nn.GroupNorm(num_channels=6, num_groups=2) x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) y = gn(x) exe.run(fluid.default_startup_program()) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index b74182d27ab8c89cc43d3fc1656ca13916d159c1..74cfeab601b04d9624a5f6e48fd06c6cbf3715f8 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -19,6 +19,7 @@ import numpy as np import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid import Linear +from paddle.fluid.layer_helper import LayerHelper from test_imperative_base import new_program_scope import paddle.fluid.dygraph_utils as dygraph_utils from paddle.fluid.dygraph.layer_object_helper import LayerObjectHelper @@ -636,6 +637,31 @@ class TestDygraphUtils(unittest.TestCase): res2 = fluid.layers.sigmoid(a) self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) + def test_append_activation_in_dygraph_use_mkldnn(self): + a_np = np.random.uniform(-2, 2, (10, 20, 30)).astype(np.float32) + helper = LayerHelper( + fluid.unique_name.generate("test"), act="relu", use_mkldnn=True) + func = helper.append_activation + with fluid.dygraph.guard(): + a = fluid.dygraph.to_variable(a_np) + res1 = func(a) + res2 = fluid.layers.relu(a) + self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) + + def test_append_activation_in_dygraph_global_use_mkldnn(self): + a_np = np.random.uniform(-2, 2, (10, 20, 30)).astype(np.float32) + helper = LayerHelper(fluid.unique_name.generate("test"), act="relu") + func = helper.append_activation + with fluid.dygraph.guard(): + a = fluid.dygraph.to_variable(a_np) + fluid.set_flags({'FLAGS_use_mkldnn': True}) + try: + res1 = func(a) + finally: + fluid.set_flags({'FLAGS_use_mkldnn': False}) + res2 = fluid.layers.relu(a) + self.assertTrue(np.array_equal(res1.numpy(), res2.numpy())) + def test_append_bias_in_dygraph_exception(self): with new_program_scope(): np_inp = np.random.random(size=(10, 20, 30)).astype(np.float32) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_decorator.py b/python/paddle/fluid/tests/unittests/test_imperative_decorator.py index 820206a3ce630eb92a36a154ca7cdec62de2ce34..13ca1840d0d24c73577a547f186d4f03b13bca28 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_decorator.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_decorator.py @@ -28,7 +28,7 @@ class TestTracerMode(unittest.TestCase): def get_tracer_mode(self): assert fluid.in_dygraph_mode(), "Dygraph mode must be enabled" - @paddle.no_grad() + @fluid.dygraph.no_grad def no_grad_func(self, a): self.assertEqual(self.tracer._train_mode, False) return a @@ -56,35 +56,17 @@ class TestTracerMode(unittest.TestCase): def need_no_grad_func(a, b=1): return a + b - decorated_func = paddle.no_grad()(need_no_grad_func) + decorated_func = fluid.dygraph.no_grad(need_no_grad_func) self.assertTrue( str(inspect.getargspec(decorated_func)) == str(inspect.getargspec(need_no_grad_func))) self.assertEqual(self.tracer._train_mode, self.init_mode) - def test_gen(): - for i in range(3): - yield i - - a = 0 - for i in test_gen(): - a += i - - @paddle.no_grad() - def test_wrapped_gen(): - for i in range(3): - yield i - - b = 0 - for i in test_wrapped_gen(): - b += i - - self.assertEqual(a, b) - with fluid.dygraph.guard(): self.check_not_support_rlt(False) + paddle.enable_static() with new_program_scope(): 
self.check_not_support_rlt(True) @@ -94,5 +76,48 @@ class TestTracerMode2(TestTracerMode): self.init_mode = False +class TestNoGradClass(unittest.TestCase): + @paddle.no_grad() + def no_grad_func(self, a): + self.assertEqual(self.tracer._train_mode, False) + return a + + def test_main(self): + paddle.disable_static() + + self.tracer = framework._dygraph_tracer() + self.tracer._train_mode = True + + self.assertEqual(self.no_grad_func(1), 1) + self.assertEqual(self.no_grad_func.__name__, "no_grad_func") + + def need_no_grad_func(a, b=1): + return a + b + + decorated_func = paddle.no_grad()(need_no_grad_func) + self.assertEqual( + str(inspect.getargspec(decorated_func)), + str(inspect.getargspec(need_no_grad_func))) + + def test_gen(): + for i in range(3): + yield i + + a = 0 + for i in test_gen(): + a += i + + @paddle.no_grad() + def test_wrapped_gen(): + for i in range(3): + yield i + + b = 0 + for i in test_wrapped_gen(): + b += i + + self.assertEqual(a, b) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py index 619e9e8e90783365b5f0d718783a14468520c8d4..887e50f07c55cc991d7816609253039ce0d48d7d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py @@ -401,9 +401,7 @@ class TestOptimizerLearningRate(unittest.TestCase): a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") linear = fluid.dygraph.nn.Linear(10, 10) - a = fluid.dygraph.to_variable(a) - b = linear(a) loss = fluid.layers.reduce_mean(b) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index e81d1c8610f6bebffadf930b67dc14a4a418ef05..3ccd1dbda3a443d50e43ba498cb3d5b529318c32 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -297,6 +297,7 @@ class TestDygraphPtbRnn(unittest.TestCase): paddle.save(self.state_dict, "./test_dy_v2") def testLoadAndSetVarBase(self): + self.setUp() seed = 90 hidden_size = 10 vocab_size = 1000 diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py index b02ba1a584b52dbbc99fcc8ed7bad438e7a9dd46..c45c144e3ad44c5781ea1f1d7d61028b56d8a254 100644 --- a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py @@ -48,7 +48,13 @@ class TestInstanceNorm(unittest.TestCase): instance_norm3d = paddle.nn.BatchNorm3d(1) instance_norm3d(fluid.dygraph.to_variable(x_data_4)) + def weight_bias_false(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + instance_norm3d = paddle.nn.BatchNorm3d( + 1, weight_attr=False, bias_attr=False) + with fluid.dygraph.guard(p): + weight_bias_false() self.assertRaises(ValueError, error1d) self.assertRaises(ValueError, error2d) self.assertRaises(ValueError, error3d) diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py index 98d8b7f9f88d2f8892bb2ac8190fbb3c9f19e047..44a653521a9c4878f6135c7f78f4e779c929e7d3 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py @@ -59,7 +59,7 @@ class 
TestLookupTableOpWithTensorIds(OpTest): def setUp(self): self.op_type = "lookup_table_v2" table = np.random.random((17, 31)).astype("float64") - ids = np.random.randint(low=0, high=17, size=(2, 4, 5)).astype("int64") + ids = np.random.randint(low=0, high=17, size=(2, 4, 5)).astype("int32") self.inputs = {'W': table, 'Ids': ids} self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))} @@ -100,7 +100,7 @@ class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds): class TestLookupTableWIsSelectedRows(unittest.TestCase): def prepare_ids(self, scope, place): ids_tensor = scope.var('Ids').get_tensor() - ids_array = np.array([0, 4, 3, 5]).astype("int64") + ids_array = np.array([0, 4, 3, 5]).astype("int32") ids_tensor.set(ids_array, place) return ids_array diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py new file mode 100644 index 0000000000000000000000000000000000000000..e0edf9019356f38eb3c74b9cadfa6ae575e9b823 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py @@ -0,0 +1,36 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + + +class EmbeddingDygraph(unittest.TestCase): + def test_1(self): + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + # example 1 + inp_word = np.array([[2, 3, 5], [4, 2, 1]]).astype('int64') + inp_word.shape # [2, 3] + dict_size = 20 + + emb = nn.Embedding(dict_size, 32, weight_attr='emb.w', sparse=False) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py new file mode 100644 index 0000000000000000000000000000000000000000..c9c91ceb39de42c44f9ce81658aa79b896999552 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py @@ -0,0 +1,82 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid as fluid +import paddle.nn.functional as functional + + +class EmbeddingStatic(unittest.TestCase): + def test_1(self): + prog = fluid.Program() + with fluid.program_guard(prog): + + def test_bad_x(): + initializer = fluid.initializer.NumpyArrayInitializer( + np.random.random(size=(128, 100))) + + param_attr = fluid.ParamAttr( + name="emb_weight", + learning_rate=0.5, + initializer=initializer, + trainable=True) + + weight = prog.global_block().create_parameter( + (128, 100), attr=param_attr, dtype="float32") + + label = fluid.layers.data( + name="label", + shape=[4], + append_batch_size=False, + dtype="int64") + + emb = functional.embedding( + x=label, weight=weight, sparse=True, name="embedding") + + test_bad_x() + + def test_2(self): + prog = fluid.Program() + with fluid.program_guard(prog): + + def test_bad_x(): + initializer = fluid.initializer.NumpyArrayInitializer( + np.random.random(size=(128, 100))) + + param_attr = fluid.ParamAttr( + name="emb_weight", + learning_rate=0.5, + initializer=initializer, + trainable=True) + + weight = prog.global_block().create_parameter( + (128, 100), attr=param_attr, dtype="float32") + + label = fluid.layers.data( + name="label", + shape=[4], + append_batch_size=False, + dtype="int32") + + emb = functional.embedding( + x=label, weight=weight, sparse=True, name="embedding") + + test_bad_x() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_rand_op.py b/python/paddle/fluid/tests/unittests/test_rand_op.py index c8e0130b77dc661d190f568ac501c9986a81f5e4..1eceeaadfec651ade5031ddc7e6a012244050e84 100644 --- a/python/paddle/fluid/tests/unittests/test_rand_op.py +++ b/python/paddle/fluid/tests/unittests/test_rand_op.py @@ -21,6 +21,7 @@ import paddle.fluid.core as core from paddle import rand import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard +import paddle class TestRandOpError(unittest.TestCase): @@ -115,5 +116,31 @@ class TestRandOpForDygraph(unittest.TestCase): self.run_net(True) +class TestRandDtype(unittest.TestCase): + def test_default_dtype(self): + paddle.disable_static() + + def test_default_fp_16(): + paddle.framework.set_default_dtype('float16') + paddle.tensor.random.rand([2, 3]) + + self.assertRaises(TypeError, test_default_fp_16) + + def test_default_fp_32(): + paddle.framework.set_default_dtype('float32') + out = paddle.tensor.random.rand([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) + + def test_default_fp_64(): + paddle.framework.set_default_dtype('float64') + out = paddle.tensor.random.rand([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) + + test_default_fp_64() + test_default_fp_32() + + paddle.enable_static() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py index f7b9d4214d36a422a3ec94dc410e58c6c827ef4c..ddac7f6b98b19d204d20ccdff75c6d4fcae50d4d 100644 --- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py +++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py @@ -276,6 +276,19 @@ class TestRMSPropV2(unittest.TestCase): learning_rate=0.1, momentum=None) + def test_rmsprop_op_invalid_input(self): + paddle.disable_static() + linear = paddle.nn.Linear(10, 10) + with self.assertRaises(ValueError): + adam = paddle.optimizer.RMSProp( + 0.1, epsilon=-1, 
parameters=linear.parameters()) + with self.assertRaises(ValueError): + adam = paddle.optimizer.RMSProp( + 0.1, momentum=-1, parameters=linear.parameters()) + with self.assertRaises(ValueError): + adam = paddle.optimizer.RMSProp( + 0.1, rho=-1, parameters=linear.parameters()) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index 04d5cc941a4636da0352fe9221cdad8bdfcd2bd9..a37fad9cf0ca0772fc4939e6d59e27f6fc7efac1 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -153,16 +153,103 @@ class TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp): return [2, 3, 4, 5] +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxCUDNNOp3(TestSoftmaxCUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + def get_axis(self): + return 0 + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxCUDNNOp4(TestSoftmaxCUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + def get_axis(self): + return 1 + + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestSoftmaxCUDNNOp5(TestSoftmaxCUDNNOp): def get_x_shape(self): return [2, 3, 4, 5] + def get_axis(self): + return 2 + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxCUDNNOp6(TestSoftmaxCUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + def get_axis(self): return 3 +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxCUDNNOp7(TestSoftmaxCUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5, 6] + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxCUDNNOp8(TestSoftmaxCUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5, 6] + + def get_axis(self): + return 0 + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxCUDNNOp9(TestSoftmaxCUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5, 6] + + def get_axis(self): + return 1 + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxCUDNNOp10(TestSoftmaxCUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5, 6] + + def get_axis(self): + return 2 + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxCUDNNOp11(TestSoftmaxCUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5, 6] + + def get_axis(self): + return 3 + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxCUDNNOp12(TestSoftmaxCUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5, 6] + + def get_axis(self): + return 4 + + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestSoftmaxFP16Op(TestSoftmaxOp): diff --git a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py index ca92bc75245cebbfdfbbed80e99957d2b4f57b2a..171d3788d830dfb570616ccbae29f45e10b1172c 100644 --- a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py +++ b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py @@ -30,15 +30,9 @@ from paddle.fluid.dygraph import parallel_helper # 
executed in the python3 sub-process. +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestInitParallelEnv(unittest.TestCase): - def test_beckend_type_error(self): - with self.assertRaises(TypeError): - dist.init_parallel_env(backend=1) - - def test_backend_value_error(self): - with self.assertRaises(ValueError): - dist.init_parallel_env(backend="mpi") - def test_check_env_failed(self): os.environ['FLAGS_selected_gpus'] = '0' os.environ['PADDLE_TRAINER_ID'] = '0' diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index a04aaaef0d41b9f991889586b489269b6ede5b42..56dc27a9a5b136829ce410b50998e23b77510665 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -536,5 +536,31 @@ class TestUniformDygraphMode(unittest.TestCase): self.assertTrue((x_np[i] > 0 and x_np[i] < 1.0)) +class TestUniformDtype(unittest.TestCase): + def test_default_dtype(self): + paddle.disable_static() + + def test_default_fp_16(): + paddle.framework.set_default_dtype('float16') + paddle.tensor.random.uniform([2, 3]) + + self.assertRaises(TypeError, test_default_fp_16) + + def test_default_fp_32(): + paddle.framework.set_default_dtype('float32') + out = paddle.tensor.random.uniform([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) + + def test_default_fp_64(): + paddle.framework.set_default_dtype('float64') + out = paddle.tensor.random.uniform([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) + + test_default_fp_64() + test_default_fp_32() + + paddle.enable_static() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index b2975283fbef010029b935b9b209411f09bdb5fd..af788874191335ad31d1540bcc0db90cc12383c6 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -20,8 +20,8 @@ __all__ = [ ] __all__ += [ - 'grad', 'LayerList', 'load', 'save', 'prepare_context', 'to_variable', - 'no_grad', 'ParallelEnv', 'DataParallel' + 'grad', 'LayerList', 'load', 'save', 'to_variable', 'no_grad', + 'DataParallel' ] __all__ += [ diff --git a/python/paddle/incubate/hapi/__init__.py b/python/paddle/hapi/__init__.py similarity index 69% rename from python/paddle/incubate/hapi/__init__.py rename to python/paddle/hapi/__init__.py index c0361fa33246ff3315a107c520972ca6bebc8168..87f5a82525cdfa36e48d40c6d12488d359fe99db 100644 --- a/python/paddle/incubate/hapi/__init__.py +++ b/python/paddle/hapi/__init__.py @@ -13,34 +13,15 @@ # limitations under the License. from . import logger -from . import progressbar from . import callbacks -from . import download from . import model from .model import * -from . import datasets -from . import distributed -from . import vision -from . import text -from . import utils - -from . 
import device -from .device import * - from .dygraph_layer_patch import monkey_patch_layer logger.setup_logger() -__all__ = [ - 'callbacks', - 'datasets', - 'distributed', - 'download', - 'vision', - 'text', - 'utils', -] + model.__all__ + device.__all__ +__all__ = ['callbacks'] + model.__all__ monkey_patch_layer() diff --git a/python/paddle/incubate/hapi/callbacks.py b/python/paddle/hapi/callbacks.py similarity index 88% rename from python/paddle/incubate/hapi/callbacks.py rename to python/paddle/hapi/callbacks.py index 0804708210a9749813e195a8b5579b339986acd6..7ed571fa9c6a4a962b20397c999368dad0734ff0 100644 --- a/python/paddle/incubate/hapi/callbacks.py +++ b/python/paddle/hapi/callbacks.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os + from paddle.fluid.dygraph.parallel import ParallelEnv from .progressbar import ProgressBar @@ -117,10 +119,10 @@ class Callback(object): .. code-block:: python - from paddle.incubate.hapi.callbacks import Callback + import paddle # build a simple model checkpoint callback - class ModelCheckpoint(Callback): + class ModelCheckpoint(paddle.callbacks.Callback): def __init__(self, save_freq=1, save_dir=None): self.save_freq = save_freq self.save_dir = save_dir @@ -147,12 +149,12 @@ class Callback(object): - 'verbose': an integer. Verbose mode is 0, 1 or 2. 0 = silent, 1 = progress bar, 2 = one line per epoch. - 'metrics': a list of str. Names of metrics, including 'loss' - and the names of hapi.Metric. + and the names of paddle.metric.Metric. """ self.params = params def set_model(self, model): - """model is instance of hapi.Model. + """model is instance of paddle.Model. """ self.model = model @@ -168,7 +170,7 @@ class Callback(object): Args: logs (dict): The logs is a dict or None. The keys of logs - passed by hapi.Model contains 'loss', metric names and + passed by paddle.Model contains 'loss', metric names and `batch_size`. """ @@ -177,10 +179,10 @@ class Callback(object): Args: logs (dict): The logs is a dict or None. The keys of logs - passed by hapi.Model contains 'steps' and 'metrics', + passed by paddle.Model contains 'steps' and 'metrics', The `steps` is number of total steps of validation dataset. The `metrics` is a list of str including 'loss' and the names - of hapi.Metric. + of paddle.metric.Metric. """ def on_eval_end(self, logs=None): @@ -188,7 +190,7 @@ class Callback(object): Args: logs (dict): The logs is a dict or None. The `logs` passed by - hapi.Model is a dict contains 'loss', metrics and 'batch_size' + paddle.Model is a dict contains 'loss', metrics and 'batch_size' of last batch of validation dataset. """ @@ -212,7 +214,7 @@ class Callback(object): Args: epoch (int): The index of epoch. logs (dict): The logs is a dict or None. The `logs` passed by - hapi.Model is None. + paddle.Model is None. """ def on_epoch_end(self, epoch, logs=None): @@ -221,7 +223,7 @@ class Callback(object): Args: epoch (int): The index of epoch. logs (dict): The logs is a dict or None. The `logs` passed by - hapi.Model is a dict, contains 'loss', metrics and 'batch_size' + paddle.Model is a dict, contains 'loss', metrics and 'batch_size' of last batch. """ @@ -231,7 +233,7 @@ class Callback(object): Args: step (int): The index of step (or iteration). logs (dict): The logs is a dict or None. The `logs` passed by - hapi.Model is empty. + paddle.Model is empty. 
""" def on_train_batch_end(self, step, logs=None): @@ -240,7 +242,7 @@ class Callback(object): Args: step (int): The index of step (or iteration). logs (dict): The logs is a dict or None. The `logs` passed by - hapi.Model is a dict, contains 'loss', metrics and 'batch_size' + paddle.Model is a dict, contains 'loss', metrics and 'batch_size' of current batch. """ @@ -250,7 +252,7 @@ class Callback(object): Args: step (int): The index of step (or iteration). logs (dict): The logs is a dict or None. The `logs` passed by - hapi.Model is empty. + paddle.Model is empty. """ def on_eval_batch_end(self, step, logs=None): @@ -259,7 +261,7 @@ class Callback(object): Args: step (int): The index of step (or iteration). logs (dict): The logs is a dict or None. The `logs` passed by - hapi.Model is a dict, contains 'loss', metrics and 'batch_size' + paddle.Model is a dict, contains 'loss', metrics and 'batch_size' of current batch. """ @@ -292,23 +294,22 @@ class ProgBarLogger(Callback): .. code-block:: python import paddle - import paddle.fluid as fluid - import paddle.incubate.hapi as hapi + from paddle.static import InputSpec - inputs = [hapi.Input([-1, 1, 28, 28], 'float32', 'image')] - labels = [hapi.Input([None, 1], 'int64', 'label')] + inputs = [InputSpec([-1, 1, 28, 28], 'float32', 'image')] + labels = [InputSpec([None, 1], 'int64', 'label')] - train_dataset = hapi.datasets.MNIST(mode='train') + train_dataset = paddle.vision.datasets.MNIST(mode='train') - model = hapi.Model(hapi.vision.LeNet(classifier_activation=None), + model = paddle.Model(paddle.vision.LeNet(classifier_activation=None), inputs, labels) - optim = fluid.optimizer.Adam(0.001) + optim = paddle.optimizer.Adam(0.001) model.prepare(optimizer=optim, loss=paddle.nn.CrossEntropyLoss(), metrics=paddle.metric.Accuracy()) - callback = hapi.callbacks.ProgBarLogger(log_freq=10) + callback = paddle.callbacks.ProgBarLogger(log_freq=10) model.fit(train_dataset, batch_size=64, callbacks=callback) """ @@ -428,23 +429,22 @@ class ModelCheckpoint(Callback): .. 
code-block:: python import paddle - import paddle.fluid as fluid - import paddle.incubate.hapi as hapi + from paddle.static import InputSpec - inputs = [hapi.Input([-1, 1, 28, 28], 'float32', 'image')] - labels = [hapi.Input([None, 1], 'int64', 'label')] + inputs = [InputSpec([-1, 1, 28, 28], 'float32', 'image')] + labels = [InputSpec([None, 1], 'int64', 'label')] - train_dataset = hapi.datasets.MNIST(mode='train') + train_dataset = paddle.vision.datasets.MNIST(mode='train') - model = hapi.Model(hapi.vision.LeNet(classifier_activation=None), + model = paddle.Model(paddle.vision.LeNet(classifier_activation=None), inputs, labels) - optim = fluid.optimizer.Adam(0.001) + optim = paddle.optimizer.Adam(0.001) model.prepare(optimizer=optim, loss=paddle.nn.CrossEntropyLoss(), metrics=paddle.metric.Accuracy()) - callback = hapi.callbacks.ModelCheckpoint(save_dir='./temp') + callback = paddle.callbacks.ModelCheckpoint(save_dir='./temp') model.fit(train_dataset, batch_size=64, callbacks=callback) """ @@ -461,11 +461,11 @@ class ModelCheckpoint(Callback): def on_epoch_end(self, epoch, logs=None): if self._is_save() and self.epoch % self.save_freq == 0: path = '{}/{}'.format(self.save_dir, epoch) - print('save checkpoint at {}'.format(path)) + print('save checkpoint at {}'.format(os.path.abspath(path))) self.model.save(path) def on_train_end(self, logs=None): if self._is_save(): path = '{}/final'.format(self.save_dir) - print('save checkpoint at {}'.format(path)) + print('save checkpoint at {}'.format(os.path.abspath(path))) self.model.save(path) diff --git a/python/paddle/incubate/hapi/dygraph_layer_patch.py b/python/paddle/hapi/dygraph_layer_patch.py similarity index 98% rename from python/paddle/incubate/hapi/dygraph_layer_patch.py rename to python/paddle/hapi/dygraph_layer_patch.py index cb3cc10a84dd9347bf4b781031bedb5836dfbd4c..e3a2948b69305fcb08c14c850f5738ac46aea2be 100644 --- a/python/paddle/incubate/hapi/dygraph_layer_patch.py +++ b/python/paddle/hapi/dygraph_layer_patch.py @@ -16,8 +16,7 @@ import warnings import paddle.fluid as fluid from paddle.fluid.framework import in_dygraph_mode - -from .device import _get_device +from paddle.fluid.framework import _current_expected_place as _get_device def monkey_patch_layer(): diff --git a/python/paddle/incubate/hapi/logger.py b/python/paddle/hapi/logger.py similarity index 100% rename from python/paddle/incubate/hapi/logger.py rename to python/paddle/hapi/logger.py diff --git a/python/paddle/incubate/hapi/model.py b/python/paddle/hapi/model.py similarity index 86% rename from python/paddle/incubate/hapi/model.py rename to python/paddle/hapi/model.py index b52354d4ccf4671b0d372bae63a1befbe383e053..bba94d56cca8e77735d8921d007248b2e388a5f6 100644 --- a/python/paddle/incubate/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -22,37 +22,182 @@ import pickle import numpy as np import six import warnings +import time +import socket +import contextlib from collections import Iterable import paddle from paddle import fluid from paddle.fluid import core from paddle.fluid.framework import in_dygraph_mode, Variable, ParamBase, _current_expected_place -# Note: Use alias `Input` temporarily before releasing hapi feature. 
-from paddle.static import InputSpec as Input +from paddle.fluid.framework import in_dygraph_mode, Variable +from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.executor import global_scope from paddle.fluid.io import is_belong_to_optimizer from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, FunctionSpec from paddle.fluid.layers.utils import flatten +from paddle.fluid.layers import collective from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy from paddle.fluid.incubate.fleet.base import role_maker -from paddle.fluid.executor import scope_guard, Executor -from paddle.io import DataLoader, Dataset +from paddle.io import DataLoader, Dataset, DistributedBatchSampler +from paddle.fluid.executor import scope_guard, Executor from paddle.fluid.dygraph.layers import Layer from paddle.metric import Metric -from .distributed import DistributedBatchSampler, _all_gather, prepare_distributed_context, _parallel_context_initialized +from paddle.static import InputSpec as Input + from .callbacks import config_callbacks -from .utils import to_list, to_numpy, flatten_list, restore_flatten_list, extract_args -from .device import _get_device -__all__ = [ - 'Model', - 'Input', -] +__all__ = ['Model', ] + +_parallel_context_initialized = False + + +def to_list(value): + if value is None: + return value + if isinstance(value, (list, tuple)): + return list(value) + return [value] + + +def to_numpy(var): + assert isinstance(var, (Variable, fluid.core.VarBase)), "not a variable" + if isinstance(var, fluid.core.VarBase): + return var.numpy() + t = global_scope().find_var(var.name).get_tensor() + return np.array(t) + + +def flatten_list(l): + assert isinstance(l, list), "not a list" + outl = [] + splits = [] + for sl in l: + assert isinstance(sl, list), "sub content not a list" + splits.append(len(sl)) + outl += sl + return outl, splits + + +def restore_flatten_list(l, splits): + outl = [] + for split in splits: + assert len(l) >= split, "list length invalid" + sl, l = l[:split], l[split:] + outl.append(sl) + return outl + + +def extract_args(func): + if hasattr(inspect, 'getfullargspec'): + return inspect.getfullargspec(func)[0] + else: + return inspect.getargspec(func)[0] + + +def _all_gather(x, nranks, ring_id=0, use_calc_stream=True): + return collective._c_allgather( + x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream) + + +def wait_server_ready(endpoints): + assert not isinstance(endpoints, six.string_types) + while True: + all_ok = True + not_ready_endpoints = [] + for ep in endpoints: + ip_port = ep.split(":") + with contextlib.closing( + socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + sock.settimeout(2) + result = sock.connect_ex((ip_port[0], int(ip_port[1]))) + if result != 0: + all_ok = False + not_ready_endpoints.append(ep) + if not all_ok: + time.sleep(3) + else: + break + + +def init_communicator(program, rank, nranks, wait_port, current_endpoint, + endpoints): + if nranks < 2: + return + other_endpoints = endpoints[:] + other_endpoints.remove(current_endpoint) + if rank == 0 and wait_port: + wait_server_ready(other_endpoints) + block = program.global_block() + nccl_id_var = block.create_var( + name=fluid.unique_name.generate('nccl_id'), + persistable=True, + type=fluid.core.VarDesc.VarType.RAW) + + block.append_op( + type='c_gen_nccl_id', + inputs={}, + outputs={'Out': 
nccl_id_var}, + attrs={ + 'rank': rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints + }) + + block.append_op( + type='c_comm_init', + inputs={'X': nccl_id_var}, + outputs={}, + attrs={ + 'nranks': nranks, + 'rank': rank, + 'ring_id': 0, + }) + + +def prepare_distributed_context(place=None): + if place is None: + place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \ + else fluid.CUDAPlace(0) + + strategy = fluid.dygraph.parallel.ParallelStrategy() + strategy.nranks = ParallelEnv().nranks + strategy.local_rank = ParallelEnv().local_rank + strategy.trainer_endpoints = ParallelEnv().trainer_endpoints + strategy.current_endpoint = ParallelEnv().current_endpoint + + if strategy.nranks < 2: + return + + global _parallel_context_initialized + + if not _parallel_context_initialized and isinstance(place, fluid.CUDAPlace): + + def _init_context(): + communicator_prog = fluid.Program() + init_communicator(communicator_prog, strategy.local_rank, + strategy.nranks, True, strategy.current_endpoint, + strategy.trainer_endpoints) + exe = fluid.Executor(place) + exe.run(communicator_prog) + + if fluid.in_dygraph_mode(): + fluid.disable_dygraph() + _init_context() + fluid.enable_dygraph(place) + else: + _init_context() + + else: + assert ("Only support CUDAPlace for now.") + + _parallel_context_initialized = True + return strategy class StaticGraphAdapter(object): @@ -586,8 +731,8 @@ class DynamicGraphAdapter(object): if not self.model._optimizer or not optim_state: return - # If optimizer performs set_dict when state vars haven't been created, - # which would happen when set_dict before minimize, the state would be + # If optimizer performs set_state_dict when state vars haven't been created, + # which would happen when set_state_dict before minimize, the state would be # stored in optimizer._accumulators_holder and loaded lazily. # To contrive this when loading from static-graph saved states, extend # state dict to include keys named accoring to dygraph naming rules. @@ -631,7 +776,13 @@ class DynamicGraphAdapter(object): accum_name + "_0") converted_state[dy_state_name] = state_var - self.model._optimizer.set_dict(converted_state) + if not hasattr(self.model._optimizer, 'set_state_dict'): + warnings.warn( + "paddle.fluid.optimizer is deprecated in API 2.0, please use paddle.optimizer instead" + ) + self.model._optimizer.set_dict(converted_state) + else: + self.model._optimizer.set_state_dict(converted_state) class Model(object): @@ -640,55 +791,51 @@ class Model(object): Dynamic graph and static graph are supported at the same time, switched by `paddle.disable_static()`. The usage is as follows. But note, the switching between dynamic and static should be before - instantiating a Model. The input description, i.e, hapi.Input, + instantiating a Model. The input description, i.e, paddle.static.InputSpec, must be required for static graph. Args: network (paddle.nn.Layer): The network is an instance of paddle.nn.Layer. - inputs (Input|list|dict|None): `inputs`, entry points of network, - could be a Input layer, or lits of Input layers, - or dict (name: Input), or None. For static graph, + inputs (InputSpec|list|dict|None): `inputs`, entry points of network, + could be a InputSpec instance, or lits of InputSpec instances, + or dict ({name: InputSpec}), or None. For static graph, inputs must be set. For dynamic graph, it could be None. - labels (Input|list|None): `labels`, entry points of network, - could be a Input layer or lits of Input layers, or None. 
- For static graph, if labels is required in loss, + labels (InputSpec|list|None): `labels`, entry points of network, + could be a InputSpec instnace or lits of InputSpec instances, + or None. For static graph, if labels is required in loss, labels must be set. Otherwise, it could be None. - Usage: + Examples: .. code-block:: python import paddle - import paddle.incubate.hapi as hapi - - class MyNet(paddle.nn.Layer): - def __init__(self, classifier_act=None): - super(MyNet, self).__init__() - self._fc1 = paddle.nn.Linear(784, 200, act=classifier_act) - - def forward(self, x): - y = self._fc1(x) - return y - - device = hapi.set_device('gpu') + import paddle.nn as nn + from paddle.static import InputSpec + + device = paddle.set_device('cpu') # or 'gpu' # if use static graph, do not set paddle.disable_static(device) - + + net = nn.Sequential( + nn.Linear(784, 200), + nn.Tanh(), + nn.Linear(200, 10)) + # inputs and labels are not required for dynamic graph. - input = hapi.Input([None, 784], 'float32', 'x') - label = hapi.Input([None, 1], 'int64', 'label') + input = InputSpec([None, 784], 'float32', 'x') + label = InputSpec([None, 1], 'int64', 'label') - model = hapi.Model(MyNet(), input, label) + model = paddle.Model(net, input, label) optim = paddle.optimizer.SGD(learning_rate=1e-3, - parameter_list=model.parameters()) + parameters=model.parameters()) model.prepare(optim, paddle.nn.CrossEntropyLoss(), paddle.metric.Accuracy()) - mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False) - model.fit(mnist_data, epochs=2, batch_size=32, verbose=1) - + data = paddle.vision.datasets.MNIST(mode='train', chw_format=False) + model.fit(data, epochs=2, batch_size=32, verbose=1) """ def __init__(self, network, inputs=None, labels=None): @@ -736,25 +883,22 @@ class Model(object): import numpy as np import paddle - import paddle.incubate.hapi as hapi - - class MyNet(paddle.nn.Layer): - def __init__(self, classifier_act=None): - super(MyNet, self).__init__() - self._fc = paddle.nn.Linear(784, 10, act=classifier_act) - - def forward(self, x): - y = self._fc(x) - return y + import paddle.nn as nn + from paddle.static import InputSpec - device = hapi.set_device('gpu') + device = paddle.set_device('cpu') # or 'gpu' paddle.disable_static(device) - input = hapi.Input([None, 784], 'float32', 'x') - label = hapi.Input([None, 1], 'int64', 'label') - model = hapi.Model(MyNet(), input, label) + net = nn.Sequential( + nn.Linear(784, 200), + nn.Tanh(), + nn.Linear(200, 10)) + + input = InputSpec([None, 784], 'float32', 'x') + label = InputSpec([None, 1], 'int64', 'label') + model = paddle.Model(net, input, label) optim = paddle.optimizer.SGD(learning_rate=1e-3, - parameter_list=model.parameters()) + parameters=model.parameters()) model.prepare(optim, paddle.nn.CrossEntropyLoss()) data = np.random.random(size=(4,784)).astype(np.float32) label = np.random.randint(0, 10, size=(4, 1)).astype(np.int64) @@ -784,25 +928,22 @@ class Model(object): import numpy as np import paddle - import paddle.incubate.hapi as hapi - - class MyNet(paddle.nn.Layer): - def __init__(self, classifier_act=None): - super(MyNet, self).__init__() - self._fc = paddle.nn.Linear(784, 10, act=classifier_act) + import paddle.nn as nn + from paddle.static import InputSpec - def forward(self, x): - y = self._fc(x) - return y - - device = hapi.set_device('gpu') + device = paddle.set_device('cpu') # or 'gpu' paddle.disable_static(device) - input = hapi.Input([None, 784], 'float32', 'x') - label = hapi.Input([None, 1], 'int64', 'label') - model = 
hapi.Model(MyNet(), input, label) + net = nn.Sequential( + nn.Linear(784, 200), + nn.Tanh(), + nn.Linear(200, 10)) + + input = InputSpec([None, 784], 'float32', 'x') + label = InputSpec([None, 1], 'int64', 'label') + model = paddle.Model(net, input, label) optim = paddle.optimizer.SGD(learning_rate=1e-3, - parameter_list=model.parameters()) + parameters=model.parameters()) model.prepare(optim, paddle.nn.CrossEntropyLoss()) data = np.random.random(size=(4,784)).astype(np.float32) @@ -830,20 +971,18 @@ class Model(object): import numpy as np import paddle - import paddle.incubate.hapi as hapi + import paddle.nn as nn - class MyNet(paddle.nn.Layer): - def __init__(self): - super(MyNet, self).__init__() - self._fc = paddle.nn.Linear(784, 1, act='softmax') - def forward(self, x): - y = self._fc(x) - return y - - device = hapi.set_device('gpu') + device = paddle.set_device('cpu') # or 'gpu' paddle.disable_static(device) - model = hapi.Model(MyNet()) + net = nn.Sequential( + nn.Linear(784, 200), + nn.Tanh(), + nn.Linear(200, 10), + nn.Softmax()) + + model = paddle.Model(net) model.prepare() data = np.random.random(size=(4,784)).astype(np.float32) out = model.test_batch([data]) @@ -884,38 +1023,40 @@ class Model(object): Examples: .. code-block:: python + import paddle - import paddle.incubate.hapi as hapi - from paddle.nn import Linear - from paddle.incubate.hapi.datasets.mnist import MNIST as MnistDataset + import paddle.nn as nn + from paddle.static import InputSpec - class Mnist(paddle.nn.Layer): + class Mnist(nn.Layer): def __init__(self): super(Mnist, self).__init__() - self._fc = Linear(784, 10, act='softmax') + self.net = nn.Sequential( + nn.Linear(784, 200), + nn.Tanh(), + nn.Linear(200, 10), + nn.Softmax()) # If save for inference in dygraph, need this @paddle.jit.to_static def forward(self, x): - y = self._fc(x) - return y + return self.net(x) dynamic = True # False - device = hapi.set_device('cpu') + device = paddle.set_device('cpu') # if use static graph, do not set paddle.disable_static(device) if dynamic else None # inputs and labels are not required for dynamic graph. - input = hapi.Input([None, 784], 'float32', 'x') - label = hapi.Input([None, 1], 'int64', 'label') - model = hapi.Model(Mnist(), input, label) + input = InputSpec([None, 784], 'float32', 'x') + label = InputSpec([None, 1], 'int64', 'label') + model = paddle.Model(Mnist(), input, label) optim = paddle.optimizer.SGD(learning_rate=1e-3, - parameter_list=model.parameters()) + parameters=model.parameters()) model.prepare(optim, paddle.nn.CrossEntropyLoss()) - mnist_data = hapi.datasets.MNIST(mode='train', chw_format=False) - model.fit(mnist_data, epochs=1, batch_size=32, verbose=0) + data = paddle.vision.datasets.MNIST(mode='train', chw_format=False) + model.fit(data, epochs=1, batch_size=32, verbose=0) model.save('checkpoint/test') # save for training model.save('inference_model', False) # save for inference - """ if ParallelEnv().local_rank == 0: @@ -958,19 +1099,17 @@ class Model(object): .. 
code-block:: python import paddle - import paddle.incubate.hapi as hapi + import paddle.nn as nn - class MyNet(paddle.nn.Layer): - def __init__(self): - super(MyNet, self).__init__() - self._fc = paddle.nn.Linear(784, 1, act='softmax') - def forward(self, x): - y = self._fc(x) - return y - - device = hapi.set_device('cpu') + device = paddle.set_device('cpu') paddle.disable_static(device) - model = hapi.Model(MyNet()) + + model = paddle.Model(nn.Sequential( + nn.Linear(784, 200), + nn.Tanh(), + nn.Linear(200, 10), + nn.Softmax())) + model.save('checkpoint/test') model.load('checkpoint/test') """ @@ -1033,18 +1172,14 @@ class Model(object): .. code-block:: python import paddle - from paddle.incubate.hapi import Model - - class MyNet(paddle.nn.Layer): - def __init__(self): - super(MyNet, self).__init__() - self._fc = paddle.nn.Linear(20, 10, act='softmax') - def forward(self, x): - y = self._fc(x) - return y + import paddle.nn as nn paddle.disable_static() - model = Model(MyNet()) + + model = paddle.Model(nn.Sequential( + nn.Linear(784, 200), + nn.Tanh(), + nn.Linear(200, 10))) params = model.parameters() """ return self._adapter.parameters() @@ -1173,19 +1308,20 @@ class Model(object): .. code-block:: python import paddle - import paddle.incubate.hapi as hapi + from paddle.static import InputSpec dynamic = True - device = hapi.set_device('gpu') + device = paddle.set_device('cpu') # or 'gpu' paddle.disable_static(device) if dynamic else None - train_dataset = hapi.datasets.MNIST(mode='train') - val_dataset = hapi.datasets.MNIST(mode='test') + train_dataset = paddle.vision.datasets.MNIST(mode='train') + val_dataset = paddle.vision.datasets.MNIST(mode='test') - input = hapi.Input([None, 1, 28, 28], 'float32', 'image') - label = hapi.Input([None, 1], 'int64', 'label') + input = InputSpec([None, 1, 28, 28], 'float32', 'image') + label = InputSpec([None, 1], 'int64', 'label') - model = hapi.Model(hapi.vision.LeNet(classifier_activation=None), + model = paddle.Model( + paddle.vision.models.LeNet(classifier_activation=None), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) @@ -1205,24 +1341,24 @@ class Model(object): .. code-block:: python import paddle - import paddle.incubate.hapi as hapi + from paddle.static import InputSpec dynamic = True - device = hapi.set_device('gpu') + device = paddle.set_device('cpu') # or 'gpu' paddle.disable_static(device) if dynamic else None - train_dataset = hapi.datasets.MNIST(mode='train') + train_dataset = paddle.vision.datasets.MNIST(mode='train') train_loader = paddle.io.DataLoader(train_dataset, places=device, batch_size=64) - val_dataset = hapi.datasets.MNIST(mode='test') + val_dataset = paddle.vision.datasets.MNIST(mode='test') val_loader = paddle.io.DataLoader(val_dataset, places=device, batch_size=64) - input = hapi.Input([None, 1, 28, 28], 'float32', 'image') - label = hapi.Input([None, 1], 'int64', 'label') + input = InputSpec([None, 1, 28, 28], 'float32', 'image') + label = InputSpec([None, 1], 'int64', 'label') - model = hapi.Model(hapi.vision.LeNet(classifier_activation=None), - input, label) + model = paddle.Model( + paddle.vision.models.LeNet(classifier_activation=None), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) model.prepare( @@ -1341,22 +1477,21 @@ class Model(object): .. 
code-block:: python import paddle - import paddle.incubate.hapi as hapi + from paddle.static import InputSpec # declarative mode - val_dataset = hapi.datasets.MNIST(mode='test') + val_dataset = paddle.vision.datasets.MNIST(mode='test') - input = hapi.Input([-1, 1, 28, 28], 'float32', 'image') - label = hapi.Input([None, 1], 'int64', 'label') - model = hapi.Model(hapi.vision.LeNet(), input, label) + input = InputSpec([-1, 1, 28, 28], 'float32', 'image') + label = InputSpec([None, 1], 'int64', 'label') + model = paddle.Model(paddle.vision.models.LeNet(), input, label) model.prepare(metrics=paddle.metric.Accuracy()) - result = model.evaluate(val_dataset, batch_size=64) print(result) # imperative mode paddle.disable_static() - model = hapi.Model(hapi.vision.LeNet()) + model = paddle.Model(paddle.vision.models.LeNet()) model.prepare(metrics=paddle.metric.Accuracy()) result = model.evaluate(val_dataset, batch_size=64) print(result) @@ -1435,9 +1570,9 @@ class Model(object): import numpy as np import paddle - import paddle.incubate.hapi as hapi + from paddle.static import InputSpec - class MnistDataset(hapi.datasets.MNIST): + class MnistDataset(paddle.vision.datasets.MNIST): def __init__(self, mode, return_label=True): super(MnistDataset, self).__init__(mode=mode) self.return_label = return_label @@ -1454,17 +1589,17 @@ class Model(object): test_dataset = MnistDataset(mode='test', return_label=False) # declarative mode - input = hapi.Input([-1, 1, 28, 28], 'float32', 'image') - model = hapi.Model(hapi.vision.LeNet(), input) + input = InputSpec([-1, 1, 28, 28], 'float32', 'image') + model = paddle.Model(paddle.vision.models.LeNet(), input) model.prepare() result = model.predict(test_dataset, batch_size=64) print(len(result[0]), result[0][0].shape) # imperative mode - device = hapi.set_device('cpu') + device = paddle.set_device('cpu') paddle.disable_static(device) - model = hapi.Model(hapi.vision.LeNet()) + model = paddle.Model(paddle.vision.models.LeNet()) model.prepare() result = model.predict(test_dataset, batch_size=64) print(len(result[0]), result[0][0].shape) diff --git a/python/paddle/incubate/hapi/progressbar.py b/python/paddle/hapi/progressbar.py similarity index 98% rename from python/paddle/incubate/hapi/progressbar.py rename to python/paddle/hapi/progressbar.py index 2487fcbde8744fa7cc186e16b0653f03629d0366..c36e875ccb7d594e9cf2ccfe0654551ccbd66afc 100644 --- a/python/paddle/incubate/hapi/progressbar.py +++ b/python/paddle/hapi/progressbar.py @@ -66,6 +66,7 @@ class ProgressBar(object): return terminal_size(80, 24) terminal_width, _ = get_terminal_size() + terminal_width = terminal_width if terminal_width > 0 else 80 max_width = min(int(terminal_width * 0.6), terminal_width - 50) return max_width diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index aee695d419550c066fdb3eb6333780fedfe29429..2af9255971e65236bc6c73b90d2fcd6d14d7679a 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -12,10 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . 
import hapi - __all__ = [] -__all__ += hapi.__all__ __all__ += ["reader"] from ..fluid.contrib import reader diff --git a/python/paddle/incubate/hapi/device.py b/python/paddle/incubate/hapi/device.py deleted file mode 100644 index 3ff29822f6f45b7fb977b5888e7d26e293df5761..0000000000000000000000000000000000000000 --- a/python/paddle/incubate/hapi/device.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import six - -import paddle.fluid as fluid -from paddle.fluid.dygraph.parallel import ParallelEnv - -__all__ = ['set_device', ] - -# TODO(qingqing01): remove or refine _global_device, set_device and get_device -# after core framework supporting these function. -_global_device = None - - -def set_device(device): - """ - Args: - device (str): specify device type, 'cpu' or 'gpu'. - - Returns: - fluid.CUDAPlace or fluid.CPUPlace: Created GPU or CPU place. - - Examples: - .. code-block:: python - - import paddle.incubate.hapi as hapi - - input = hapi.set_device('gpu') - """ - - assert isinstance(device, six.string_types) and device.lower() in ['cpu', 'gpu'], \ - "Expected device in ['cpu', 'gpu'], but got {}".format(device) - - device = fluid.CUDAPlace(ParallelEnv().dev_id) \ - if device.lower() == 'gpu' and fluid.is_compiled_with_cuda() \ - else fluid.CPUPlace() - - global _global_device - _global_device = device - return device - - -def _get_device(): - """ - Return global device. - """ - if _global_device is not None: - device = _global_device - else: - if fluid.is_compiled_with_cuda(): - device = fluid.CUDAPlace(ParallelEnv().dev_id) - else: - device = fluid.CPUPlace() - return device diff --git a/python/paddle/incubate/hapi/distributed.py b/python/paddle/incubate/hapi/distributed.py deleted file mode 100644 index 0e38dc8edc758e9c1b8a96add1df242fb0aecef1..0000000000000000000000000000000000000000 --- a/python/paddle/incubate/hapi/distributed.py +++ /dev/null @@ -1,279 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
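
The paddle.incubate.hapi.device module removed above (set_device/_get_device) is superseded by paddle.set_device, which the updated docstring examples in this patch already rely on. A minimal sketch of the replacement call; the Sequential model is only an illustration, not part of the patch:

    import paddle
    import paddle.nn as nn

    device = paddle.set_device('cpu')   # or 'gpu' when CUDA is available
    paddle.disable_static(device)       # run in dygraph mode on the chosen device
    model = paddle.Model(nn.Sequential(nn.Linear(784, 10)))
    params = model.parameters()
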
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import six -import time -import math -import socket -import contextlib -import numpy as np - -from paddle import fluid -from paddle.fluid.layers import collective -from paddle.fluid.dygraph.parallel import ParallelEnv, ParallelStrategy -from paddle.io import BatchSampler - -_parallel_context_initialized = False - -__all__ = ['DistributedBatchSampler'] - - -class DistributedBatchSampler(BatchSampler): - """Sampler that restricts data loading to a subset of the dataset. - - In such case, each process can pass a DistributedBatchSampler instance - as a DataLoader sampler, and load a subset of the original dataset that - is exclusive to it. - - .. note:: - Dataset is assumed to be of constant size. - - Args: - dataset(paddle.io.Dataset): this could be a `paddle.io.Dataset` implement - or other python object which implemented - `__len__` for BatchSampler to get sample - number of data source. - batch_size(int): sample indice number in a mini-batch indices. - num_replicas(int, optional): porcess number in distributed training. - If :attr:`num_replicas` is None, :attr:`num_replicas` will be - retrieved from :code:`paddle.fluid.dygraph.parallel.ParallenEnv`. - Default None. - rank(int, optional): the rank of the current process among :attr:`num_replicas` - processes. If :attr:`rank` is None, :attr:`rank` is retrieved from - :code:`paddle.fluid.dygraph.parallel.ParallenEnv`. Default None. - shuffle(bool): whther to shuffle indices order before genrating - batch indices. Default False. - drop_last(bool): whether drop the last incomplete batch dataset size - is not divisible by the batch size. Default False - - Examples: - .. code-block:: python - - import numpy as np - - from paddle.incubate.hapi.datasets import MNIST - from paddle.incubate.hapi.distributed import DistributedBatchSampler - - class MnistDataset(MNIST): - def __init__(self, mode, return_label=True): - super(MnistDataset, self).__init__(mode=mode) - self.return_label = return_label - - def __getitem__(self, idx): - img = np.reshape(self.images[idx], [1, 28, 28]) - if self.return_label: - return img, np.array(self.labels[idx]).astype('int64') - return img, - - def __len__(self): - return len(self.images) - - train_dataset = MnistDataset(mode='train') - dist_train_dataloader = DistributedBatchSampler(train_dataset, batch_size=64) - - for data in dist_train_dataloader: - # do something - break - """ - - def __init__(self, - dataset, - batch_size, - num_replicas=None, - rank=None, - shuffle=False, - drop_last=False): - self.dataset = dataset - - assert isinstance(batch_size, int) and batch_size > 0, \ - "batch_size should be a positive integer" - self.batch_size = batch_size - assert isinstance(shuffle, bool), \ - "shuffle should be a boolean value" - self.shuffle = shuffle - assert isinstance(drop_last, bool), \ - "drop_last should be a boolean number" - - if num_replicas is not None: - assert isinstance(num_replicas, int) and num_replicas > 0, \ - "num_replicas should be a positive integer" - self.nranks = num_replicas - else: - self.nranks = ParallelEnv().nranks - - if rank is not None: - assert isinstance(rank, int) and rank >= 0, \ - "rank should be a non-negative integer" - self.local_rank = rank - else: - self.local_rank = ParallelEnv().local_rank - - self.drop_last = drop_last - self.epoch = 0 - self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks)) - self.total_size = self.num_samples * self.nranks 
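
The sampler being deleted from paddle.incubate.hapi.distributed here is re-exported from paddle.io later in this patch, so only the import path changes for users. A minimal sketch of the new path, with a hypothetical RandomDataset defined purely for illustration:

    import numpy as np
    from paddle.io import Dataset, DistributedBatchSampler

    class RandomDataset(Dataset):
        """Hypothetical dataset used only to exercise the sampler."""
        def __init__(self, n=256):
            self.n = n
        def __getitem__(self, idx):
            image = np.random.random([784]).astype('float32')
            label = np.array([idx % 10]).astype('int64')
            return image, label
        def __len__(self):
            return self.n

    sampler = DistributedBatchSampler(RandomDataset(), batch_size=64, shuffle=True)
    for batch_indices in sampler:
        # each iteration yields the list of sample indices assigned to this rank
        break
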
- - def __iter__(self): - num_samples = len(self.dataset) - indices = np.arange(num_samples).tolist() - indices += indices[:(self.total_size - len(indices))] - assert len(indices) == self.total_size - if self.shuffle: - np.random.RandomState(self.epoch).shuffle(indices) - self.epoch += 1 - - # subsample - def _get_indices_by_batch_size(indices): - subsampled_indices = [] - last_batch_size = self.total_size % (self.batch_size * self.nranks) - assert last_batch_size % self.nranks == 0 - last_local_batch_size = last_batch_size // self.nranks - - for i in range(self.local_rank * self.batch_size, - len(indices) - last_batch_size, - self.batch_size * self.nranks): - subsampled_indices.extend(indices[i:i + self.batch_size]) - - indices = indices[len(indices) - last_batch_size:] - subsampled_indices.extend(indices[ - self.local_rank * last_local_batch_size:( - self.local_rank + 1) * last_local_batch_size]) - return subsampled_indices - - if self.nranks > 1: - indices = _get_indices_by_batch_size(indices) - - assert len(indices) == self.num_samples - _sample_iter = iter(indices) - - batch_indices = [] - for idx in _sample_iter: - batch_indices.append(idx) - if len(batch_indices) == self.batch_size: - yield batch_indices - batch_indices = [] - if not self.drop_last and len(batch_indices) > 0: - yield batch_indices - - def __len__(self): - num_samples = self.num_samples - num_samples += int(not self.drop_last) * (self.batch_size - 1) - return num_samples // self.batch_size - - def set_epoch(self, epoch): - self.epoch = epoch - - -def _all_gather(x, nranks, ring_id=0, use_calc_stream=True): - return collective._c_allgather( - x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream) - - -def wait_server_ready(endpoints): - assert not isinstance(endpoints, six.string_types) - while True: - all_ok = True - not_ready_endpoints = [] - for ep in endpoints: - ip_port = ep.split(":") - with contextlib.closing( - socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: - sock.settimeout(2) - result = sock.connect_ex((ip_port[0], int(ip_port[1]))) - if result != 0: - all_ok = False - not_ready_endpoints.append(ep) - if not all_ok: - time.sleep(3) - else: - break - - -def init_communicator(program, rank, nranks, wait_port, current_endpoint, - endpoints): - if nranks < 2: - return - other_endpoints = endpoints[:] - other_endpoints.remove(current_endpoint) - if rank == 0 and wait_port: - wait_server_ready(other_endpoints) - block = program.global_block() - nccl_id_var = block.create_var( - name=fluid.unique_name.generate('nccl_id'), - persistable=True, - type=fluid.core.VarDesc.VarType.RAW) - - block.append_op( - type='c_gen_nccl_id', - inputs={}, - outputs={'Out': nccl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints - }) - - block.append_op( - type='c_comm_init', - inputs={'X': nccl_id_var}, - outputs={}, - attrs={ - 'nranks': nranks, - 'rank': rank, - 'ring_id': 0, - }) - - -def prepare_distributed_context(place=None): - if place is None: - place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \ - else fluid.CUDAPlace(0) - - strategy = ParallelStrategy() - strategy.nranks = ParallelEnv().nranks - strategy.local_rank = ParallelEnv().local_rank - strategy.trainer_endpoints = ParallelEnv().trainer_endpoints - strategy.current_endpoint = ParallelEnv().current_endpoint - - if strategy.nranks < 2: - return - - global _parallel_context_initialized - - if not _parallel_context_initialized and isinstance(place, fluid.CUDAPlace): - - def 
_init_context(): - communicator_prog = fluid.Program() - init_communicator(communicator_prog, strategy.local_rank, - strategy.nranks, True, strategy.current_endpoint, - strategy.trainer_endpoints) - exe = fluid.Executor(place) - exe.run(communicator_prog) - - if fluid.in_dygraph_mode(): - fluid.disable_dygraph() - _init_context() - fluid.enable_dygraph(place) - else: - _init_context() - - else: - assert ("Only support CUDAPlace for now.") - - _parallel_context_initialized = True - return strategy diff --git a/python/paddle/incubate/hapi/tests/CMakeLists.txt b/python/paddle/incubate/hapi/tests/CMakeLists.txt deleted file mode 100644 index 8ffcd67443f1c8722560da20d9cfb76b18a67351..0000000000000000000000000000000000000000 --- a/python/paddle/incubate/hapi/tests/CMakeLists.txt +++ /dev/null @@ -1,46 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py") -string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}") - - -foreach(TEST_OP ${DIST_TEST_OPS}) - list(REMOVE_ITEM TEST_OPS ${TEST_OP}) -endforeach() - -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() -set_tests_properties(test_dataset_imdb PROPERTIES TIMEOUT 150) - - -function(py_dist_test TARGET_NAME) - if(WITH_TESTING) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS ARGS ENVS) - cmake_parse_arguments(py_dist_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - if(WITH_COVERAGE AND WITH_GPU AND WITH_NCCL AND NOT WIN32) - add_test(NAME ${TARGET_NAME} - COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true - FLAGS_cpu_deterministic=true NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 - PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_dist_test_ENVS} - COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data - ${PYTHON_EXECUTABLE} -u ${py_dist_test_SRCS} ${py_dist_test_ARGS} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - # No unit test should exceed 10 minutes. - set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=DIST") - endif() - - - endif() -endfunction() - - - -foreach(src ${DIST_TEST_OPS}) - message(STATUS ${src}) - py_dist_test(${src} SRCS ${src}.py) -endforeach() diff --git a/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py b/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py deleted file mode 100644 index 6df9b31217aae78c43de8d29956a8b2def99055b..0000000000000000000000000000000000000000 --- a/python/paddle/incubate/hapi/tests/test_uncombined_weight2state_dict.py +++ /dev/null @@ -1,124 +0,0 @@ -# copyright (c) 2020 paddlepaddle authors. all rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
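
In the __iter__ of the sampler removed above, shuffling is seeded with np.random.RandomState(self.epoch), so every rank draws the same permutation as long as the epoch counter agrees; set_epoch makes that explicit per epoch. A small self-contained sketch of that pattern, assuming paddle.io.DistributedBatchSampler keeps the same set_epoch behaviour; TinyDataset is a placeholder:

    import numpy as np
    from paddle.io import Dataset, DistributedBatchSampler

    class TinyDataset(Dataset):
        """Hypothetical ten-sample dataset, for illustration only."""
        def __getitem__(self, idx):
            return np.array([idx]).astype('int64')
        def __len__(self):
            return 10

    sampler = DistributedBatchSampler(TinyDataset(), batch_size=4, shuffle=True)
    for epoch in range(3):
        sampler.set_epoch(epoch)   # same seed on every rank -> same permutation
        order = [i for batch in sampler for i in batch]
        print(epoch, order)
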
- -from __future__ import division -from __future__ import print_function - -import unittest - -import numpy as np -import shutil -import tempfile - -from paddle import fluid -from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax - -from paddle.incubate.hapi.utils import uncombined_weight_to_state_dict - - -class LeNetDygraph(fluid.dygraph.Layer): - def __init__(self, num_classes=10, classifier_activation='softmax'): - super(LeNetDygraph, self).__init__() - self.num_classes = num_classes - self.features = Sequential( - Conv2d( - 1, 6, 3, stride=1, padding=1), - ReLU(), - Pool2D(2, 'max', 2), - Conv2d( - 6, 16, 5, stride=1, padding=0), - ReLU(), - Pool2D(2, 'max', 2)) - - if num_classes > 0: - self.fc = Sequential( - Linear(400, 120), Linear(120, 84), Linear(84, 10), - Softmax()) #Todo: accept any activation - - def forward(self, inputs): - x = self.features(inputs) - - if self.num_classes > 0: - x = fluid.layers.flatten(x, 1) - x = self.fc(x) - return x - - -class TestUncombinedWeight2StateDict(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.save_dir = tempfile.mkdtemp() - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.save_dir) - - def test_infer(self): - start_prog = fluid.Program() - train_prog = fluid.Program() - - x = fluid.data(name='x', shape=[None, 1, 28, 28], dtype='float32') - - with fluid.program_guard(train_prog, start_prog): - with fluid.unique_name.guard(): - x = fluid.data( - name='x', shape=[None, 1, 28, 28], dtype='float32') - model = LeNetDygraph() - output = model.forward(x) - - excutor = fluid.Executor() - excutor.run(start_prog) - - test_prog = train_prog.clone(for_test=True) - - fluid.io.save_params(excutor, self.save_dir, test_prog) - - rand_x = np.random.rand(1, 1, 28, 28).astype('float32') - out = excutor.run(program=test_prog, - feed={'x': rand_x}, - fetch_list=[output.name], - return_numpy=True) - - state_dict = uncombined_weight_to_state_dict(self.save_dir) - - key2key_dict = { - 'features.0.weight': 'conv2d_0.w_0', - 'features.0.bias': 'conv2d_0.b_0', - 'features.3.weight': 'conv2d_1.w_0', - 'features.3.bias': 'conv2d_1.b_0', - 'fc.0.weight': 'linear_0.w_0', - 'fc.0.bias': 'linear_0.b_0', - 'fc.1.weight': 'linear_1.w_0', - 'fc.1.bias': 'linear_1.b_0', - 'fc.2.weight': 'linear_2.w_0', - 'fc.2.bias': 'linear_2.b_0' - } - - fluid.enable_imperative() - dygraph_model = LeNetDygraph() - - converted_state_dict = dygraph_model.state_dict() - for k1, k2 in key2key_dict.items(): - converted_state_dict[k1] = state_dict[k2] - - dygraph_model.set_dict(converted_state_dict) - - dygraph_model.eval() - dy_out = dygraph_model(fluid.dygraph.to_variable(rand_x)) - - np.testing.assert_allclose(dy_out.numpy(), out[0], atol=1e-5) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/incubate/hapi/utils.py b/python/paddle/incubate/hapi/utils.py deleted file mode 100644 index d9708f2927912870218f41103df5b0f94609cd88..0000000000000000000000000000000000000000 --- a/python/paddle/incubate/hapi/utils.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import inspect -import numpy as np - -from collections import OrderedDict -from paddle import fluid -from paddle.fluid.framework import Variable -from paddle.fluid.executor import global_scope - -__all__ = ['uncombined_weight_to_state_dict'] - - -def uncombined_weight_to_state_dict(weight_dir): - """ - Convert uncombined weight which getted by using `fluid.io.save_params` or `fluid.io.save_persistables` to state_dict - - Args: - weight_dir (str): weight direcotory path. - - Returns: - OrderDict: weight dict. - - Examples: - .. code-block:: python - - import os - - from paddle import fluid - from paddle.nn import Conv2D, Pool2D, Linear, ReLU, Sequential - from paddle.incubate.hapi.utils import uncombined_weight_to_state_dict - - - class LeNetDygraph(fluid.dygraph.Layer): - def __init__(self, num_classes=10, classifier_activation='softmax'): - super(LeNetDygraph, self).__init__() - self.num_classes = num_classes - self.features = Sequential( - Conv2D( - 1, 6, 3, stride=1, padding=1), - ReLU(), - Pool2D(2, 'max', 2), - Conv2D( - 6, 16, 5, stride=1, padding=0), - ReLU(), - Pool2D(2, 'max', 2)) - - if num_classes > 0: - self.fc = Sequential( - Linear(400, 120), - Linear(120, 84), - Linear( - 84, 10, act=classifier_activation)) - - def forward(self, inputs): - x = self.features(inputs) - - if self.num_classes > 0: - x = fluid.layers.flatten(x, 1) - x = self.fc(x) - return x - - # save weight use fluid.io.save_params - save_dir = 'temp' - if not os.path.exists(save_dir): - os.makedirs(save_dir) - - start_prog = fluid.Program() - train_prog = fluid.Program() - - x = fluid.data(name='x', shape=[None, 1, 28, 28], dtype='float32') - - with fluid.program_guard(train_prog, start_prog): - with fluid.unique_name.guard(): - x = fluid.data( - name='x', shape=[None, 1, 28, 28], dtype='float32') - model = LeNetDygraph() - output = model.forward(x) - - excutor = fluid.Executor() - excutor.run(start_prog) - - test_prog = train_prog.clone(for_test=True) - - fluid.io.save_params(excutor, save_dir, test_prog) - - # convert uncombined weight to state dict - state_dict = uncombined_weight_to_state_dict(save_dir) - - key2key_dict = { - 'features.0.weight': 'conv2d_0.w_0', - 'features.0.bias': 'conv2d_0.b_0', - 'features.3.weight': 'conv2d_1.w_0', - 'features.3.bias': 'conv2d_1.b_0', - 'fc.0.weight': 'linear_0.w_0', - 'fc.0.bias': 'linear_0.b_0', - 'fc.1.weight': 'linear_1.w_0', - 'fc.1.bias': 'linear_1.b_0', - 'fc.2.weight': 'linear_2.w_0', - 'fc.2.bias': 'linear_2.b_0' - } - - fluid.enable_imperative() - dygraph_model = LeNetDygraph() - - converted_state_dict = dygraph_model.state_dict() - for k1, k2 in key2key_dict.items(): - converted_state_dict[k1] = state_dict[k2] - - # dygraph model load state dict which converted from uncombined weight - dygraph_model.set_dict(converted_state_dict) - """ - - def _get_all_params_name(dir): - params_name = [] - dir = os.path.expanduser(dir) - - dir_len = len(dir) - for root, _, fnames in sorted(os.walk(dir, followlinks=True)): - for fname in sorted(fnames): - path = os.path.join(root[dir_len:], fname) - params_name.append(path) - - return params_name - - 
class Load(fluid.dygraph.Layer): - def __init__(self): - super(Load, self).__init__() - - def forward(self, filename): - weight = self.create_parameter( - shape=[1], - dtype='float32', - default_initializer=fluid.initializer.ConstantInitializer(0.0)) - self._helper.append_op( - type='load', - inputs={}, - outputs={'Out': [weight]}, - attrs={'file_path': filename}) - return weight - - params_name_list = _get_all_params_name(weight_dir) - if not fluid.in_dygraph_mode(): - dygraph_enabled = False - fluid.enable_imperative() - else: - dygraph_enabled = True - - load = Load() - state_dict = OrderedDict() - - for param_name in params_name_list: - param_path = os.path.join(weight_dir, param_name) - weight = load(param_path) - try: - weight = weight.numpy() - except Exception as e: - print(e) - - state_dict[param_name] = weight - - if not dygraph_enabled: - fluid.disable_imperative() - - return state_dict - - -def to_list(value): - if value is None: - return value - if isinstance(value, (list, tuple)): - return list(value) - return [value] - - -def to_numpy(var): - assert isinstance(var, (Variable, fluid.core.VarBase)), "not a variable" - if isinstance(var, fluid.core.VarBase): - return var.numpy() - t = global_scope().find_var(var.name).get_tensor() - return np.array(t) - - -def flatten_list(l): - assert isinstance(l, list), "not a list" - outl = [] - splits = [] - for sl in l: - assert isinstance(sl, list), "sub content not a list" - splits.append(len(sl)) - outl += sl - return outl, splits - - -def restore_flatten_list(l, splits): - outl = [] - for split in splits: - assert len(l) >= split, "list length invalid" - sl, l = l[:split], l[split:] - outl.append(sl) - return outl - - -def extract_args(func): - if hasattr(inspect, 'getfullargspec'): - return inspect.getfullargspec(func)[0] - else: - return inspect.getargspec(func)[0] diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py index 78f792d6a5a6698034912297f5d5a23db0b35201..b67779cb2a2ae699c8206dc717670bf6eb23b25e 100644 --- a/python/paddle/io/__init__.py +++ b/python/paddle/io/__init__.py @@ -18,6 +18,7 @@ __all__ = [ 'IterableDataset', 'TensorDataset', 'BatchSampler', + 'DistributedBatchSampler', # 'Transform', 'DataLoader', 'get_worker_info', @@ -43,7 +44,7 @@ __all__ = [ from ..fluid.io import DataLoader from ..fluid.dataloader import Dataset, IterableDataset, BatchSampler, get_worker_info, \ - TensorDataset, Sampler, SequenceSampler, RandomSampler + TensorDataset, Sampler, SequenceSampler, RandomSampler, DistributedBatchSampler from ..fluid.io import load, save, load_program_state, set_program_state, \ load_inference_model, save_inference_model, batch from ..reader import shuffle, buffered, cache, chain, firstn, compose, map_readers, xmap_readers diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index 110a62c300559b9037cd2ca735aebd1946ba0ce9..1cd65171ff034e8b834c38184e4452796da985ca 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -202,12 +202,11 @@ class Accuracy(Metric): .. 
code-block:: python import paddle - import paddle.incubate.hapi as hapi paddle.disable_static() - train_dataset = hapi.datasets.MNIST(mode='train') + train_dataset = paddle.vision.datasets.MNIST(mode='train') - model = hapi.Model(hapi.vision.LeNet(classifier_activation=None)) + model = paddle.Model(paddle.vision.LeNet(classifier_activation=None)) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) model.prepare( @@ -336,7 +335,6 @@ class Precision(Metric): import paddle import paddle.nn as nn - import paddle.incubate.hapi as hapi class Data(paddle.io.Dataset): def __init__(self): @@ -352,7 +350,7 @@ class Precision(Metric): return self.n paddle.disable_static() - model = hapi.Model(nn.Sequential( + model = paddle.Model(nn.Sequential( nn.Linear(10, 1), nn.Sigmoid() )) @@ -471,7 +469,6 @@ class Recall(Metric): import paddle import paddle.nn as nn - import paddle.incubate.hapi as hapi class Data(paddle.io.Dataset): def __init__(self): @@ -487,7 +484,7 @@ class Recall(Metric): return self.n paddle.disable_static() - model = hapi.Model(nn.Sequential( + model = paddle.Model(nn.Sequential( nn.Linear(10, 1), nn.Sigmoid() )) @@ -617,7 +614,6 @@ class Auc(Metric): import numpy as np import paddle import paddle.nn as nn - import paddle.incubate.hapi as hapi class Data(paddle.io.Dataset): def __init__(self): @@ -633,9 +629,9 @@ class Auc(Metric): return self.n paddle.disable_static() - model = hapi.Model(nn.Sequential( - nn.Linear(10, 2, act='softmax'), - )) + model = paddle.Model(nn.Sequential( + nn.Linear(10, 2), nn.Softmax()) + ) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 5cc9f6d32f9d7ef3dafd73badd0ea88bed372968..66caba540f2fed8c035d0f1af14f9e40a329bca5 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -94,8 +94,8 @@ from .layer.common import UpsamplingNearest2d #DEFINE_ALIAS from .layer.common import UpsamplingBilinear2d #DEFINE_ALIAS from .layer.common import Bilinear #DEFINE_ALIAS from .layer.common import Dropout #DEFINE_ALIAS -from .layer.common import Dropout2D #DEFINE_ALIAS -from .layer.common import Dropout3D #DEFINE_ALIAS +from .layer.common import Dropout2d #DEFINE_ALIAS +from .layer.common import Dropout3d #DEFINE_ALIAS from .layer.common import AlphaDropout #DEFINE_ALIAS from .layer.pooling import AvgPool1d #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 3c0aa9c5c99e545b657559c30fcde46a69781231..325eaa64d5ca4bd3d65bf266ff0a42226a3199e6 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -233,3 +233,4 @@ from .vision import space_to_depth #DEFINE_ALIAS from .vision import yolo_box #DEFINE_ALIAS from .vision import yolov3_loss #DEFINE_ALIAS from .input import one_hot #DEFINE_ALIAS +from .input import embedding #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index e77bf0e39672984f7076938b134f3e54f4c761ab..bc48cc21c29e6683602f37fb3eab6c9485fe4977 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -19,7 +19,7 @@ from ...fluid.layer_helper import LayerHelper from ...fluid.layers import core from ...fluid.data_feeder import check_variable_and_dtype, check_dtype -__all__ = ['one_hot'] +__all__ = ['one_hot', 'embedding'] def one_hot(x, num_classes, name=None): @@ -83,6 +83,7 @@ def one_hot(x, num_classes, 
name=None): # [0., 1., 0., 0.], # [0., 0., 0., 1.], # [1., 0., 0., 0.]] + """ if in_dygraph_mode(): @@ -94,7 +95,7 @@ def one_hot(x, num_classes, name=None): one_hot_out = helper.create_variable_for_type_inference(dtype='float32') if not isinstance(num_classes, Variable): - # user attribute + # user attribute inputs = {'X': x} attrs = {'depth': num_classes, 'allow_out_of_range': False} else: @@ -108,3 +109,115 @@ def one_hot(x, num_classes, name=None): outputs={'Out': one_hot_out}, stop_gradient=True) return one_hot_out + + +def embedding(x, weight, padding_idx=None, sparse=False, name=None): + """ + The operator is used to lookup embeddings vector of ids provided by :attr:`input` . + + The shape of output Tensor is generated by appending the last dimension of the input Tensor shape + with embedding size. + **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < weight.shape[0]` , + otherwise the program will throw an exception and exit. + + .. code-block:: text + + Case 1: + input is a Tensor. + padding_idx = -1 + x.data = [[1, 3], [2, 4], [4, 127]] + x.shape = [3, 2] + weight.shape = [128, 16] + output is a Tensor: + out.shape = [3, 2, 16] + out.data = [[[0.129435295, 0.244512452, ..., 0.436322452], + [0.345421456, 0.524563927, ..., 0.144534654]], + [[0.345249859, 0.124939536, ..., 0.194353745], + [0.945345345, 0.435394634, ..., 0.435345365]], + [[0.945345345, 0.435394634, ..., 0.435345365], + [0.0, 0.0, ..., 0.0 ]]] # padding data + + The input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127 + It will pad all-zero data when ids is 127. + + Args: + x(Tensor): A Tensor with type int32/int64, which contains the id information. The value of the input id should + satisfy :math:`0<= id < weight.shape[0]` . + weight (Tensor): The weight. A Tensor with shape of lookup table parameter. It should have two elements which + indicates the size of the dictionary of embeddings and the size of each embedding vector respectively. + sparse(bool): The flag indicating whether to use sparse update. This parameter only + affects the performance of the backwards gradient update. It is recommended to set + True because sparse update is faster. But some optimizers does not support sparse update, + such as :ref:`api_optimizer_AdadeltaOptimizer` , :ref:`api_optimizer_AdamaxOptimizer` , + :ref:`api_optimizer_DecayedAdagradOptimizer` , :ref:`api_optimizer_FtrlOptimizer` , + :ref:`api_optimizer_LambOptimizer` and :ref:`api_optimizer_LarsMomentumOptimizer` . + In these cases, is_sparse must be False. Default: False. + padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). + If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted + to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup + encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. + If set None, it makes no effect to output. Default: None. + name(str|None): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: Embedding Tensor mapped by input. The data type is the same as :attr:`weight`. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.nn as nn + + weight = prog.global_block().create_parameter( + attr=self._param_attr, + shape=param_shape, + dtype=self._dtype, + default_initializer=Constant(1.0)) + + prog = paddle.static.Program() + + weight = prog.global_block().create_parameter( + (128, 100), dtype="float32", default_initializer=Constant(1.0)) + + label = paddle.data( + name="label", + shape=[4], + append_batch_size=False, + dtype="int64") + + emb = nn.embedding( + x=label, weight=weight, sparse=True, name="embedding") + + """ + if in_dygraph_mode(): + return core.ops.lookup_table_v2( + weight, x, 'is_sparse', sparse, 'is_distributed', False, + 'remote_prefetch', False, 'padding_idx', padding_idx) + else: + helper = LayerHelper('embedding', **locals()) + dtype = helper.input_dtype() + + check_variable_and_dtype(x, 'input', ['int32', 'int64'], 'embedding') + + is_distributed = False + remote_prefetch = sparse and (not is_distributed) + + tmp = helper.create_variable_for_type_inference(dtype) + padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( + weight.shape[0] + padding_idx) + + helper.append_op( + type='lookup_table_v2', + inputs={'Ids': x, + 'W': weight}, + outputs={'Out': tmp}, + attrs={ + 'is_sparse': sparse, + 'is_distributed': is_distributed, + 'remote_prefetch': remote_prefetch, + 'padding_idx': padding_idx + }) + return tmp diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index f1509143f3c933db12fc4ab6afd1a00b291f38f4..d2ddee654f4d04de152d15130ba53c424af3e5b2 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -147,7 +147,6 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean', label = paddle.to_tensor(label_data) output = paddle.nn.functional.binary_cross_entropy(input, label) print(output.numpy()) # [0.65537095] - paddle.enable_static() """ if reduction not in ['sum', 'mean', 'none']: @@ -165,8 +164,7 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean', return core.ops.reduce_sum(out, 'dim', [0], 'keep_dim', False, "reduce_all", True) elif reduction == 'mean': - return core.ops.reduce_mean(out, 'dim', [0], 'keep_dim', False, - "reduce_all", True) + return core.ops.mean(out) else: return out @@ -467,14 +465,12 @@ def margin_ranking_loss(input, .. 
code-block:: python - import numpy as np import paddle - paddle.disable_static() - input = paddle.to_variable(np.array([[1, 2], [3, 4]]).astype('float32')) - other = paddle.to_variable(np.array([[2, 1], [2, 4]]).astype('float32')) - label = paddle.to_variable(np.array([[1, -1], [-1, -1]]).astype('float32')) + input = paddle.to_tensor([[1, 2], [3, 4]], dtype='float32') + other = paddle.to_tensor([[2, 1], [2, 4]], dtype='float32') + label = paddle.to_tensor([[1, -1], [-1, -1]], dtype='float32') loss = paddle.nn.functional.margin_ranking_loss(input, other, label) print(loss.numpy()) # [0.75] """ @@ -578,8 +574,8 @@ def l1_loss(input, label, reduction='mean', name=None): paddle.disable_static() input_data = np.array([[1.5, 0.8], [0.2, 1.3]]).astype("float32") label_data = np.array([[1.7, 1], [0.4, 0.5]]).astype("float32") - input = paddle.to_variable(input_data) - label = paddle.to_variable(label_data) + input = paddle.to_tensor(input_data) + label = paddle.to_tensor(label_data) l1_loss = paddle.nn.functional.l1_loss(input, label) print(l1_loss.numpy()) @@ -675,9 +671,9 @@ def nll_loss(input, place = paddle.CPUPlace() paddle.disable_static(place) - input = paddle.to_variable(input_np) + input = paddle.to_tensor(input_np) log_out = log_softmax(input) - label = paddle.to_variable(label_np) + label = paddle.to_tensor(label_np) result = nll_loss(log_out, label) print(result.numpy()) # [1.0720209] """ diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index e9c1a21ecffb1b64cb5ae9e6b802600625cb4685..f63fc33525576c61998be9facf32b4c66aa2a971 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -165,7 +165,7 @@ def batch_norm(x, w = paddle.to_tensor(weight_data) b = paddle.to_tensor(bias_data) batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv, w, b) - print batch_norm_out + print(batch_norm_out.numpy()) """ assert len(x.shape) >= 2, "input dim must be larger than 1" @@ -176,6 +176,15 @@ def batch_norm(x, mean_out = running_mean variance_out = running_var + true_data_format = ['NC', 'NCL', 'NCHW', 'NCWH', 'NCDHW'] + if data_format not in true_data_format: + raise ValueError( + "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCWH', 'NCDHW', but receive {}". 
+ format(data_format)) + + if data_format != 'NCWH': + data_format = 'NCHW' + if in_dygraph_mode(): # for dygraph need tuple attrs = ("momentum", momentum, "epsilon", epsilon, "data_layout", @@ -270,7 +279,7 @@ def layer_norm(x, layer_norm = paddle.nn.functional.layer_norm(x, x.shape[1:]) layer_norm_out = layer_norm(x) - print(layer_norm_out.numpy) + print(layer_norm_out.numpy()) """ input_shape = list(x.shape) input_ndim = len(input_shape) @@ -302,10 +311,10 @@ def layer_norm(x, # create output helper = LayerHelper('layer_norm', **locals()) mean_out = helper.create_variable_for_type_inference( - dtype=x.type, stop_gradient=True) + dtype=x.dtype, stop_gradient=True) variance_out = helper.create_variable_for_type_inference( - dtype=x.type, stop_gradient=True) - layer_norm_out = helper.create_variable_for_type_inference(x.type) + dtype=x.dtype, stop_gradient=True) + layer_norm_out = helper.create_variable_for_type_inference(x.dtype) helper.append_op( type="layer_norm", @@ -362,7 +371,7 @@ def instance_norm(x, x = paddle.to_tensor(x_data) instance_norm_out = paddle.nn.functional.instancenorm(x) - print(instance_norm_out.numpy) + print(instance_norm_out.numpy()) """ diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index c8790a75901fd5d9a38862158246e3756dc575c4..b4a713a1964f5d99503e0b5a221668656fa657d1 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -847,8 +847,7 @@ def adaptive_avg_pool1d(x, output_size, name=None): with shape [N, C, L]. The format of input tensor is NCL, where N is batch size, C is the number of channels, L is the length of the feature. The data type is float32 or float64. - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one int. + output_size (int): The target output size. It must be an integer. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -856,7 +855,7 @@ def adaptive_avg_pool1d(x, output_size, name=None): Tensor: The output tensor of adaptive average pooling result. The data type is same as input tensor. Raises: - ValueError: 'output_size' should be an integer or list or tuple with length as 1. + ValueError: 'output_size' should be an integer. Examples: .. 
code-block:: python # average adaptive pool1d diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 6eac15cd694e51c24f94f7686b6e63fa7c6cbf09..7d7a392ebe80c3af8c991dbff746d0f8f216b18b 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -63,8 +63,8 @@ from .common import UpSample #DEFINE_ALIAS from .common import UpsamplingNearest2d #DEFINE_ALIAS from .common import UpsamplingBilinear2d #DEFINE_ALIAS from .common import Dropout #DEFINE_ALIAS -from .common import Dropout2D #DEFINE_ALIAS -from .common import Dropout3D #DEFINE_ALIAS +from .common import Dropout2d #DEFINE_ALIAS +from .common import Dropout3d #DEFINE_ALIAS from .common import AlphaDropout #DEFINE_ALIAS from .pooling import AvgPool1d #DEFINE_ALIAS from .pooling import AvgPool2d #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index a1e6508c67d96e9f6cc077efe6e61d708674b057..d8e1d03b02840e76ff865986d8b90ca9d6cdd9f8 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -15,7 +15,7 @@ # TODO: define the common classes to build a neural network from ...fluid.dygraph import BilinearTensorProduct #DEFINE_ALIAS from ...fluid.dygraph import Pool2D #DEFINE_ALIAS -from ...fluid.dygraph import Embedding #DEFINE_ALIAS +from ...fluid.dygraph import Linear #DEFINE_ALIAS from ...fluid.dygraph import Flatten #DEFINE_ALIAS from ...fluid.dygraph import layers from .. import functional as F @@ -41,8 +41,8 @@ __all__ = [ 'ReplicationPad3d', 'CosineSimilarity', 'Dropout', - 'Dropout2D', - 'Dropout3D', + 'Dropout2d', + 'Dropout3d', 'Bilinear', 'AlphaDropout', ] @@ -146,9 +146,9 @@ class UpSample(layers.Layer): 'nearest' : Nearest neighbor interpolation 'bicubic' : Bicubic interpolation - Linear interpolation is the method of using a line connecting two known quantities - to determine the value of an unknown quantity between the two known quantities. - + Linear interpolation is the method of using a line connecting two known quantities + to determine the value of an unknown quantity between the two known quantities. + Nearest neighbor interpolation is to perform nearest neighbor interpolation in both the 3rd dimension(in height direction) and the 4th dimension(in width direction) on input tensor. @@ -158,7 +158,7 @@ class UpSample(layers.Layer): W-direction in this op) on a rectilinear 2D grid. The key idea is to perform linear interpolation first in one direction, and then again in the other direction. - + Bicubic interpolation is an extension of cubic interpolation for interpolating data points on a two-dimensional regular grid. The interpolated surface is smoother than corresponding surfaces obtained by bilinear interpolation or @@ -205,7 +205,7 @@ class UpSample(layers.Layer): output: (N,C,H_out,W_out) where: H_out = round(H_{in} * scale_{factor}) W_out = round(W_{in} * scale_{factor}) - + Bilinear interpolation: if: align_corners = False , align_mode = 0 @@ -252,19 +252,19 @@ class UpSample(layers.Layer): https://en.wikipedia.org/wiki/Linear_interpolation. For details of linear interpolation, please refer to Wikipedia: - + For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. - + For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation. 
- + For details of bicubic interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bicubic_interpolation - + For details of trilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Trilinear_interpolation. - + Parameters: x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8, its data format is specified by :attr:`data_format`. @@ -537,8 +537,8 @@ class Pad2D(layers.Layer): If mode is 'reflect', paddings[0] and paddings[1] must be no greater than height-1. And the width dimension has the same condition. Parameters: - paddings (int | List[int32]): The padding size. If padding is a int, uses the same - padding in all boundaries, if padding is a List, it must contain four integers, + paddings (int | List[int32]): The padding size. If padding is a int, uses the same + padding in all boundaries, if padding is a List, it must contain four integers, (padding_top, padding_bottom, padding_left, padding_right). Default is [0, 0, 0, 0]. mode (str): Three modes: 'constant' (default), 'reflect', 'edge' . @@ -550,7 +550,7 @@ class Pad2D(layers.Layer): data_format (str): An string from: "NHWC", "NCHW". Specify the data format of the input data. Default is "NCHW" - Returns: + Returns: None Examples: .. code-block:: text @@ -631,11 +631,11 @@ class Bilinear(layers.Layer): in1_features (int): The dimension of each first input(`x1`). in2_features (int): The dimension of each second input(`x2`). out_features (int): The dimension of output of this layer. - weight_attr (ParamAttr, optional): The parameter attribute for the learnable w, parameters/weights of + weight_attr (ParamAttr, optional): The parameter attribute for the learnable w, parameters/weights of this layer. The default value is None. bias_attr (ParamAttr, optional): The parameter attribute for the bias of this layer. If it is set to False, no bias will be added to the output units. - If it is set to None, the bias is initialized zero. The default value is None. + If it is set to None, the bias is initialized zero. The default value is None. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Default: None. @@ -702,7 +702,7 @@ class Dropout(layers.Layer): """ Dropout is a regularization technique for reducing overfitting by preventing neuron co-adaption during training as described in the paper: - `Improving neural networks by preventing co-adaptation of feature detectors `_ + `Improving neural networks by preventing co-adaptation of feature detectors `_ The dropout operator randomly sets the outputs of some units to zero, while upscale others according to the given dropout probability. @@ -766,13 +766,13 @@ class Dropout(layers.Layer): return out -class Dropout2D(layers.Layer): +class Dropout2d(layers.Layer): """ Randomly zero out entire channels (in the batched input 4d tensor with the shape `NCHW` , a channel is a 2D feature map with the shape `HW`). Each channel will be zeroed out independently on every forward call with probability `p` using samples from a Bernoulli distribution. - Dropout2d will help promote independence between feature maps as described in the paper: - `Efficient Object Localization Using Convolutional Networks `_ + Dropout2d will help promote independence between feature maps as described in the paper: + `Efficient Object Localization Using Convolutional Networks `_ See ``paddle.nn.functional.dropout2d`` for more details. 
@@ -798,7 +798,7 @@ class Dropout2D(layers.Layer): paddle.disable_static() x = np.random.random(size=(2, 3, 4, 5)).astype('float32') x = paddle.to_tensor(x) - m = paddle.nn.Dropout2D(p=0.5) + m = paddle.nn.Dropout2d(p=0.5) y_train = m(x) m.eval() # switch the model to test phase y_test = m(x) @@ -808,7 +808,7 @@ class Dropout2D(layers.Layer): """ def __init__(self, p=0.5, data_format='NCHW', name=None): - super(Dropout2D, self).__init__() + super(Dropout2d, self).__init__() self.p = p self.data_format = data_format @@ -824,13 +824,13 @@ class Dropout2D(layers.Layer): return out -class Dropout3D(layers.Layer): +class Dropout3d(layers.Layer): """ Randomly zero out entire channels (in the batched input 5d tensor with the shape `NCDHW` , a channel is a 3D feature map with the shape `DHW` ). Each channel will be zeroed out independently on every forward call with probability `p` using samples from a Bernoulli distribution. - Dropout3d will help promote independence between feature maps as described in the paper: - `Efficient Object Localization Using Convolutional Networks `_ + Dropout3d will help promote independence between feature maps as described in the paper: + `Efficient Object Localization Using Convolutional Networks `_ See ``paddle.nn.functional.dropout3d`` for more details. @@ -856,7 +856,7 @@ class Dropout3D(layers.Layer): paddle.disable_static() x = np.random.random(size=(2, 3, 4, 5, 6)).astype('float32') x = paddle.to_tensor(x) - m = paddle.nn.Dropout3D(p=0.5) + m = paddle.nn.Dropout3d(p=0.5) y_train = m(x) m.eval() # switch the model to test phase y_test = m(x) @@ -866,7 +866,7 @@ class Dropout3D(layers.Layer): """ def __init__(self, p=0.5, data_format='NCDHW', name=None): - super(Dropout3D, self).__init__() + super(Dropout3d, self).__init__() self.p = p self.data_format = data_format @@ -1547,3 +1547,131 @@ class CosineSimilarity(layers.Layer): def forward(self, x1, x2): return F.cosine_similarity(x1, x2, axis=self._axis, eps=self._eps) + + +class Embedding(layers.Layer): + """ + :alias_main: paddle.nn.Embedding + :alias: paddle.nn.Embedding,paddle.nn.layer.Embedding,paddle.nn.layer.common.Embedding + :old_api: paddle.fluid.dygraph.Embedding + + **Embedding Layer** + + This interface is used to construct a callable object of the ``Embedding`` class. + For specific usage, refer to code examples. It implements the function of the Embedding Layer. + This layer is used to lookup embeddings vector of ids provided by :attr:`input` . + It automatically constructs a 2D embedding matrix based on the + input :attr:`size` (vocab_size, emb_size) and :attr:`dtype` . + + The shape of output Tensor is generated by appending an emb_size dimension to the + last dimension of the input Tensor shape. + + **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < size[0]` , + otherwise the program will throw an exception and exit. + + .. code-block:: text + + Case 1: + + input is a Tensor. padding_idx = -1 + input.data = [[1, 3], [2, 4], [4, 127] + input.shape = [3, 2] + Given size = [128, 16] + output is a Tensor: + out.shape = [3, 2, 16] + out.data = [[[0.129435295, 0.244512452, ..., 0.436322452], + [0.345421456, 0.524563927, ..., 0.144534654]], + + [[0.345249859, 0.124939536, ..., 0.194353745], + [0.945345345, 0.435394634, ..., 0.435345365]], + + [[0.945345345, 0.435394634, ..., 0.435345365], + [0.0, 0.0, ..., 0.0 ]]] # padding data + The input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127 + It will pad all-zero data when ids is 127. 
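A small worked sketch of the padding rule in Case 1, using the constructor added further down in this patch and assuming the class is reachable as ``paddle.nn.Embedding`` as its alias block states: ``padding_idx=-1`` is converted to ``128 + (-1) = 127``, so lookups of id 127 return all-zero vectors that never receive gradient updates.

.. code-block:: python

    import numpy as np
    import paddle
    import paddle.nn as nn

    paddle.disable_static()
    ids = paddle.to_tensor(np.array([[1, 3], [2, 4], [4, 127]]).astype('int64'))

    emb = nn.Embedding(128, 16, padding_idx=-1)
    out = emb(ids)           # rows with id 127 come back as zeros
    print(out.shape)         # [3, 2, 16]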
+ + Parameters: + num_embeddings (int): Just one element which indicate the size + of the dictionary of embeddings. + embedding_dim: Just one element which indicate the size of each embedding vector respectively. + padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). + If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted + to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup + encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. + If set None, it makes no effect to output. Default: None. + sparse(bool): The flag indicating whether to use sparse update. This parameter only + affects the performance of the backwards gradient update. It is recommended to set + True because sparse update is faster. But some optimizer does not support sparse update, + such as :ref:`api_optimizer_AdadeltaOptimizer` , :ref:`api_optimizer_AdamaxOptimizer` , + :ref:`api_optimizer_DecayedAdagradOptimizer` , :ref:`api_optimizer_FtrlOptimizer` , + :ref:`api_optimizer_LambOptimizer` and :ref:`api_optimizer_LarsMomentumOptimizer` . + In these case, is_sparse must be False. Default: False. + weight_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the + default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . In addition, + user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. + The local word vector needs to be transformed into numpy format, and the shape of local word + vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` + is used to load custom or pre-trained word vectors. See code example 2 for details. + name(str|None): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Attribute: + **weight** (Parameter): the learnable weights of this layer. + + Returns: + None + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.nn as nn + import numpy as np + paddle.disable_static() + + # example 1 + inp_word = np.array([[2, 3, 5], [4, 2, 1]]).astype('int64') + inp_word.shape # [2, 3] + dict_size = 20 + + emb = nn.Embedding( + dict_size, + 32, + sparse=False) + """ + + def __init__(self, + num_embeddings, + embedding_dim, + padding_idx=None, + sparse=False, + weight_attr=None, + name=None): + super(Embedding, self).__init__() + self._num_embeddings = num_embeddings + self._embedding_dim = embedding_dim + self._sparse = sparse + self._is_distributed = False + self._padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( + num_embeddings + padding_idx) + self._dtype = self._helper.get_default_dtype() + self._size = [self._num_embeddings, self._embedding_dim] + + self._weight_attr = weight_attr + self._remote_prefetch = False + self._name = name + self._weight = self.create_parameter( + attr=self._weight_attr, + shape=self._size, + dtype=self._dtype, + is_bias=False) + + def forward(self, x): + return F.embedding( + x, + weight=self._weight, + padding_idx=self._padding_idx, + sparse=self._sparse, + name=self._name) diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py index b0917441de3fea640204a3891ed03e9a451e3f0f..334b71151b563f9f68fc4e7e1c89d83697e1fb4c 100644 --- a/python/paddle/nn/layer/distance.py +++ b/python/paddle/nn/layer/distance.py @@ -44,10 +44,10 @@ class PairwiseDistance(layers.Layer): For more information, please refer to :ref:`api_guide_Name`. Shape: - x: :math:`(N, D)` where `D` is the dimension of vector, available dtype + x: :math:`[N, D]` where `D` is the dimension of vector, available dtype is float32, float64. - y: :math:`(N, D)`, y have the same shape and dtype as x. - out: :math:`(N)`. If :attr:`keepdim` is ``True``, the out shape is :math:`(N, 1)`. + y: :math:`[N, D]`, y have the same shape and dtype as x. + out: :math:`[N]`. If :attr:`keepdim` is ``True``, the out shape is :math:`[N, 1]`. The same dtype as input tensor. Examples: @@ -58,8 +58,8 @@ class PairwiseDistance(layers.Layer): paddle.disable_static() x_np = np.array([[1., 3.], [3., 5.]]).astype(np.float64) y_np = np.array([[5., 6.], [7., 8.]]).astype(np.float64) - x = paddle.to_variable(x_np) - y = paddle.to_variable(y_np) + x = paddle.to_tensor(x_np) + y = paddle.to_tensor(y_np) dist = paddle.nn.PairwiseDistance() distance = dist(x, y) print(distance.numpy()) # [5. 5.] diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index a1c7d28a85e762ebb381c5f0075df1c7b00396f7..a60e615d5064bf4ef2229dd67193774030383888 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -376,8 +376,8 @@ class L1Loss(fluid.dygraph.Layer): paddle.disable_static() input_data = np.array([[1.5, 0.8], [0.2, 1.3]]).astype("float32") label_data = np.array([[1.7, 1], [0.4, 0.5]]).astype("float32") - input = paddle.to_variable(input_data) - label = paddle.to_variable(label_data) + input = paddle.to_tensor(input_data) + label = paddle.to_tensor(label_data) l1_loss = paddle.nn.loss.L1Loss() output = l1_loss(input, label) @@ -455,7 +455,7 @@ class BCELoss(fluid.dygraph.Layer): For more information, please refer to :ref:`api_guide_Name`. Shape: - input (Tensor): 2-D tensor with shape: (N, *), N is batch_size, `*` means + input (Tensor): 2-D tensor with shape: [N, *], N is batch_size, `*` means number of additional dimensions. The input ``input`` should always be the output of sigmod. 
Available dtype is float32, float64. label (Tensor): 2-D tensor with the same shape as ``input``. The target @@ -476,12 +476,11 @@ class BCELoss(fluid.dygraph.Layer): For more information, please refer to :ref:`api_guide_Name`. label_data = np.array([1.0, 0.0, 1.0]).astype("float32") paddle.disable_static() - input = paddle.to_variable(input_data) - label = paddle.to_variable(label_data) + input = paddle.to_tensor(input_data) + label = paddle.to_tensor(label_data) bce_loss = paddle.nn.loss.BCELoss() output = bce_loss(input, label) print(output.numpy()) # [0.65537095] - paddle.enable_static() """ @@ -584,9 +583,9 @@ class NLLLoss(fluid.dygraph.Layer): place = paddle.CPUPlace() paddle.disable_static(place) - input = paddle.to_variable(input_np) + input = paddle.to_tensor(input_np) log_out = log_softmax(input) - label = paddle.to_variable(label_np) + label = paddle.to_tensor(label_np) result = nll_loss(log_out, label) print(result.numpy()) # [1.0720209] @@ -729,14 +728,12 @@ class MarginRankingLoss(fluid.dygraph.Layer): .. code-block:: python - import numpy as np import paddle - paddle.disable_static() - input = paddle.to_variable(np.array([[1, 2], [3, 4]]).astype("float32")) - other = paddle.to_variable(np.array([[2, 1], [2, 4]]).astype("float32")) - label = paddle.to_variable(np.array([[1, -1], [-1, -1]]).astype("float32")) + input = paddle.to_tensor([[1, 2], [3, 4]], dtype="float32") + other = paddle.to_tensor([[2, 1], [2, 4]], dtype="float32") + label = paddle.to_tensor([[1, -1], [-1, -1]], dtype="float32") margin_rank_loss = paddle.nn.MarginRankingLoss() loss = margin_rank_loss(input, other, label) print(loss.numpy()) # [0.75] diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 4d25418579d74ae896f8ca590400a0a334047e93..8bdb09c76918d747d644a8c781dabb6aab41522c 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -78,7 +78,7 @@ class _InstanceNormBase(layers.Layer): super(_InstanceNormBase, self).__init__() if weight_attr == False or bias_attr == False: - assert weight_attr == param_attr, "weight_attr and bias_attr must be set to Fasle at the same time in InstanceNorm" + assert weight_attr == bias_attr, "weight_attr and bias_attr must be set to False at the same time in InstanceNorm" self._epsilon = epsilon self._weight_attr = weight_attr self._bias_attr = bias_attr @@ -176,7 +176,7 @@ class InstanceNorm1d(_InstanceNormBase): instance_norm = paddle.nn.InstanceNorm1d(2) instance_norm_out = instance_norm(x) - print(instance_norm_out.numpy) + print(instance_norm_out.numpy()) """ @@ -253,7 +253,7 @@ class InstanceNorm2d(_InstanceNormBase): instance_norm = paddle.nn.InstanceNorm2d(2) instance_norm_out = instance_norm(x) - print(instance_norm_out.numpy) + print(instance_norm_out.numpy()) """ def _check_input_dim(self, input): @@ -329,7 +329,7 @@ class InstanceNorm3d(_InstanceNormBase): instance_norm = paddle.nn.InstanceNorm3d(2) instance_norm_out = instance_norm(x) - print(instance_norm_out.numpy) + print(instance_norm_out.numpy()) """ def _check_input_dim(self, input): @@ -346,8 +346,8 @@ class GroupNorm(layers.Layer): Refer to `Group Normalization `_ . Parameters: - num_channels(int): The number of channels of input. num_groups(int): The number of groups that divided from channels. + num_channels(int): The number of channels of input. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-05.
weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable @@ -375,19 +375,19 @@ class GroupNorm(layers.Layer): np.random.seed(123) x_data = np.random.random(size=(2, 6, 2, 2)).astype('float32') x = paddle.to_tensor(x_data) - group_norm = paddle.nn.GroupNorm(num_channels=3, num_groups=6) + group_norm = paddle.nn.GroupNorm(num_channels=6, num_groups=6) group_norm_out = group_norm(x) - print(group_norm_out.numpy) + print(group_norm_out.numpy()) """ def __init__(self, - num_channels, num_groups, + num_channels, epsilon=1e-05, weight_attr=None, bias_attr=None, - data_layout='NCHW', + data_format='NCHW', name=None): super(GroupNorm, self).__init__() self._weight_attr = weight_attr @@ -395,18 +395,33 @@ class GroupNorm(layers.Layer): self._epsilon = epsilon self._num_channels = num_channels self._num_groups = num_groups - if data_layout != 'NCHW': + if data_format != 'NCHW': raise ValueError("unsupported data layout:" + data_layout) param_shape = [self._num_channels] - self.weight = self.create_parameter( - attr=self._weight_attr or False, - shape=param_shape, - default_initializer=Constant(1.0)) + if weight_attr == False: + self.weight = self.create_parameter( + attr=None, shape=param_shape, default_initializer=Constant(1.0)) + self.weight.stop_gradient = True + else: + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + default_initializer=Constant(1.0)) + self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. - self.bias = self.create_parameter( - attr=self._weight_attr or False, shape=param_shape, is_bias=True) + if bias_attr == False: + self.bias = self.create_parameter( + attr=None, + shape=param_shape, + default_initializer=Constant(0.0), + is_bias=True) + self.bias.stop_gradient = True + else: + self.bias = self.create_parameter( + attr=self._bias_attr, shape=param_shape, is_bias=True) + self.bias.stop_gradient = self._bias_attr != None and self._bias_attr.learning_rate == 0. 
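Given the branches just added to ``__init__``, passing ``weight_attr=False`` / ``bias_attr=False`` now produces fixed, non-trainable scale and shift parameters. A hedged usage sketch, with ``num_channels`` matching the input's channel dimension as in the corrected docstring example:

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()
    x = paddle.to_tensor(np.random.random((2, 6, 2, 2)).astype('float32'))

    # gamma is frozen at 1.0 and beta at 0.0; neither receives gradients
    gn = paddle.nn.GroupNorm(num_groups=6, num_channels=6,
                             weight_attr=False, bias_attr=False)
    out = gn(x)
    print(gn.weight.stop_gradient, gn.bias.stop_gradient)  # True True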
def forward(self, input): inputs = {'X': input} @@ -500,7 +515,7 @@ class LayerNorm(layers.Layer): layer_norm = paddle.nn.LayerNorm(x_data.shape[1:]) layer_norm_out = layer_norm(x) - print(layer_norm_out.numpy) + print(layer_norm_out.numpy()) """ def __init__(self, @@ -603,8 +618,7 @@ class _BatchNormBase(layers.Layer): initializer=Constant(0.0), trainable=False, do_model_average=True), - shape=param_shape, - dtype=self._dtype) + shape=param_shape) self._mean.stop_gradient = True self._variance = self.create_parameter( @@ -613,8 +627,7 @@ class _BatchNormBase(layers.Layer): initializer=Constant(1.0), trainable=False, do_model_average=True), - shape=param_shape, - dtype=self._dtype) + shape=param_shape) self._variance.stop_gradient = True self._data_format = data_format @@ -628,8 +641,13 @@ class _BatchNormBase(layers.Layer): def _check_input_dim(self, input): raise NotImplementedError("BatchNorm Base error") + def _check_data_format(self, input): + raise NotImplementedError("BatchNorm Base data format error") + def forward(self, input): + self._check_data_format(self._data_format) + self._check_input_dim(input) if not self.training and not self._track_running_stats: @@ -730,9 +748,15 @@ class BatchNorm1d(_BatchNormBase): batch_norm = paddle.nn.BatchNorm1d(1) batch_norm_out = batch_norm(x) - print(batch_norm_out.numpy) + print(batch_norm_out.numpy()) """ + def _check_data_format(self, input): + if input == 'NCHW' or input == 'NC' or input == 'NCL': + self._data_format = 'NCHW' + else: + raise ValueError('expected NC , NCL or None for data_format input') + def _check_input_dim(self, input): if len(input.shape) != 2 and len(input.shape) != 3: raise ValueError('expected 2D or 3D input (got {}D input)'.format( @@ -816,9 +840,15 @@ class BatchNorm2d(_BatchNormBase): batch_norm = paddle.nn.BatchNorm2d(1) batch_norm_out = batch_norm(x) - print(batch_norm_out.numpy) + print(batch_norm_out.numpy()) """ + def _check_data_format(self, input): + if input == 'NCHW' or input == 'NCWH': + self._data_format = input + else: + raise ValueError('expected NCHW or NCWH for data_format input') + def _check_input_dim(self, input): if len(input.shape) != 4: raise ValueError('expected 4D input (got {}D input)'.format( @@ -902,9 +932,15 @@ class BatchNorm3d(_BatchNormBase): batch_norm = paddle.nn.BatchNorm3d(1) batch_norm_out = batch_norm(x) - print(batch_norm_out.numpy) + print(batch_norm_out.numpy()) """ + def _check_data_format(self, input): + if input == 'NCHW' or input == 'NCDHW': + self._data_format = 'NCHW' + else: + raise ValueError('expected NCDHW or None for data_format input') + def _check_input_dim(self, input): if len(input.shape) != 5: raise ValueError('expected 5D input (got {}D input)'.format( diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 6f6b567849732ff889db4507708758cd8eeab2a8..4cb661cf541222ec4f05df0fdc69b6483f04cf55 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -613,8 +613,7 @@ class AdaptiveAvgPool1d(layers.Layer): Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)} Args: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one int. + output_size (int): The target output size. It must be an integer. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -623,7 +622,7 @@ class AdaptiveAvgPool1d(layers.Layer): None. 
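Since ``output_size`` is now documented as a plain integer, a minimal dygraph sketch of the adaptive average pooling call:

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()
    data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype('float32'))

    # output_size must be an int; lists and tuples are no longer part of the contract
    pool = paddle.nn.AdaptiveAvgPool1d(output_size=16)
    out = pool(data)
    print(out.shape)  # [1, 3, 16]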
Raises: - ValueError: 'pool_size' should be a integer or list or tuple with length as 1. + ValueError: 'output_size' should be an integer. Shape: - x: 3-D tensor. diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 50a8755ac9f7b0a8e35c60f02a9fb825195ab80f..63069e83952172df3136458ebfee4b446749934d 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -25,12 +25,13 @@ __all__ = [ import copy import collections +from .common import Linear, Dropout +from .norm import LayerNorm +from .. import functional as F +from ... import tensor from ...fluid import layers +from ...fluid.dygraph import Layer, LayerList from ...fluid.param_attr import ParamAttr -from ...fluid.dygraph import Layer, Linear, Dropout, LayerNorm, LayerList -from .. import functional as F -from ...fluid.layers import utils -from ...fluid.layers.utils import map_structure def _convert_param_attr_to_list(param_attr, n): @@ -103,7 +104,7 @@ class MultiHeadAttention(Layer): # self attention mask: [batch_size, num_heads, query_len, query_len] attn_mask = paddle.rand((2, 2, 4, 4)) multi_head_attn = paddle.MultiHeadAttention(128, 2) - output = multi_head_attn(query, attn_mask=attn_mask) # [2, 4, 128] + output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] """ Cache = collections.namedtuple("Cache", ["k", "v"]) @@ -176,8 +177,8 @@ class MultiHeadAttention(Layer): and their data types are same as inputs. """ q = self.q_proj(query) - q = layers.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) - q = layers.transpose(x=q, perm=[0, 2, 1, 3]) + q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) + q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached @@ -187,8 +188,8 @@ class MultiHeadAttention(Layer): if isinstance(cache, self.Cache): # for decoder self-attention in inference - k = layers.concat([cache.k, k], axis=2) - v = layers.concat([cache.v, v], axis=2) + k = tensor.concat([cache.k, k], axis=2) + v = tensor.concat([cache.v, v], axis=2) cache = self.Cache(k, v) return (q, k, v) if cache is None else (q, k, v, cache) @@ -219,10 +220,10 @@ class MultiHeadAttention(Layer): """ k = self.k_proj(key) v = self.v_proj(value) - k = layers.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) - k = layers.transpose(x=k, perm=[0, 2, 1, 3]) - v = layers.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) - v = layers.transpose(x=v, perm=[0, 2, 1, 3]) + k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) + k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) + v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) + v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) return k, v def gen_cache(self, key, value=None, type=Cache): @@ -352,24 +353,25 @@ class MultiHeadAttention(Layer): q, k, v, cache = self._prepare_qkv(query, key, value, cache) # scale dot product attention + # TODO(guosheng): use tensor.matmul, however it doesn't support `alpha` product = layers.matmul( x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5) if attn_mask is not None: # TODO(guosheng): support bool mask product = product + attn_mask - weights = layers.softmax(product) + weights = F.softmax(product) if self.dropout: - weights = layers.dropout( + weights = F.dropout( weights, - dropout_prob=self.dropout, - dropout_implementation="upscale_in_train", - is_test=False) + self.dropout, + training=self.training, + 
mode="upscale_in_train") - out = layers.matmul(weights, v) + out = tensor.matmul(weights, v) # combine heads - out = layers.transpose(out, perm=[0, 2, 1, 3]) - out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + out = tensor.transpose(out, perm=[0, 2, 1, 3]) + out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) # project to output out = self.out_proj(out) @@ -429,7 +431,7 @@ class TransformerEncoderLayer(Layer): .. code-block:: python import paddle - from paddle import TransformerEncoderLayer + from paddle.nn import TransformerEncoderLayer # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) @@ -470,17 +472,14 @@ class TransformerEncoderLayer(Layer): bias_attr=bias_attrs[0]) self.linear1 = Linear( d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1]) - self.dropout = Dropout( - act_dropout, dropout_implementation="upscale_in_train") + self.dropout = Dropout(act_dropout, mode="upscale_in_train") self.linear2 = Linear( dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1]) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) - self.dropout1 = Dropout( - dropout, dropout_implementation="upscale_in_train") - self.dropout2 = Dropout( - dropout, dropout_implementation="upscale_in_train") - self.activation = getattr(layers, activation) + self.dropout1 = Dropout(dropout, mode="upscale_in_train") + self.dropout2 = Dropout(dropout, mode="upscale_in_train") + self.activation = getattr(F, activation) def forward(self, src, src_mask=None): """ @@ -539,7 +538,7 @@ class TransformerEncoder(Layer): .. code-block:: python import paddle - from paddle import TransformerEncoderLayer, TransformerEncoder + from paddle.nn import TransformerEncoderLayer, TransformerEncoder # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) @@ -643,7 +642,7 @@ class TransformerDecoderLayer(Layer): .. code-block:: python import paddle - from paddle import TransformerDecoderLayer + from paddle.nn import TransformerDecoderLayer # decoder input: [batch_size, tgt_len, d_model] dec_input = paddle.rand((2, 4, 128)) @@ -697,20 +696,16 @@ class TransformerDecoderLayer(Layer): bias_attr=bias_attrs[1]) self.linear1 = Linear( d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2]) - self.dropout = Dropout( - act_dropout, dropout_implementation="upscale_in_train") + self.dropout = Dropout(act_dropout, mode="upscale_in_train") self.linear2 = Linear( dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2]) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.norm3 = LayerNorm(d_model) - self.dropout1 = Dropout( - dropout, dropout_implementation="upscale_in_train") - self.dropout2 = Dropout( - dropout, dropout_implementation="upscale_in_train") - self.dropout3 = Dropout( - dropout, dropout_implementation="upscale_in_train") - self.activation = getattr(layers, activation) + self.dropout1 = Dropout(dropout, mode="upscale_in_train") + self.dropout2 = Dropout(dropout, mode="upscale_in_train") + self.dropout3 = Dropout(dropout, mode="upscale_in_train") + self.activation = getattr(F, activation) def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): """ @@ -834,7 +829,7 @@ class TransformerDecoder(Layer): .. 
code-block:: python import paddle - from paddle import TransformerDecoderLayer, TransformerDecoder + from paddle.nn import TransformerDecoderLayer, TransformerDecoder # decoder input: [batch_size, tgt_len, d_model] dec_input = paddle.rand((2, 4, 128)) @@ -1017,7 +1012,7 @@ class Transformer(Layer): .. code-block:: python import paddle - from paddle import Transformer + from paddle.nn import Transformer # src: [batch_size, tgt_len, d_model] enc_input = paddle.rand((2, 4, 128)) diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 0da8053fe8a3495f5d3188a737638531347de648..3150b8c2d0363274dfb6fd3465110c89339cd4c9 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -45,8 +45,8 @@ class Adam(Optimizer): Related paper: `Adam: A Method for Stochastic Optimization `_ Args: - learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``. - It can be a float value or a LearningRateDecay. The default value is 0.001. + learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``. + It can be a float value or a _LRScheduler. The default value is 0.001. beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates. It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.9. @@ -55,7 +55,7 @@ class Adam(Optimizer): The default value is 0.999. epsilon (float, optional): A small float value for numerical stability. The default value is 1e-08. - parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ + parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ @@ -143,6 +143,12 @@ class Adam(Optimizer): assert beta1 is not None assert beta2 is not None assert epsilon is not None + if not 0 <= beta1 < 1: + raise ValueError("Invaild value of beta1, expect beta1 in [0,1).") + if not 0 <= beta2 < 1: + raise ValueError("Invaild value of beta2, expect beta2 in [0,1).") + if not 0 <= epsilon: + raise ValueError("Invaild value of epsilon, expect epsilon >= 0.") super(Adam, self).__init__( learning_rate=learning_rate, parameters=parameters, diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index 73a78b17cbba55c1ee90a2708f6c163940158a51..cca120efd450768520d9cf027f6a36aaad121d9e 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -47,15 +47,15 @@ class Adamax(Optimizer): it is added here for numerical stability to prevent the division by 0 error. Args: - learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``. - It can be a float value or a LearningRateDecay. The default value is 0.001. + learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``. + It can be a float value or a _LRScheduler. The default value is 0.001. beta1 (float, optional): The exponential decay rate for the 1st moment estimates. The default value is 0.9. beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. The default value is 0.999. epsilon (float, optional): A small float value for numerical stability. The default value is 1e-08. 
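The range checks added to ``Adam`` above (and mirrored in ``Adamax`` and ``AdamW`` below) reject invalid hyperparameters at construction time. A short sketch of the resulting behaviour, assuming the dygraph usage shown elsewhere in this patch:

.. code-block:: python

    import paddle

    paddle.disable_static()
    linear = paddle.nn.Linear(13, 5)

    # valid settings construct as before ...
    adam = paddle.optimizer.Adam(learning_rate=0.001, beta1=0.9, beta2=0.999,
                                 epsilon=1e-8, parameters=linear.parameters())

    # ... while beta1/beta2 outside [0, 1) or a negative epsilon raise ValueError
    try:
        paddle.optimizer.Adam(beta1=1.5, parameters=linear.parameters())
    except ValueError as err:
        print(err)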
- parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ + parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ @@ -118,6 +118,12 @@ class Adamax(Optimizer): assert beta1 is not None assert beta2 is not None assert epsilon is not None + if not 0 <= beta1 < 1: + raise ValueError("Invaild value of beta1, expect beta1 in [0,1).") + if not 0 <= beta2 < 1: + raise ValueError("Invaild value of beta2, expect beta2 in [0,1).") + if not 0 <= epsilon: + raise ValueError("Invaild value of epsilon, expect epsilon >= 0.") super(Adamax, self).__init__( learning_rate=learning_rate, parameters=parameters, diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index f498fcbffa24ec188b57ceb2d3c6884fc1e135d2..edaca7e8301676c8734eb3e60924844bea0121d9 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -19,112 +19,7 @@ import paddle __all__ = ['AdamW'] -class DecoupledWeightDecay(object): - def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs): - if not isinstance(coeff, float) and \ - not isinstance(coeff, framework.Variable): - raise TypeError("coeff should be float or Tensor.") - self._params_name = set() - self._apply_decay_param_fun = apply_decay_param_fun - self._coeff = coeff - super(DecoupledWeightDecay, self).__init__(**kwargs) - - def _scale_parameters(self, params_and_grads): - """ - Adds weight decay ops. - scaled_parameter = parameter * coeff - - Args: - params_and_grads: A list of (parameters, gradients) pairs, - the parameters need to decay. - Raises: - Exception: The type of coeff and parameter is not consistent. 
- """ - if isinstance(self._coeff, float) and self._coeff == 0.0: - return - - scaled_params = [] - for param, grad in params_and_grads: - # If no gradient then we don't need to do anything - if grad is None: - continue - if self._apply_decay_param_fun is not None \ - and not self._apply_decay_param_fun(param.name): - continue - - if isinstance(self._coeff, float): - assert param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32, \ - "the type of coeff(float) and parameter(%s) is not consistent."%(self._coeff.dtype) - else: - assert self._coeff.dtype == param.dtype, \ - "the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype) - - with param.block.program._optimized_guard( - [param, grad]), framework.name_scope('weight decay'): - assert param.name not in self._params_name - scaled_params.append((param, grad, param * self._coeff)) - self._params_name.add(param.name) - return scaled_params - - def backward(self, **kargs): - return super(DecoupledWeightDecay, self).backward(**kargs) - - def _apply_optimize(self, **kargs): - return super(DecoupledWeightDecay, self)._apply_optimize(**kargs) - - def minimize(self, - loss, - startup_program=None, - parameters=None, - no_grad_set=None): - params_grads = self.backward( - loss=loss, - startup_program=startup_program, - parameters=parameters, - no_grad_set=no_grad_set) - scaled_params = self._scale_parameters(params_grads) - for p_grad_sgrad in scaled_params: - param, grad, scaled_param = p_grad_sgrad - with param.block.program._optimized_guard( - [param, grad]), framework.name_scope('weight decay'): - updated_param = paddle.fluid.layers.elementwise_sub( - x=param, y=scaled_param) - paddle.fluid.layers.assign(input=updated_param, output=param) - - optimize_ops = self._apply_optimize( - loss=loss, - params_grads=params_grads, - startup_program=startup_program) - return optimize_ops, params_grads - - @framework.dygraph_only - def step(self): - parameter_list = self._parameter_list - self._dtype = None - params_grads = [] - for param in self._parameter_list: - if not param.trainable: - continue - if param._grad_ivar() is not None: - grad_var = param._grad_ivar() - params_grads.append((param, grad_var)) - - scaled_params = self._scale_parameters(params_grads) - for p_grad_sgrad in scaled_params: - param, grad, scaled_param = p_grad_sgrad - with param.block.program._optimized_guard( - [param, grad]), framework.name_scope('weight decay'): - updated_param = paddle.fluid.layers.elementwise_sub( - x=param, y=scaled_param) - paddle.fluid.layers.assign(input=updated_param, output=param) - optimize_ops = self._apply_optimize( - loss=None, startup_program=None, params_grads=params_grads) - - def __str__(self): - return " ".join(["Weight Decay, params:", ",".join(self._params_name)]) - - -class AdamW(DecoupledWeightDecay, Adam): +class AdamW(Adam): """ The AdamW optimizer is implemented based on the AdamW Optimization in paper `DECOUPLED WEIGHT DECAY REGULARIZATION `_. @@ -145,8 +40,8 @@ class AdamW(DecoupledWeightDecay, Adam): Args: - learning_rate (float|LearningRateDecay, optional): The learning rate used to update ``Parameter``. - It can be a float value or a LearningRateDecay. The default value is 0.001. + learning_rate (float|_LRScheduler, optional): The learning rate used to update ``Parameter``. + It can be a float value or a _LRScheduler. The default value is 0.001. parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ This parameter is required in dygraph mode. 
\ The default value is None in static mode, at this time all parameters will be updated. @@ -157,9 +52,9 @@ class AdamW(DecoupledWeightDecay, Adam): It should be a float number or a Tensor with shape [1] and data type as float32. The default value is 0.999. epsilon (float, optional): A small float value for numerical stability. - weight_decay (float|Tensor): The weight decay coefficient, it can be float or Tensor. The default value is 0.0. The default value is 1e-08. - apply_decay_param_fun (function|None): If it is not None, + weight_decay (float|Tensor, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01. + apply_decay_param_fun (function|None, optional): If it is not None, only tensors that makes apply_decay_param_fun(Tensor)==True will be updated. It only works when we want to specify tensors. Default: None. @@ -208,26 +103,129 @@ class AdamW(DecoupledWeightDecay, Adam): def __init__(self, learning_rate=0.001, - parameters=None, beta1=0.9, beta2=0.999, epsilon=1e-8, - weight_decay=0.0, + parameters=None, + weight_decay=0.01, apply_decay_param_fun=None, grad_clip=None, name=None, lazy_mode=False): - args_dict = { - "learning_rate": learning_rate, - "parameters": parameters, - "beta1": beta1, - "beta2": beta2, - "epsilon": epsilon, - "grad_clip": grad_clip, - "name": name, - "lazy_mode": lazy_mode - } + assert learning_rate is not None + assert beta1 is not None + assert beta2 is not None + assert epsilon is not None + if not 0 <= beta1 < 1: + raise ValueError("Invaild value of beta1, expect beta1 in [0,1).") + if not 0 <= beta2 < 1: + raise ValueError("Invaild value of beta2, expect beta2 in [0,1).") + if not 0 <= epsilon: + raise ValueError("Invaild value of epsilon, expect epsilon >= 0.") + coeff = weight_decay + if not isinstance(coeff, float) and \ + not isinstance(coeff, framework.Variable): + raise TypeError("coeff should be float or Tensor.") + self._params_name = set() + self._apply_decay_param_fun = apply_decay_param_fun + self._coeff = coeff super(AdamW, self).__init__( - weight_decay, - apply_decay_param_fun=apply_decay_param_fun, - **args_dict) + learning_rate=learning_rate, + parameters=parameters, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + grad_clip=grad_clip, + name=name, + lazy_mode=lazy_mode) + + def _scale_parameters(self, params_and_grads): + """ + Adds weight decay ops. + scaled_parameter = parameter * coeff + + Args: + params_and_grads: A list of (parameters, gradients) pairs, + the parameters need to decay. + Raises: + Exception: The type of coeff and parameter is not consistent. 
+ """ + + scaled_params = [] + for param, grad in params_and_grads: + # If no gradient then we don't need to do anything + if grad is None: + continue + if self._apply_decay_param_fun is not None \ + and not self._apply_decay_param_fun(param.name): + continue + + if isinstance(self._coeff, float): + assert param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32, \ + "the type of coeff(float) and parameter(%s) is not consistent."%(self._coeff.dtype) + else: + assert self._coeff.dtype == param.dtype, \ + "the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype) + if isinstance(self._learning_rate, float): + learning_rate = self._learning_rate + else: + self._learning_rate() + with param.block.program._optimized_guard( + [param, grad]), framework.name_scope('weight decay'): + if param.name not in self._params_name: + scaled_params.append( + (param, grad, param * self._coeff * learning_rate)) + self._params_name.add(param.name) + param = param * self._coeff + return scaled_params + + def minimize(self, + loss, + startup_program=None, + parameters=None, + no_grad_set=None): + params_grads = self.backward( + loss=loss, + startup_program=startup_program, + parameters=parameters, + no_grad_set=no_grad_set) + scaled_params = self._scale_parameters(params_grads) + for p_grad_sgrad in scaled_params: + param, grad, scaled_param = p_grad_sgrad + with param.block.program._optimized_guard( + [param, grad]), framework.name_scope('weight decay'): + updated_param = paddle.fluid.layers.elementwise_sub( + x=param, y=scaled_param) + paddle.fluid.layers.assign(input=updated_param, output=param) + + optimize_ops = self._apply_optimize( + loss=loss, + params_grads=params_grads, + startup_program=startup_program) + return optimize_ops, params_grads + + @framework.dygraph_only + def step(self): + parameter_list = self._parameter_list + self._dtype = None + params_grads = [] + for param in self._parameter_list: + if not param.trainable: + continue + if param._grad_ivar() is not None: + grad_var = param._grad_ivar() + params_grads.append((param, grad_var)) + + scaled_params = self._scale_parameters(params_grads) + for p_grad_sgrad in scaled_params: + param, grad, scaled_param = p_grad_sgrad + with param.block.program._optimized_guard( + [param, grad]), framework.name_scope('weight decay'): + updated_param = paddle.fluid.layers.elementwise_sub( + x=param, y=scaled_param) + param.set_value(updated_param.numpy()) + optimize_ops = self._apply_optimize( + loss=None, startup_program=None, params_grads=params_grads) + + def __str__(self): + return " ".join(["Weight Decay, params:", ",".join(self._params_name)]) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 3f9de0cefc05d1aaee36fa3af5cfa9ae4affcb97..2f7bc94e646324b849b0308b219261f56eba1e28 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -80,7 +80,6 @@ class Optimizer(object): .. 
code-block:: python #Take the subclass adam as an example - #Optimizer import paddle import numpy as np @@ -98,7 +97,7 @@ class Optimizer(object): """ - @imperative_base.no_grad() + @imperative_base.no_grad def __init__(self, learning_rate, parameters=None, @@ -215,6 +214,8 @@ class Optimizer(object): adam.set_state_dict(opti_state_dict) ''' + if isinstance(self._learning_rate, _LRScheduler): + self._learning_rate.set_dict(state_dict["LR_Scheduler"]) if isinstance(self._learning_rate, _LRScheduler): self._learning_rate.set_state_dict(state_dict["LR_Scheduler"]) @@ -270,6 +271,7 @@ class Optimizer(object): main_prog = framework.default_main_program() main_prog.lr_sheduler = self._learning_rate main_prog.lr_var = lr_var + self._learning_rate_map[framework.default_main_program( )] = lr_var @@ -300,7 +302,7 @@ class Optimizer(object): this API cannot be invoked, because it will lead to conflict. Args: - value (float|Tensor): the value of learning rate + value (float): the value of learning rate Returns: None @@ -358,6 +360,7 @@ class Optimizer(object): Get current step learning rate. The return value is all the same When _LRScheduler is not used, otherwise return the current step learning rate. + Returns: float: The learning rate of the current step. @@ -655,7 +658,7 @@ class Optimizer(object): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") a = paddle.to_tensor(value) - linear = paddle.nn.Linear(13, 5, dtype="float32") + linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. adam = paddle.optimizer.Adam(learning_rate = 0.01, parameters = linear.parameters()) @@ -798,7 +801,7 @@ class Optimizer(object): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") a = paddle.to_tensor(value) - linear = paddle.nn.Linear(13, 5, dtype="float32") + linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. adam = paddle.optimizer.Adam(learning_rate = 0.01, parameters = linear.parameters()) @@ -812,7 +815,7 @@ class Optimizer(object): if p.trainable: p.clear_gradient() - @imperative_base.no_grad() + @imperative_base.no_grad def minimize(self, loss, startup_program=None, @@ -836,36 +839,33 @@ class Optimizer(object): tuple: tuple (optimize_ops, params_grads), A list of operators appended by minimize and a list of (param, grad) tensor pairs, param is ``Parameter``, grad is the gradient value corresponding to the parameter. - The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to indicate program pruning. If so, the program will be pruned by ``feed`` and ``fetch_list`` before run, see details in ``Executor``. Examples: .. 
code-block:: python - + import paddle - import paddle.fluid as fluid - - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.data(name='x', shape=[None, 13], dtype='float32') - y = fluid.data(name='y', shape=[None, 1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - adam_optimizer = paddle.optimizer.Adam(0.01) - adam_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + import numpy as np + + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + + adam = paddle.optimizer.Adam(learning_rate=0.1, + parameters=linear.parameters(), + weight_decay=0.01) + out.backward() + adam.minimize(loss) + adam.clear_grad() + """ assert isinstance(loss, Variable), "The loss should be an Tensor." @@ -885,7 +885,7 @@ class Optimizer(object): @framework.dygraph_only def step(self): """ - Execute the optimizer once. + Execute the optimizer and update parameters once. Returns: None @@ -898,7 +898,7 @@ class Optimizer(object): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") a = paddle.to_tensor(value) - linear = paddle.nn.Linear(13, 5, dtype="float32") + linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. adam = paddle.optimizer.Adam(learning_rate = 0.01, parameters = linear.parameters()) diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 0bc4c9bfd53dc15449f03d6de6c8942e977bf562..2609972d85ccdc2a867765431fefe21b9ba2de16 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -69,8 +69,8 @@ class RMSProp(Optimizer): Parameters: - learning_rate (float|LearningRateDecay): The learning rate used to update ``Parameter``. - It can be a float value or a LearningRateDecay. + learning_rate (float|_LRScheduler): The learning rate used to update ``Parameter``. + It can be a float value or a _LRScheduler. rho(float): rho is :math: `\\rho` in equation, default is 0.95. epsilon(float): :math: `\\epsilon` in equation is smoothing term to avoid division by zero, default is 1e-6. @@ -80,7 +80,7 @@ class RMSProp(Optimizer): the gradient; if False, by the uncentered second moment. Setting this to True may help with training, but is slightly more expensive in terms of computation and memory. Defaults to False. - parameters (list, optional): List of ``Tensor`` names to update to minimize ``loss``. \ + parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \ This parameter is required in dygraph mode. \ The default value is None in static mode, at this time all parameters will be updated. weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. 
\ @@ -147,6 +147,12 @@ class RMSProp(Optimizer): raise ValueError("epsilon is not set.") if momentum is None: raise ValueError("momentum is not set.") + if not 0.0 <= epsilon: + raise ValueError("Invalid value of epsilon, expect epsilon >= 0.") + if not 0.0 <= momentum: + raise ValueError("Invalid value of momentum, expect momentum >= 0.") + if not 0.0 <= rho: + raise ValueError("Invalid value of rho, expect rho >= 0.") super(RMSProp, self).__init__( learning_rate=learning_rate, diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py index bb3a578e15724e9501d69dc209bdedc65afeb82b..133c3dfb24fed82e4d666321585932d7e58a6f29 100644 --- a/python/paddle/optimizer/sgd.py +++ b/python/paddle/optimizer/sgd.py @@ -85,7 +85,7 @@ class SGD(Optimizer): name=name) self.type = "sgd" - @no_grad() + @no_grad def _append_optimize_op(self, block, param_and_grad): lr = self._create_param_lr(param_and_grad) if framework.in_dygraph_mode(): diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 845d2cf4d199328bbf8d0e03cd3a7a24a61aafd2..5a01fff88c16bfa584479d71ea93d78999de40df 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -433,8 +433,8 @@ def stack(x, axis=0, name=None): [5.0, 6.0] ] ] Args: - x (Tensor|list[Tensor]): Input ``x`` can be a single tensor, or a ``list`` of tensors. - If ``x`` is a ``list``, the Tensors in ``x`` + x (Tensor|list[Tensor]|tuple[Tensor]): Input ``x`` can be a single tensor, or a ``list`` or ``tuple`` of tensors. + If ``x`` is a ``list`` or ``tuple`` , the Tensors in ``x`` must be of the same shape and dtype. Supported data types: float32, float64, int32, int64. axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``, where ``R`` is the number of dimensions of the first input tensor ``x[0]``. diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 0d87c1c2cf705372de7b8534cf8faea1bb5320a6..d2db2a7cb71945e137e46d6793f8cba1f7adf12f 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1618,6 +1618,10 @@ def clip(x, min=None, max=None, name=None): fmax = float(np.finfo(np_dtype).max) if in_dygraph_mode(): + if isinstance(min, Variable): + min = min.numpy().item(0) + if isinstance(max, Variable): + max = max.numpy().item(0) min = fmin if min is None else min max = fmax if max is None else max return core.ops.clip(x, "min", min, "max", max) diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index c652d0f1891c8bd0a4c85ea777527a2fd82ad11b..6b08599fad1dfc6b5d60c3798bba802a5ddefd02 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -94,7 +94,7 @@ def bernoulli(x, name=None): return out -def gaussian_random(shape, mean=0.0, std=1.0, dtype='float32', name=None): +def gaussian_random(shape, mean=0.0, std=1.0, dtype=None, name=None): """ This OP returns a Tensor filled with random values sampled from a Gaussian distribution, with ``shape`` and ``dtype``. @@ -109,9 +109,10 @@ def gaussian_random(shape, mean=0.0, std=1.0, dtype='float32', name=None): std(float|int, optional): Standard deviation of the output tensor, default is 1.0. seed(int, optional): ${seed_comment} - dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of - the output Tensor. Supported data types: float32, float64. - Default is float32. + dtype(str|np.dtype, optional): The data type of the output Tensor. + Supported data types: float32, float64. 
+ Default is None, use global default dtype (see ``get_default_dtype`` + for details). name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -120,6 +121,13 @@ def gaussian_random(shape, mean=0.0, std=1.0, dtype='float32', name=None): Tensor: A Tensor filled with random values sampled from a Gaussian distribution, with ``shape`` and ``dtype``. """ + if dtype is None: + dtype = paddle.framework.get_default_dtype() + if dtype not in ['float32', 'float64']: + raise TypeError( + "gaussian_random only supports [float32, float64], but the default dtype is %s" + % dtype) + if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) seed = 0 @@ -169,9 +177,10 @@ def standard_normal(shape, dtype=None, name=None): (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). - dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the - output tensor. Supported data types: float32, float64. If ``dytpe`` - is None, the data type is float32. Default is None. + dtype(str|np.dtype, optional): The data type of the output Tensor. + Supported data types: float32, float64. + Default is None, use global default dtype (see ``get_default_dtype`` + for details). name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -216,7 +225,11 @@ def standard_normal(shape, dtype=None, name=None): """ if dtype is None: - dtype = 'float32' + dtype = paddle.framework.get_default_dtype() + if dtype not in ['float32', 'float64']: + raise TypeError( + "standard_normal only supports [float32, float64], but the default dtype is %s" + % dtype) return gaussian_random( shape=shape, mean=0.0, std=1.0, dtype=dtype, name=name) @@ -325,7 +338,7 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): return out -def uniform(shape, dtype='float32', min=-1.0, max=1.0, seed=0, name=None): +def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): """ This OP returns a Tensor filled with random values sampled from a uniform distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``. @@ -343,9 +356,10 @@ def uniform(shape, dtype='float32', min=-1.0, max=1.0, seed=0, name=None): (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). - dtype(str|np.dtype, optional): The data type of - the output Tensor. Supported data types: float32, float64. - Default is float32. + dtype(str|np.dtype, optional): The data type of the output Tensor. + Supported data types: float32, float64. + Default is None, use global default dtype (see ``get_default_dtype`` + for details). min(float|int, optional): The lower bound on the range of random values to generate, ``min`` is included in the range. Default is -1.0. 
max(float|int, optional): The upper bound on the range of random values @@ -401,6 +415,13 @@ def uniform(shape, dtype='float32', min=-1.0, max=1.0, seed=0, name=None): """ + if dtype is None: + dtype = paddle.framework.get_default_dtype() + if dtype not in ['float32', 'float64']: + raise TypeError( + "uniform only supports [float32, float64], but the default dtype is %s" + % dtype) + if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) @@ -447,7 +468,7 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). Default is [1]. - dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the + dtype(str|np.dtype, optional): The data type of the output tensor. Supported data types: int32, int64. If ``dytpe`` is None, the data type is int64. Default is None. name(str, optional): The default value is None. Normally there is no @@ -550,7 +571,7 @@ def randperm(n, dtype="int64", name=None): Args: n(int): The upper bound (exclusive), and it should be greater than 0. - dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of + dtype(str|np.dtype, optional): The data type of the output Tensor. Supported data types: int32, int64, float32, float64. Default is int64. name(str, optional): The default value is None. Normally there is no @@ -622,9 +643,10 @@ def rand(shape, dtype=None, name=None): (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). - dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the - output tensor. Supported data types: float32, float64. If ``dytpe`` - is None, the data type is float32. Default is None. + dtype(str|np.dtype, optional): The data type of the output Tensor. + Supported data types: float32, float64. + Default is None, use global default dtype (see ``get_default_dtype`` + for details). name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -668,7 +690,11 @@ def rand(shape, dtype=None, name=None): """ if dtype is None: - dtype = 'float32' + dtype = paddle.framework.get_default_dtype() + if dtype not in ['float32', 'float64']: + raise TypeError( + "rand only supports [float32, float64], but the default dtype is %s" + % dtype) out = uniform(shape, dtype, min=0.0, max=1.0, name=name) out.stop_gradient = True diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index eede022e05ba61bc23da517e7af7cd2eb58f5416..552da3401c61d9c046c29bc86b429a8ae1242fa5 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -18,7 +18,6 @@ from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtyp from ..fluid import core, layers # TODO: define searching & indexing functions of a tensor -from ..fluid.layers import argmin #DEFINE_ALIAS from ..fluid.layers import has_inf #DEFINE_ALIAS from ..fluid.layers import has_nan #DEFINE_ALIAS @@ -124,7 +123,7 @@ def argsort(x, axis=-1, descending=False, name=None): return ids -def argmax(x, axis=None, dtype=None, keepdim=False, name=None): +def argmax(x, axis=None, keepdim=False, dtype="int64", name=None): """ This OP computes the indices of the max elements of the input tensor's element along the provided axis. 
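Because ``dtype`` now defaults to ``'int64'`` and swaps positions with ``keepdim``, a short worked sketch of the updated ``argmax``/``argmin`` signatures (keyword arguments sidestep the reordering):

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()
    x = paddle.to_tensor(np.array([[5., 2., 9.], [1., 7., 3.]]).astype('float32'))

    # axis=None flattens first: the maximum (9.0) sits at flattened index 2
    print(paddle.argmax(x).numpy())
    print(paddle.argmax(x, axis=1).numpy())                  # [2 1]
    print(paddle.argmin(x, axis=0, dtype='int32').numpy())   # [1 0 1]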
@@ -135,10 +134,10 @@ def argmax(x, axis=None, dtype=None, keepdim=False, name=None): axis(int, optional): Axis to compute indices along. The effective range is [-R, R), where R is x.ndim. when axis < 0, it works the same way as axis + R. Default is None, the input `x` will be into the flatten tensor, and selecting the min value index. - dtype(str): Data type of the output tensor which can - be int32, int64. The default value is None, and it will - return the int64 indices. keepdim(bool, optional): Keep the axis that selecting max. The defalut value is False. + dtype(str|np.dtype, optional): Data type of the output tensor which can + be int32, int64. The default value is 'int64', and it will + return the int64 indices. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -166,48 +165,39 @@ def argmax(x, axis=None, dtype=None, keepdim=False, name=None): print(out3.numpy()) # [2 3 1] """ + if axis is not None and not isinstance(axis, int): + raise TypeError( + "The type of 'axis' must be int or None in argmax, but received %s." + % (type(axis))) + var_dtype = convert_np_dtype_to_dtype_(dtype) + check_dtype(var_dtype, 'dtype', ['int32', 'int64'], 'argmin') flatten = False if axis is None: flatten = True axis = 0 if in_dygraph_mode(): - if dtype != None: - var_dtype = convert_np_dtype_to_dtype_(dtype) - out = core.ops.arg_max(x, 'axis', axis, 'dtype', var_dtype, - 'keepdim', keepdim, 'flatten', flatten) - else: - out = core.ops.arg_max(x, 'axis', axis, 'keepdim', keepdim, - 'flatten', flatten) + out = core.ops.arg_max(x, 'axis', axis, 'dtype', var_dtype, 'keepdims', + keepdim, 'flatten', flatten) return out helper = LayerHelper("argmax", **locals()) check_variable_and_dtype( x, 'x', ['float32', 'float64', 'int16', 'int32', 'int64', 'uint8'], 'paddle.argmax') - var_dtype = None attrs = {} - if dtype is not None: - if dtype not in ['int32', 'int64']: - raise ValueError( - "The value of 'dtype' in argmax op must be int32, int64, but received of {}". - format(dtype)) - var_dtype = convert_np_dtype_to_dtype_(dtype) - attrs["dtype"] = var_dtype - else: - var_dtype = VarDesc.VarType.INT64 - out = helper.create_variable_for_type_inference(var_dtype) attrs['keepdims'] = keepdim attrs['axis'] = axis attrs['flatten'] = flatten + attrs['dtype'] = var_dtype helper.append_op( type='arg_max', inputs={'X': x}, outputs={'Out': [out]}, attrs=attrs) out.stop_gradient = True return out -def argmin(x, axis=None, dtype=None, keepdim=False, name=None): +def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): """ This OP computes the indices of the min elements of the input tensor's element along the provided axis. @@ -218,10 +208,10 @@ def argmin(x, axis=None, dtype=None, keepdim=False, name=None): axis(int, optional): Axis to compute indices along. The effective range is [-R, R), where R is x.ndim. when axis < 0, it works the same way as axis + R. Default is None, the input `x` will be into the flatten tensor, and selecting the min value index. + keepdim(bool, optional): Keep the axis that selecting min. The defalut value is False. dtype(str): Data type of the output tensor which can - be int32, int64. The default value is None, and it will + be int32, int64. The default value is 'int64', and it will return the int64 indices. - keepdim(bool, optional): Keep the axis that selecting min. The defalut value is False. name(str, optional): The default value is None. 
Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -249,41 +239,32 @@ def argmin(x, axis=None, dtype=None, keepdim=False, name=None): print(out3.numpy()) # [0 0 2] """ + if axis is not None and not isinstance(axis, int): + raise TypeError( + "The type of 'axis' must be int or None in argmin, but received %s." + % (type(axis))) + var_dtype = convert_np_dtype_to_dtype_(dtype) + check_dtype(var_dtype, 'dtype', ['int32', 'int64'], 'argmin') flatten = False if axis is None: flatten = True axis = 0 if in_dygraph_mode(): - if dtype != None: - var_dtype = convert_np_dtype_to_dtype_(dtype) - out = core.ops.arg_min(x, 'axis', axis, 'dtype', var_dtype, - 'keepdim', keepdim, 'flatten', flatten) - else: - out = core.ops.arg_min(x, 'axis', axis, 'keepdim', keepdim, - 'flatten', flatten) + out = core.ops.arg_min(x, 'axis', axis, 'dtype', var_dtype, 'keepdims', + keepdim, 'flatten', flatten) return out helper = LayerHelper("argmin", **locals()) check_variable_and_dtype( x, 'x', ['float32', 'float64', 'int16', 'int32', 'int64', 'uint8'], 'paddle.argmin') - var_dtype = None - attrs = {} - if dtype is not None: - if dtype not in ['int32', 'int64']: - raise ValueError( - "The value of 'dtype' in argmin op must be int32, int64, but received of {}". - format(dtype)) - var_dtype = convert_np_dtype_to_dtype_(dtype) - attrs["dtype"] = var_dtype - else: - var_dtype = VarDesc.VarType.INT64 - out = helper.create_variable_for_type_inference(var_dtype) + attrs = {} attrs['keepdims'] = keepdim attrs['axis'] = axis attrs['flatten'] = flatten + attrs['dtype'] = var_dtype helper.append_op( type='arg_min', inputs={'X': x}, outputs={'Out': [out]}, attrs=attrs) out.stop_gradient = True diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt index 79bec8c4ad34d682895250bc29b1fddb3a569bd4..e1bc65a5d15c2883e14d20c5e06c2ee3cd726ea5 100644 --- a/python/paddle/tests/CMakeLists.txt +++ b/python/paddle/tests/CMakeLists.txt @@ -1,6 +1,41 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py") +string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}") + +foreach(TEST_OP ${DIST_TEST_OPS}) + list(REMOVE_ITEM TEST_OPS ${TEST_OP}) +endforeach() + foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() + +function(py_dist_test TARGET_NAME) + if(WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS ARGS ENVS) + cmake_parse_arguments(py_dist_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(WITH_COVERAGE AND WITH_GPU AND WITH_NCCL AND NOT WIN32) + add_test(NAME ${TARGET_NAME} + COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true + FLAGS_cpu_deterministic=true NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 + PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_dist_test_ENVS} + COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data + ${PYTHON_EXECUTABLE} -u ${py_dist_test_SRCS} ${py_dist_test_ARGS} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + # No unit test should exceed 10 minutes. 
+ set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=DIST") + endif() + + + endif() +endfunction() + +foreach(src ${DIST_TEST_OPS}) + message(STATUS ${src}) + py_dist_test(${src} SRCS ${src}.py) +endforeach() diff --git a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py b/python/paddle/tests/dist_hapi_mnist_dynamic.py similarity index 91% rename from python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py rename to python/paddle/tests/dist_hapi_mnist_dynamic.py index ede99a50c2fa72da3bd1999204a5fe1e5a656be2..13d966bf38f2aaed35e120aa4d25705cfc36c230 100644 --- a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_dynamic.py +++ b/python/paddle/tests/dist_hapi_mnist_dynamic.py @@ -20,14 +20,15 @@ import unittest import numpy as np import contextlib -from paddle import fluid +import paddle +import paddle.fluid as fluid -from paddle.incubate.hapi import Model, Input, set_device +from paddle import Model, set_device +from paddle.static import InputSpec as Input from paddle.nn.layer.loss import CrossEntropyLoss -from paddle.incubate.hapi.vision.models import LeNet from paddle.metric import Accuracy -from paddle.incubate.hapi.callbacks import ProgBarLogger -from paddle.incubate.hapi.datasets import MNIST +from paddle.vision.models import LeNet +from paddle.vision.datasets import MNIST class MnistDataset(MNIST): @@ -76,7 +77,7 @@ class TestDistTraning(unittest.TestCase): val_dataset = MnistDataset(mode='test') test_dataset = MnistDataset(mode='test', return_label=False) - cbk = ProgBarLogger(50) + cbk = paddle.callbacks.ProgBarLogger(50) model.fit(train_dataset, val_dataset, epochs=2, diff --git a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py b/python/paddle/tests/dist_hapi_mnist_static.py similarity index 91% rename from python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py rename to python/paddle/tests/dist_hapi_mnist_static.py index 28305fc6a6fd08c160f946920e85391cd444caef..9d8e5f3652c9810579a0b66035a64d1d3b915bff 100644 --- a/python/paddle/incubate/hapi/tests/dist_hapi_mnist_static.py +++ b/python/paddle/tests/dist_hapi_mnist_static.py @@ -20,14 +20,15 @@ import unittest import numpy as np import contextlib -from paddle import fluid +import paddle +import paddle.fluid as fluid -from paddle.incubate.hapi import Model, Input, set_device +from paddle import Model, set_device +from paddle.static import InputSpec as Input from paddle.nn.layer.loss import CrossEntropyLoss -from paddle.incubate.hapi.vision.models import LeNet from paddle.metric import Accuracy -from paddle.incubate.hapi.callbacks import ProgBarLogger -from paddle.incubate.hapi.datasets import MNIST +from paddle.vision.models import LeNet +from paddle.vision.datasets import MNIST class MnistDataset(MNIST): @@ -75,7 +76,7 @@ class TestDistTraning(unittest.TestCase): val_dataset = MnistDataset(mode='test') test_dataset = MnistDataset(mode='test', return_label=False) - cbk = ProgBarLogger(50) + cbk = paddle.callbacks.ProgBarLogger(50) model.fit(train_dataset, val_dataset, epochs=2, diff --git a/python/paddle/incubate/hapi/tests/test_callbacks.py b/python/paddle/tests/test_callbacks.py similarity index 93% rename from python/paddle/incubate/hapi/tests/test_callbacks.py rename to python/paddle/tests/test_callbacks.py index e49bf215c276c8b495b0f991a5821d4c674f48d2..f0d9a132b90eb1c7006fd53557a03376394ee2ab 100644 --- a/python/paddle/incubate/hapi/tests/test_callbacks.py +++ b/python/paddle/tests/test_callbacks.py @@ -18,9 +18,10 @@ import random import tempfile import shutil -from 
paddle.incubate.hapi.model import Model, Input -from paddle.incubate.hapi.vision.models import LeNet -from paddle.incubate.hapi.callbacks import config_callbacks +from paddle import Model +from paddle.static import InputSpec +from paddle.vision.models import LeNet +from paddle.hapi.callbacks import config_callbacks class TestCallbacks(unittest.TestCase): @@ -36,7 +37,7 @@ class TestCallbacks(unittest.TestCase): freq = 2 eval_steps = 20 - inputs = [Input([None, 1, 28, 28], 'float32', 'image')] + inputs = [InputSpec([None, 1, 28, 28], 'float32', 'image')] lenet = Model(LeNet(), inputs) lenet.prepare() diff --git a/python/paddle/incubate/hapi/tests/test_dataset_cifar.py b/python/paddle/tests/test_dataset_cifar.py similarity index 93% rename from python/paddle/incubate/hapi/tests/test_dataset_cifar.py rename to python/paddle/tests/test_dataset_cifar.py index 08d9f4353c0ed639f5ad907c921bf7b2c88271f5..2ecc41c3f0a81a56cc34e826483ea4f5cc6681d9 100644 --- a/python/paddle/incubate/hapi/tests/test_dataset_cifar.py +++ b/python/paddle/tests/test_dataset_cifar.py @@ -13,14 +13,9 @@ # limitations under the License. import unittest -import os import numpy as np -import tempfile -import shutil -import cv2 -from paddle.incubate.hapi.datasets import * -from paddle.incubate.hapi.datasets.utils import _check_exists_and_download +from paddle.vision.datasets import * class TestCifar10Train(unittest.TestCase): diff --git a/python/paddle/incubate/hapi/tests/test_dataset_conll05.py b/python/paddle/tests/test_dataset_conll05.py similarity index 87% rename from python/paddle/incubate/hapi/tests/test_dataset_conll05.py rename to python/paddle/tests/test_dataset_conll05.py index 0ed2a4180d0cb341f5d57bdf1cb9d8ef145a44fb..e35c04275d20478336da76c9ba47c98960a9ea24 100644 --- a/python/paddle/incubate/hapi/tests/test_dataset_conll05.py +++ b/python/paddle/tests/test_dataset_conll05.py @@ -12,15 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import unittest import os +import unittest import numpy as np -import tempfile -import shutil -import cv2 -from paddle.incubate.hapi.datasets import * -from paddle.incubate.hapi.datasets.utils import _check_exists_and_download +from paddle.text.datasets import * class TestConll05st(unittest.TestCase): @@ -36,6 +32,8 @@ class TestConll05st(unittest.TestCase): for s in sample: self.assertTrue(len(s.shape) == 1) + assert os.path.exists(conll05st.get_embedding()) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/incubate/hapi/tests/test_dataset_imdb.py b/python/paddle/tests/test_dataset_imdb.py similarity index 90% rename from python/paddle/incubate/hapi/tests/test_dataset_imdb.py rename to python/paddle/tests/test_dataset_imdb.py index cef73634b6b5fb114fa88b785bb77a87fe129bd5..62c75ab232c8db10f99257fdae17191f94726b61 100644 --- a/python/paddle/incubate/hapi/tests/test_dataset_imdb.py +++ b/python/paddle/tests/test_dataset_imdb.py @@ -13,14 +13,9 @@ # limitations under the License. 
import unittest -import os import numpy as np -import tempfile -import shutil -import cv2 -from paddle.incubate.hapi.datasets import * -from paddle.incubate.hapi.datasets.utils import _check_exists_and_download +from paddle.text.datasets import * class TestImdbTrain(unittest.TestCase): diff --git a/python/paddle/incubate/hapi/tests/test_dataset_imikolov.py b/python/paddle/tests/test_dataset_imikolov.py similarity index 89% rename from python/paddle/incubate/hapi/tests/test_dataset_imikolov.py rename to python/paddle/tests/test_dataset_imikolov.py index f3d97d314acbf7f55a8482fd386581fef7f16e03..f4f0b8e48367725abb4ebe1fe5b0598ed6e749f1 100644 --- a/python/paddle/incubate/hapi/tests/test_dataset_imikolov.py +++ b/python/paddle/tests/test_dataset_imikolov.py @@ -13,14 +13,9 @@ # limitations under the License. import unittest -import os import numpy as np -import tempfile -import shutil -import cv2 -from paddle.incubate.hapi.datasets import * -from paddle.incubate.hapi.datasets.utils import _check_exists_and_download +from paddle.text.datasets import * class TestImikolovTrain(unittest.TestCase): diff --git a/python/paddle/incubate/hapi/tests/test_dataset_movie_reviews.py b/python/paddle/tests/test_dataset_movie_reviews.py similarity index 90% rename from python/paddle/incubate/hapi/tests/test_dataset_movie_reviews.py rename to python/paddle/tests/test_dataset_movie_reviews.py index ae8a7a3035ee0e86f8ee2fa9e8a23f6036758d2d..e6e6667013f89aca305f82a744c00de2af818736 100644 --- a/python/paddle/incubate/hapi/tests/test_dataset_movie_reviews.py +++ b/python/paddle/tests/test_dataset_movie_reviews.py @@ -13,14 +13,9 @@ # limitations under the License. import unittest -import os import numpy as np -import tempfile -import shutil -import cv2 -from paddle.incubate.hapi.datasets import * -from paddle.incubate.hapi.datasets.utils import _check_exists_and_download +from paddle.text.datasets import * class TestMovieReviewsTrain(unittest.TestCase): diff --git a/python/paddle/incubate/hapi/tests/test_dataset_movielens.py b/python/paddle/tests/test_dataset_movielens.py similarity index 91% rename from python/paddle/incubate/hapi/tests/test_dataset_movielens.py rename to python/paddle/tests/test_dataset_movielens.py index f94269f930e05e04b3bdfc4324e5ae1ea15b1fb9..3b61fd6f5c7c22bca5114579fdafe46405f77118 100644 --- a/python/paddle/incubate/hapi/tests/test_dataset_movielens.py +++ b/python/paddle/tests/test_dataset_movielens.py @@ -13,14 +13,9 @@ # limitations under the License. 
import unittest -import os import numpy as np -import tempfile -import shutil -import cv2 -from paddle.incubate.hapi.datasets import * -from paddle.incubate.hapi.datasets.utils import _check_exists_and_download +from paddle.text.datasets import * class TestMovielensTrain(unittest.TestCase): diff --git a/python/paddle/incubate/hapi/tests/test_dataset_uci_housing.py b/python/paddle/tests/test_dataset_uci_housing.py similarity index 96% rename from python/paddle/incubate/hapi/tests/test_dataset_uci_housing.py rename to python/paddle/tests/test_dataset_uci_housing.py index 768367bff9911a352ea6b13f279d5b71938bc85b..623c7d24d09da7501edd6a8d86e60fc3b772d086 100644 --- a/python/paddle/incubate/hapi/tests/test_dataset_uci_housing.py +++ b/python/paddle/tests/test_dataset_uci_housing.py @@ -19,8 +19,7 @@ import tempfile import shutil import cv2 -from paddle.incubate.hapi.datasets import * -from paddle.incubate.hapi.datasets.utils import _check_exists_and_download +from paddle.text.datasets import * class TestUCIHousingTrain(unittest.TestCase): diff --git a/python/paddle/incubate/hapi/tests/test_dataset_voc.py b/python/paddle/tests/test_dataset_voc.py similarity index 92% rename from python/paddle/incubate/hapi/tests/test_dataset_voc.py rename to python/paddle/tests/test_dataset_voc.py index 85766ab8e30a3a7abd5e2966e6353b116c03e926..d45df419b1283a40b46252bee2d37a9e2fdaadb9 100644 --- a/python/paddle/incubate/hapi/tests/test_dataset_voc.py +++ b/python/paddle/tests/test_dataset_voc.py @@ -15,12 +15,8 @@ import unittest import os import numpy as np -import tempfile -import shutil -import cv2 -from paddle.incubate.hapi.datasets import voc2012, VOC2012 -from paddle.incubate.hapi.datasets.utils import _check_exists_and_download +from paddle.vision.datasets import voc2012, VOC2012 # VOC2012 is too large for unittest to download, stub a small dataset here voc2012.VOC_URL = 'https://paddlemodels.bj.bcebos.com/voc2012_stub/VOCtrainval_11-May-2012.tar' diff --git a/python/paddle/incubate/hapi/tests/test_dataset_wmt.py b/python/paddle/tests/test_dataset_wmt.py similarity index 95% rename from python/paddle/incubate/hapi/tests/test_dataset_wmt.py rename to python/paddle/tests/test_dataset_wmt.py index 987e55676aadb77582c58b13e626d7258f3c75b5..b4945cb90f991e907812129f3918ef0137565244 100644 --- a/python/paddle/incubate/hapi/tests/test_dataset_wmt.py +++ b/python/paddle/tests/test_dataset_wmt.py @@ -13,14 +13,9 @@ # limitations under the License. 
import unittest -import os import numpy as np -import tempfile -import shutil -import cv2 -from paddle.incubate.hapi.datasets import * -from paddle.incubate.hapi.datasets.utils import _check_exists_and_download +from paddle.text.datasets import * class TestWMT14Train(unittest.TestCase): diff --git a/python/paddle/incubate/hapi/tests/test_datasets.py b/python/paddle/tests/test_datasets.py similarity index 97% rename from python/paddle/incubate/hapi/tests/test_datasets.py rename to python/paddle/tests/test_datasets.py index 7f544e5ad84d5aa2041e8fdb6c1ac77cc34d8164..1e50ff60aa5c3039c21d6e1e3a714c32000462c7 100644 --- a/python/paddle/incubate/hapi/tests/test_datasets.py +++ b/python/paddle/tests/test_datasets.py @@ -19,8 +19,8 @@ import tempfile import shutil import cv2 -from paddle.incubate.hapi.datasets import * -from paddle.incubate.hapi.datasets.utils import _check_exists_and_download +from paddle.vision.datasets import * +from paddle.dataset.common import _check_exists_and_download class TestFolderDatasets(unittest.TestCase): diff --git a/python/paddle/incubate/hapi/tests/test_dist_hapi_model.py b/python/paddle/tests/test_dist_hapi_model.py similarity index 100% rename from python/paddle/incubate/hapi/tests/test_dist_hapi_model.py rename to python/paddle/tests/test_dist_hapi_model.py diff --git a/python/paddle/incubate/hapi/tests/test_download.py b/python/paddle/tests/test_download.py similarity index 97% rename from python/paddle/incubate/hapi/tests/test_download.py rename to python/paddle/tests/test_download.py index e8bd8306daf651dfbe96881424a02d4ffdb2a9e6..6fb53573c21a1589e474e337d058294c09f65f38 100644 --- a/python/paddle/incubate/hapi/tests/test_download.py +++ b/python/paddle/tests/test_download.py @@ -14,7 +14,7 @@ import unittest -from paddle.incubate.hapi.download import get_weights_path_from_url +from paddle.utils.download import get_weights_path_from_url class TestDownload(unittest.TestCase): diff --git a/python/paddle/incubate/hapi/tests/test_logger.py b/python/paddle/tests/test_logger.py similarity index 96% rename from python/paddle/incubate/hapi/tests/test_logger.py rename to python/paddle/tests/test_logger.py index f25d0ee4f7e2f0db1031f1f2884fb6df338003cc..b6edec8674a64fb7ce41f4e60d8d6b8822c514e3 100644 --- a/python/paddle/incubate/hapi/tests/test_logger.py +++ b/python/paddle/tests/test_logger.py @@ -21,7 +21,7 @@ import numpy as np import shutil import tempfile -from paddle.incubate.hapi.logger import setup_logger +from paddle.hapi.logger import setup_logger class TestSetupLogger(unittest.TestCase): diff --git a/python/paddle/tests/test_metrics.py b/python/paddle/tests/test_metrics.py index 2272a81b3f602ec46972c9d4620ded9680e2ff5f..f05cdf9c6da10bdcb68739e7018933d0ebe006dc 100644 --- a/python/paddle/tests/test_metrics.py +++ b/python/paddle/tests/test_metrics.py @@ -22,7 +22,7 @@ import numpy as np import paddle import paddle.fluid as fluid -from paddle.incubate.hapi.utils import to_list +from paddle.hapi.model import to_list def accuracy(pred, label, topk=(1, )): diff --git a/python/paddle/incubate/hapi/tests/test_model.py b/python/paddle/tests/test_model.py similarity index 85% rename from python/paddle/incubate/hapi/tests/test_model.py rename to python/paddle/tests/test_model.py index 7fc471aa1e2eeb80ae81d4a32b09eeff74193e6f..e078595dc9551763f2c4fc1b17f5b4220e3b1f6d 100644 --- a/python/paddle/incubate/hapi/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -24,21 +24,22 @@ import tempfile import paddle from paddle import fluid +from paddle import to_tensor 
from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax -from paddle.fluid.dygraph.base import to_variable -import paddle.incubate.hapi as hapi -from paddle.incubate.hapi import Model, Input +from paddle import Model +from paddle.static import InputSpec from paddle.nn.layer.loss import CrossEntropyLoss from paddle.metric import Accuracy -from paddle.incubate.hapi.datasets import MNIST -from paddle.incubate.hapi.vision.models import LeNet -from paddle.incubate.hapi.distributed import DistributedBatchSampler, prepare_distributed_context +from paddle.vision.datasets import MNIST +from paddle.vision.models import LeNet +from paddle.io import DistributedBatchSampler +from paddle.hapi.model import prepare_distributed_context from paddle.fluid.dygraph.jit import declarative from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator -class LeNetDygraph(fluid.dygraph.Layer): +class LeNetDygraph(paddle.nn.Layer): def __init__(self, num_classes=10, classifier_activation=None): super(LeNetDygraph, self).__init__() self.num_classes = num_classes @@ -154,7 +155,7 @@ class TestModel(unittest.TestCase): def setUpClass(cls): if not fluid.is_compiled_with_cuda(): self.skipTest('module not tested when ONLY_CPU compling') - cls.device = hapi.set_device('gpu') + cls.device = paddle.set_device('gpu') fluid.enable_dygraph(cls.device) sp_num = 1280 @@ -180,8 +181,8 @@ class TestModel(unittest.TestCase): cls.acc1 = dynamic_evaluate(dy_lenet, cls.val_loader) - cls.inputs = [Input([-1, 1, 28, 28], 'float32', 'image')] - cls.labels = [Input([None, 1], 'int64', 'label')] + cls.inputs = [InputSpec([-1, 1, 28, 28], 'float32', 'image')] + cls.labels = [InputSpec([None, 1], 'int64', 'label')] cls.save_dir = tempfile.mkdtemp() cls.weight_path = os.path.join(cls.save_dir, 'lenet') @@ -314,7 +315,7 @@ class TestModel(unittest.TestCase): fluid.disable_dygraph() if dynamic else None -class MyModel(fluid.dygraph.Layer): +class MyModel(paddle.nn.Layer): def __init__(self, classifier_activation='softmax'): super(MyModel, self).__init__() self._fc = Linear(20, 10) @@ -343,8 +344,8 @@ class TestModelFunction(unittest.TestCase): optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=m.parameters()) m.train() - output = m(to_variable(data)) - loss = CrossEntropyLoss(reduction='sum')(output, to_variable(label)) + output = m(to_tensor(data)) + loss = CrossEntropyLoss(reduction='sum')(output, to_tensor(label)) avg_loss = fluid.layers.reduce_sum(loss) avg_loss.backward() optim.minimize(avg_loss) @@ -354,7 +355,7 @@ class TestModelFunction(unittest.TestCase): ref = get_expect() for dynamic in [True, False]: - device = hapi.set_device('cpu') + device = paddle.set_device('cpu') fluid.enable_dygraph(device) if dynamic else None self.set_seed() @@ -362,8 +363,8 @@ class TestModelFunction(unittest.TestCase): optim2 = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=net.parameters()) - inputs = [Input([None, dim], 'float32', 'x')] - labels = [Input([None, 1], 'int64', 'label')] + inputs = [InputSpec([None, dim], 'float32', 'x')] + labels = [InputSpec([None, 1], 'int64', 'label')] model = Model(net, inputs, labels) model.prepare(optim2, loss=CrossEntropyLoss(reduction="sum")) loss, = model.train_batch([data], [label]) @@ -379,17 +380,17 @@ class TestModelFunction(unittest.TestCase): self.set_seed() m = MyModel() m.eval() - output = m(to_variable(data)) + output = m(to_tensor(data)) fluid.disable_dygraph() return output.numpy() ref = get_expect() for dynamic in [True, False]: - device = 
hapi.set_device('cpu') + device = paddle.set_device('cpu') fluid.enable_dygraph(device) if dynamic else None self.set_seed() net = MyModel() - inputs = [Input([None, dim], 'float32', 'x')] + inputs = [InputSpec([None, dim], 'float32', 'x')] model = Model(net, inputs) model.prepare() out, = model.test_batch([data]) @@ -400,11 +401,11 @@ class TestModelFunction(unittest.TestCase): def test_save_load(self): path = tempfile.mkdtemp() for dynamic in [True, False]: - device = hapi.set_device('cpu') + device = paddle.set_device('cpu') fluid.enable_dygraph(device) if dynamic else None net = MyModel(classifier_activation=None) - inputs = [Input([None, 20], 'float32', 'x')] - labels = [Input([None, 1], 'int64', 'label')] + inputs = [InputSpec([None, 20], 'float32', 'x')] + labels = [InputSpec([None, 1], 'int64', 'label')] optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=net.parameters()) model = Model(net, inputs, labels) @@ -415,10 +416,33 @@ class TestModelFunction(unittest.TestCase): shutil.rmtree(path) fluid.disable_dygraph() if dynamic else None + def test_dynamic_load(self): + mnist_data = MnistDataset(mode='train') + for new_optimizer in [True, False]: + path = tempfile.mkdtemp() + paddle.disable_static() + net = LeNet() + inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')] + labels = [InputSpec([None, 1], 'int64', 'label')] + if new_optimizer: + optim = paddle.optimizer.Adam( + learning_rate=0.001, parameters=net.parameters()) + else: + optim = fluid.optimizer.Adam( + learning_rate=0.001, parameter_list=net.parameters()) + model = Model(net, inputs, labels) + model.prepare( + optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) + model.fit(mnist_data, batch_size=64, verbose=0) + model.save(path + '/test') + model.load(path + '/test') + shutil.rmtree(path) + paddle.enable_static() + def test_dynamic_save_static_load(self): path = tempfile.mkdtemp() # dynamic saving - device = hapi.set_device('cpu') + device = paddle.set_device('cpu') fluid.enable_dygraph(device) model = Model(MyModel(classifier_activation=None)) optim = fluid.optimizer.SGD(learning_rate=0.001, @@ -427,8 +451,8 @@ class TestModelFunction(unittest.TestCase): model.save(path + '/test') fluid.disable_dygraph() - inputs = [Input([None, 20], 'float32', 'x')] - labels = [Input([None, 1], 'int64', 'label')] + inputs = [InputSpec([None, 20], 'float32', 'x')] + labels = [InputSpec([None, 1], 'int64', 'label')] model = Model(MyModel(classifier_activation=None), inputs, labels) optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=model.parameters()) @@ -440,20 +464,20 @@ class TestModelFunction(unittest.TestCase): path = tempfile.mkdtemp() net = MyModel(classifier_activation=None) - inputs = [Input([None, 20], 'float32', 'x')] - labels = [Input([None, 1], 'int64', 'label')] + inputs = [InputSpec([None, 20], 'float32', 'x')] + labels = [InputSpec([None, 1], 'int64', 'label')] optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=net.parameters()) model = Model(net, inputs, labels) model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) model.save(path + '/test') - device = hapi.set_device('cpu') + device = paddle.set_device('cpu') fluid.enable_dygraph(device) #if dynamic else None net = MyModel(classifier_activation=None) - inputs = [Input([None, 20], 'float32', 'x')] - labels = [Input([None, 1], 'int64', 'label')] + inputs = [InputSpec([None, 20], 'float32', 'x')] + labels = [InputSpec([None, 1], 'int64', 'label')] optim = fluid.optimizer.SGD(learning_rate=0.001, 
parameter_list=net.parameters()) model = Model(net, inputs, labels) @@ -464,10 +488,10 @@ class TestModelFunction(unittest.TestCase): def test_parameters(self): for dynamic in [True, False]: - device = hapi.set_device('cpu') + device = paddle.set_device('cpu') fluid.enable_dygraph(device) if dynamic else None net = MyModel() - inputs = [Input([None, 20], 'float32', 'x')] + inputs = [InputSpec([None, 20], 'float32', 'x')] model = Model(net, inputs) model.prepare() params = model.parameters() @@ -482,7 +506,7 @@ class TestModelFunction(unittest.TestCase): prog_translator = ProgramTranslator() prog_translator.enable(False) if not dynamic else None net = LeNetDeclarative() - inputs = [Input([None, 1, 28, 28], 'float32', 'x')] + inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')] model = Model(net, inputs) model.prepare() save_dir = tempfile.mkdtemp() @@ -514,8 +538,8 @@ class TestRaiseError(unittest.TestCase): def test_input_without_name(self): net = MyModel(classifier_activation=None) - inputs = [Input([None, 10], 'float32')] - labels = [Input([None, 1], 'int64', 'label')] + inputs = [InputSpec([None, 10], 'float32')] + labels = [InputSpec([None, 1], 'int64', 'label')] with self.assertRaises(ValueError): model = Model(net, inputs, labels) diff --git a/python/paddle/incubate/hapi/tests/test_pretrained_model.py b/python/paddle/tests/test_pretrained_model.py similarity index 82% rename from python/paddle/incubate/hapi/tests/test_pretrained_model.py rename to python/paddle/tests/test_pretrained_model.py index 334ebff449d4f34c9a5a9b56ee7998b4dbc5abf0..641147d39e94f7c2bbb426900ed484546bad49c6 100644 --- a/python/paddle/incubate/hapi/tests/test_pretrained_model.py +++ b/python/paddle/tests/test_pretrained_model.py @@ -15,9 +15,9 @@ import unittest import numpy as np -import paddle.fluid as fluid -import paddle.incubate.hapi.vision.models as models -from paddle.incubate.hapi import Model, Input +import paddle +from paddle.static import InputSpec +import paddle.vision.models as models # test the predicted resutls of static graph and dynamic graph are equal @@ -25,16 +25,16 @@ from paddle.incubate.hapi import Model, Input class TestPretrainedModel(unittest.TestCase): def infer(self, x, arch, dygraph=True): if dygraph: - fluid.enable_dygraph() + paddle.disable_static() net = models.__dict__[arch](pretrained=True, classifier_activation=None) - inputs = [Input([None, 3, 224, 224], 'float32', 'image')] - model = Model(network=net, inputs=inputs) + inputs = [InputSpec([None, 3, 224, 224], 'float32', 'image')] + model = paddle.Model(network=net, inputs=inputs) model.prepare() res = model.test_batch(x) if dygraph: - fluid.disable_dygraph() + paddle.enable_static() return res def test_models(self): diff --git a/python/paddle/incubate/hapi/tests/test_progressbar.py b/python/paddle/tests/test_progressbar.py similarity index 97% rename from python/paddle/incubate/hapi/tests/test_progressbar.py rename to python/paddle/tests/test_progressbar.py index ff315ef505606aaf45b46a722de8f0386ae2d5ed..4726522918238a2f88b73edbdebb3dea6fbe1281 100644 --- a/python/paddle/incubate/hapi/tests/test_progressbar.py +++ b/python/paddle/tests/test_progressbar.py @@ -17,7 +17,7 @@ import unittest import random import time -from paddle.incubate.hapi.progressbar import ProgressBar +from paddle.hapi.progressbar import ProgressBar class TestProgressBar(unittest.TestCase): diff --git a/python/paddle/incubate/hapi/tests/test_text.py b/python/paddle/tests/test_text.py similarity index 99% rename from 
python/paddle/incubate/hapi/tests/test_text.py rename to python/paddle/tests/test_text.py index c4fef0d749ce788e50d8cffdf9b7041e33d078af..43968896c18bda6445de46773899128e1bedff53 100644 --- a/python/paddle/incubate/hapi/tests/test_text.py +++ b/python/paddle/tests/test_text.py @@ -24,8 +24,9 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import Embedding, Linear, Layer from paddle.fluid.layers import BeamSearchDecoder -from paddle.incubate.hapi import Model, Input, set_device -from paddle.incubate.hapi.text import * +from paddle import Model, set_device +from paddle.static import InputSpec as Input +from paddle.text import * class ModuleApiTest(unittest.TestCase): diff --git a/python/paddle/incubate/hapi/tests/test_transforms.py b/python/paddle/tests/test_transforms.py similarity index 96% rename from python/paddle/incubate/hapi/tests/test_transforms.py rename to python/paddle/tests/test_transforms.py index 84208fda1e947f343de52a0a3c8de68322672013..6c2944d1e750faa9e9ed8d3c2b832b8aff59b954 100644 --- a/python/paddle/incubate/hapi/tests/test_transforms.py +++ b/python/paddle/tests/test_transforms.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# when test, you should add hapi root path to the PYTHONPATH, -# export PYTHONPATH=PATH_TO_HAPI:$PYTHONPATH import unittest import os import tempfile @@ -21,9 +19,9 @@ import cv2 import shutil import numpy as np -from paddle.incubate.hapi.datasets import DatasetFolder -from paddle.incubate.hapi.vision.transforms import transforms -import paddle.incubate.hapi.vision.transforms.functional as F +from paddle.vision.datasets import DatasetFolder +from paddle.vision.transforms import transforms +import paddle.vision.transforms.functional as F class TestTransforms(unittest.TestCase): diff --git a/python/paddle/incubate/hapi/tests/test_vision_models.py b/python/paddle/tests/test_vision_models.py similarity index 86% rename from python/paddle/incubate/hapi/tests/test_vision_models.py rename to python/paddle/tests/test_vision_models.py index 2dc9355bcc3005d48b7046123b024fa2a91594c3..44f9ab5390122f086af4168e225fe2b5a2d8a9b2 100644 --- a/python/paddle/incubate/hapi/tests/test_vision_models.py +++ b/python/paddle/tests/test_vision_models.py @@ -15,8 +15,9 @@ import unittest import numpy as np -import paddle.incubate.hapi.vision.models as models -import paddle.incubate.hapi as hapi +import paddle +from paddle.static import InputSpec +import paddle.vision.models as models class TestVisonModels(unittest.TestCase): @@ -28,8 +29,8 @@ class TestVisonModels(unittest.TestCase): else: net = models.__dict__[arch](pretrained=pretrained) - input = hapi.Input([None, 3, 224, 224], 'float32', 'image') - model = hapi.Model(net, input) + input = InputSpec([None, 3, 224, 224], 'float32', 'image') + model = paddle.Model(net, input) model.prepare() model.test_batch(x) @@ -71,8 +72,8 @@ class TestVisonModels(unittest.TestCase): self.models_infer('resnet152') def test_lenet(self): - input = hapi.Input([None, 1, 28, 28], 'float32', 'x') - lenet = hapi.Model(models.__dict__['LeNet'](), input) + input = InputSpec([None, 1, 28, 28], 'float32', 'x') + lenet = paddle.Model(models.__dict__['LeNet'](), input) lenet.prepare() x = np.array(np.random.random((2, 1, 28, 28)), dtype=np.float32) diff --git a/python/paddle/incubate/hapi/text/__init__.py b/python/paddle/text/__init__.py similarity index 86% rename from python/paddle/incubate/hapi/text/__init__.py rename to python/paddle/text/__init__.py 
index 7caab7071c9977e2ea1148e415cd51c33bfd1de0..083bfbd1d2528eceb070f32e5cc502382e4d6ea4 100644 --- a/python/paddle/incubate/hapi/text/__init__.py +++ b/python/paddle/text/__init__.py @@ -15,4 +15,8 @@ from . import text from .text import * -__all__ = text.__all__ +from . import datasets +from .datasets import * + +__all__ = text.__all__ \ + + datasets.__all__ diff --git a/python/paddle/incubate/hapi/datasets/__init__.py b/python/paddle/text/datasets/__init__.py similarity index 75% rename from python/paddle/incubate/hapi/datasets/__init__.py rename to python/paddle/text/datasets/__init__.py index a88b0e6bbf1975d97bfeb68025b978ce877c6baf..b5cea40a4f4924fee7a76bad6030a21fa5a61268 100644 --- a/python/paddle/incubate/hapi/datasets/__init__.py +++ b/python/paddle/text/datasets/__init__.py @@ -12,11 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import folder -from . import mnist -from . import flowers -from . import cifar -from . import voc2012 from . import conll05 from . import imdb from . import imikolov @@ -26,11 +21,6 @@ from . import uci_housing from . import wmt14 from . import wmt16 -from .folder import * -from .mnist import * -from .flowers import * -from .cifar import * -from .voc2012 import * from .conll05 import * from .imdb import * from .imikolov import * @@ -40,12 +30,7 @@ from .uci_housing import * from .wmt14 import * from .wmt16 import * -__all__ = folder.__all__ \ - + mnist.__all__ \ - + flowers.__all__ \ - + cifar.__all__ \ - + voc2012.__all__ \ - + conll05.__all__ \ +__all__ = conll05.__all__ \ + imdb.__all__ \ + imikolov.__all__ \ + movielens.__all__ \ diff --git a/python/paddle/incubate/hapi/datasets/conll05.py b/python/paddle/text/datasets/conll05.py similarity index 86% rename from python/paddle/incubate/hapi/datasets/conll05.py rename to python/paddle/text/datasets/conll05.py index 094e3559335363524c4ae893f70294a4afaa7037..8dd6db656ebe4ad08db301209e0dfe19fa1cf895 100644 --- a/python/paddle/incubate/hapi/datasets/conll05.py +++ b/python/paddle/text/datasets/conll05.py @@ -22,7 +22,7 @@ from six.moves import cPickle as pickle from paddle.io import Dataset import paddle.compat as cpt -from .utils import _check_exists_and_download +from paddle.dataset.common import _check_exists_and_download __all__ = ['Conll05st'] @@ -71,29 +71,29 @@ class Conll05st(Dataset): .. 
code-block:: python - import paddle - from paddle.incubate.hapi.datasets import Conll05st + import paddle + from paddle.text.datasets import Conll05st - class SimpleNet(paddle.nn.Layer): - def __init__(self): - super(SimpleNet, self).__init__() + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() - def forward(self, pred_idx, mark, label): - return paddle.sum(pred_idx), paddle.sum(mark), paddle.sum(label) + def forward(self, pred_idx, mark, label): + return paddle.sum(pred_idx), paddle.sum(mark), paddle.sum(label) - paddle.disable_static() + paddle.disable_static() - conll05st = Conll05st() + conll05st = Conll05st() - for i in range(10): - pred_idx, mark, label= conll05st[i][-3:] - pred_idx = paddle.to_tensor(pred_idx) - mark = paddle.to_tensor(mark) - label = paddle.to_tensor(label) + for i in range(10): + pred_idx, mark, label= conll05st[i][-3:] + pred_idx = paddle.to_tensor(pred_idx) + mark = paddle.to_tensor(mark) + label = paddle.to_tensor(label) - model = SimpleNet() - pred_idx, mark, label= model(pred_idx, mark, label) - print(pred_idx.numpy(), mark.numpy(), label.numpy()) + model = SimpleNet() + pred_idx, mark, label= model(pred_idx, mark, label) + print(pred_idx.numpy(), mark.numpy(), label.numpy()) """ @@ -131,6 +131,12 @@ class Conll05st(Dataset): target_dict_file, TRGDICT_URL, TRGDICT_MD5, 'conll05st', download) + self.emb_file = emb_file + if self.emb_file is None: + assert download, "emb_file is not set and downloading automatically is disabled" + self.emb_file = _check_exists_and_download( + emb_file, EMB_URL, EMB_MD5, 'conll05st', download) + self.word_dict = self._load_dict(self.word_dict_file) self.predicate_dict = self._load_dict(self.verb_dict_file) self.label_dict = self._load_label_dict(self.target_dict_file) @@ -290,8 +296,27 @@ class Conll05st(Dataset): def get_dict(self): """ Get the word, verb and label dictionary of Wikipedia corpus. + + Examples: + + .. code-block:: python + + from paddle.text.datasets import Conll05st + conll05st = Conll05st() + word_dict, predicate_dict, label_dict = conll05st.get_dict() """ return self.word_dict, self.predicate_dict, self.label_dict def get_embedding(self): + """ + Get the embedding dictionary file. + + Examples: + + .. code-block:: python + + from paddle.text.datasets import Conll05st + conll05st = Conll05st() + emb_file = conll05st.get_embedding() + """ return self.emb_file diff --git a/python/paddle/incubate/hapi/datasets/imdb.py b/python/paddle/text/datasets/imdb.py similarity index 84% rename from python/paddle/incubate/hapi/datasets/imdb.py rename to python/paddle/text/datasets/imdb.py index 12d166bc784a382ac5ae70491d3e8061ad1d1e9f..f1bf247efcaf7591fe8062976d6329898ee15258 100644 --- a/python/paddle/incubate/hapi/datasets/imdb.py +++ b/python/paddle/text/datasets/imdb.py @@ -22,7 +22,7 @@ import numpy as np import collections from paddle.io import Dataset -from .utils import _check_exists_and_download +from paddle.dataset.common import _check_exists_and_download __all__ = ['Imdb'] @@ -49,28 +49,28 @@ class Imdb(Dataset): .. 
code-block:: python - import paddle - from paddle.incubate.hapi.datasets import Imdb + import paddle + from paddle.text.datasets import Imdb - class SimpleNet(paddle.nn.Layer): - def __init__(self): - super(SimpleNet, self).__init__() + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() - def forward(self, doc, label): - return paddle.sum(doc), label + def forward(self, doc, label): + return paddle.sum(doc), label - paddle.disable_static() + paddle.disable_static() - imdb = Imdb(mode='train') + imdb = Imdb(mode='train') - for i in range(10): - doc, label = imdb[i] - doc = paddle.to_tensor(doc) - label = paddle.to_tensor(label) + for i in range(10): + doc, label = imdb[i] + doc = paddle.to_tensor(doc) + label = paddle.to_tensor(label) - model = SimpleNet() - image, label = model(doc, label) - print(doc.numpy().shape, label.numpy().shape) + model = SimpleNet() + image, label = model(doc, label) + print(doc.numpy().shape, label.numpy().shape) """ diff --git a/python/paddle/incubate/hapi/datasets/imikolov.py b/python/paddle/text/datasets/imikolov.py similarity index 87% rename from python/paddle/incubate/hapi/datasets/imikolov.py rename to python/paddle/text/datasets/imikolov.py index 2e6ad43b506265ee8c9c8617a87eba5a041632bd..cfd437021b953942535a880e4ce6ee41edb932d6 100644 --- a/python/paddle/incubate/hapi/datasets/imikolov.py +++ b/python/paddle/text/datasets/imikolov.py @@ -20,7 +20,7 @@ import numpy as np import collections from paddle.io import Dataset -from .utils import _check_exists_and_download +from paddle.dataset.common import _check_exists_and_download __all__ = ['Imikolov'] @@ -49,28 +49,28 @@ class Imikolov(Dataset): .. code-block:: python - import paddle - from paddle.incubate.hapi.datasets import Imikolov + import paddle + from paddle.text.datasets import Imikolov - class SimpleNet(paddle.nn.Layer): - def __init__(self): - super(SimpleNet, self).__init__() + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() - def forward(self, src, trg): - return paddle.sum(src), paddle.sum(trg) + def forward(self, src, trg): + return paddle.sum(src), paddle.sum(trg) - paddle.disable_static() + paddle.disable_static() - imikolov = Imikolov(mode='train', data_type='SEQ', window_size=2) + imikolov = Imikolov(mode='train', data_type='SEQ', window_size=2) - for i in range(10): - src, trg = imikolov[i] - src = paddle.to_tensor(src) - trg = paddle.to_tensor(trg) + for i in range(10): + src, trg = imikolov[i] + src = paddle.to_tensor(src) + trg = paddle.to_tensor(trg) - model = SimpleNet() - src, trg = model(src, trg) - print(src.numpy().shape, trg.numpy().shape) + model = SimpleNet() + src, trg = model(src, trg) + print(src.numpy().shape, trg.numpy().shape) """ diff --git a/python/paddle/incubate/hapi/datasets/movie_reviews.py b/python/paddle/text/datasets/movie_reviews.py similarity index 80% rename from python/paddle/incubate/hapi/datasets/movie_reviews.py rename to python/paddle/text/datasets/movie_reviews.py index 7bf0684ebcd315807b9dc736c5481383073e5ba8..db5b15654f96712abc842ca0c99654c1b7378808 100644 --- a/python/paddle/incubate/hapi/datasets/movie_reviews.py +++ b/python/paddle/text/datasets/movie_reviews.py @@ -54,28 +54,28 @@ class MovieReviews(Dataset): .. 
code-block:: python - import paddle - from paddle.incubate.hapi.datasets import MovieReviews + import paddle + from paddle.text.datasets import MovieReviews - class SimpleNet(paddle.nn.Layer): - def __init__(self): - super(SimpleNet, self).__init__() + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() - def forward(self, word, category): - return paddle.sum(word), category + def forward(self, word, category): + return paddle.sum(word), category - paddle.disable_static() + paddle.disable_static() - movie_reviews = MovieReviews(mode='train') + movie_reviews = MovieReviews(mode='train') - for i in range(10): - word_list, category = movie_reviews[i] - word_list = paddle.to_tensor(word_list) - category = paddle.to_tensor(category) + for i in range(10): + word_list, category = movie_reviews[i] + word_list = paddle.to_tensor(word_list) + category = paddle.to_tensor(category) - model = SimpleNet() - word_list, category = model(word_list, category) - print(word_list.numpy().shape, category.numpy()) + model = SimpleNet() + word_list, category = model(word_list, category) + print(word_list.numpy().shape, category.numpy()) """ @@ -91,10 +91,10 @@ class MovieReviews(Dataset): def _get_word_dict(self): """ - Sorted the words by the frequency of words which occur in sample - :return: - words_freq_sorted - """ + Sorted the words by the frequency of words which occur in sample + :return: + words_freq_sorted + """ words_freq_sorted = list() word_freq_dict = collections.defaultdict(int) @@ -110,10 +110,10 @@ class MovieReviews(Dataset): def _sort_files(self): """ - Sorted the sample for cross reading the sample - :return: - files_list - """ + Sorted the sample for cross reading the sample + :return: + files_list + """ files_list = list() neg_file_list = movie_reviews.fileids('neg') pos_file_list = movie_reviews.fileids('pos') @@ -123,10 +123,10 @@ class MovieReviews(Dataset): def _load_sentiment_data(self): """ - Load the data set - :return: - data_set - """ + Load the data set + :return: + data_set + """ self.data = [] words_ids = dict(self._get_word_dict()) for sample_file in self._sort_files(): @@ -138,8 +138,8 @@ class MovieReviews(Dataset): def _download_data_if_not_yet(self): """ - Download the data set, if the data set is not download. - """ + Download the data set, if the data set is not download. + """ try: # download and extract movie_reviews.zip paddle.dataset.common.download( diff --git a/python/paddle/incubate/hapi/datasets/movielens.py b/python/paddle/text/datasets/movielens.py similarity index 87% rename from python/paddle/incubate/hapi/datasets/movielens.py rename to python/paddle/text/datasets/movielens.py index 228e9dc6d477cf539683963dc6ddaa3c02c8fe95..75b59cfbb0d8177e0ced784904962ef777b289cb 100644 --- a/python/paddle/incubate/hapi/datasets/movielens.py +++ b/python/paddle/text/datasets/movielens.py @@ -24,7 +24,7 @@ import six import paddle from paddle.io import Dataset import paddle.compat as cpt -from .utils import _check_exists_and_download +from paddle.dataset.common import _check_exists_and_download __all__ = ['Movielens'] @@ -106,29 +106,29 @@ class Movielens(Dataset): .. 
code-block:: python - import paddle - from paddle.incubate.hapi.datasets import Movielens + import paddle + from paddle.text.datasets import Movielens - class SimpleNet(paddle.nn.Layer): - def __init__(self): - super(SimpleNet, self).__init__() + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() - def forward(self, category, title, rating): - return paddle.sum(category), paddle.sum(title), paddle.sum(rating) + def forward(self, category, title, rating): + return paddle.sum(category), paddle.sum(title), paddle.sum(rating) - paddle.disable_static() + paddle.disable_static() - movielens = Movielens(mode='train') + movielens = Movielens(mode='train') - for i in range(10): - category, title, rating = movielens[i][-3:] - category = paddle.to_tensor(category) - title = paddle.to_tensor(title) - rating = paddle.to_tensor(rating) + for i in range(10): + category, title, rating = movielens[i][-3:] + category = paddle.to_tensor(category) + title = paddle.to_tensor(title) + rating = paddle.to_tensor(rating) - model = SimpleNet() - category, title, rating = model(category, title, rating) - print(category.numpy().shape, title.numpy().shape, rating.numpy().shape) + model = SimpleNet() + category, title, rating = model(category, title, rating) + print(category.numpy().shape, title.numpy().shape, rating.numpy().shape) """ diff --git a/python/paddle/incubate/hapi/datasets/uci_housing.py b/python/paddle/text/datasets/uci_housing.py similarity index 78% rename from python/paddle/incubate/hapi/datasets/uci_housing.py rename to python/paddle/text/datasets/uci_housing.py index c1f2c4a5bb5d9d60ba1316e3e2a5f174df94fe99..a0d465eb1775431ffa0527dfae8031bebd6fc340 100644 --- a/python/paddle/incubate/hapi/datasets/uci_housing.py +++ b/python/paddle/text/datasets/uci_housing.py @@ -17,9 +17,8 @@ from __future__ import print_function import six import numpy as np -import paddle.dataset.common from paddle.io import Dataset -from .utils import _check_exists_and_download +from paddle.dataset.common import _check_exists_and_download __all__ = ["UCIHousing"] @@ -50,28 +49,28 @@ class UCIHousing(Dataset): .. 
code-block:: python - import paddle - from paddle.incubate.hapi.datasets import UCIHousing + import paddle + from paddle.text.datasets import UCIHousing - class SimpleNet(paddle.nn.Layer): - def __init__(self): - super(SimpleNet, self).__init__() + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() - def forward(self, feature, target): - return paddle.sum(feature), target + def forward(self, feature, target): + return paddle.sum(feature), target - paddle.disable_static() + paddle.disable_static() - uci_housing = UCIHousing(mode='train') + uci_housing = UCIHousing(mode='train') - for i in range(10): - feature, target = uci_housing[i] - feature = paddle.to_tensor(feature) - target = paddle.to_tensor(target) + for i in range(10): + feature, target = uci_housing[i] + feature = paddle.to_tensor(feature) + target = paddle.to_tensor(target) - model = SimpleNet() - feature, target = model(feature, target) - print(feature.numpy().shape, target.numpy()) + model = SimpleNet() + feature, target = model(feature, target) + print(feature.numpy().shape, target.numpy()) """ diff --git a/python/paddle/incubate/hapi/datasets/wmt14.py b/python/paddle/text/datasets/wmt14.py similarity index 78% rename from python/paddle/incubate/hapi/datasets/wmt14.py rename to python/paddle/text/datasets/wmt14.py index b495ea931a80425b8e24b81cdf8fdfd2c0920a3e..36cb6dfd3e5b7652da3e4e9233dd5b16076a53b6 100644 --- a/python/paddle/incubate/hapi/datasets/wmt14.py +++ b/python/paddle/text/datasets/wmt14.py @@ -20,7 +20,7 @@ import gzip from paddle.io import Dataset import paddle.compat as cpt -from .utils import _check_exists_and_download +from paddle.dataset.common import _check_exists_and_download __all__ = ['WMT14'] @@ -60,29 +60,29 @@ class WMT14(Dataset): .. code-block:: python - import paddle - from paddle.incubate.hapi.datasets import WMT14 + import paddle + from paddle.text.datasets import WMT14 - class SimpleNet(paddle.nn.Layer): - def __init__(self): - super(SimpleNet, self).__init__() + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() - def forward(self, src_ids, trg_ids, trg_ids_next): - return paddle.sum(src_ids), paddle.sum(trg_ids), paddle.sum(trg_ids_next) + def forward(self, src_ids, trg_ids, trg_ids_next): + return paddle.sum(src_ids), paddle.sum(trg_ids), paddle.sum(trg_ids_next) - paddle.disable_static() + paddle.disable_static() - wmt14 = WMT14(mode='train', dict_size=50) + wmt14 = WMT14(mode='train', dict_size=50) - for i in range(10): - src_ids, trg_ids, trg_ids_next = wmt14[i] - src_ids = paddle.to_tensor(src_ids) - trg_ids = paddle.to_tensor(trg_ids) - trg_ids_next = paddle.to_tensor(trg_ids_next) + for i in range(10): + src_ids, trg_ids, trg_ids_next = wmt14[i] + src_ids = paddle.to_tensor(src_ids) + trg_ids = paddle.to_tensor(trg_ids) + trg_ids_next = paddle.to_tensor(trg_ids_next) - model = SimpleNet() - src_ids, trg_ids, trg_ids_next = model(src_ids, trg_ids, trg_ids_next) - print(src_ids.numpy(), trg_ids.numpy(), trg_ids_next.numpy()) + model = SimpleNet() + src_ids, trg_ids, trg_ids_next = model(src_ids, trg_ids, trg_ids_next) + print(src_ids.numpy(), trg_ids.numpy(), trg_ids_next.numpy()) """ @@ -173,6 +173,25 @@ class WMT14(Dataset): return len(self.src_ids) def get_dict(self, reverse=False): + """ + Get the source and target dictionary. + + Args: + reverse (bool): wether to reverse key and value in dictionary, + i.e. key: value to value: key. + + Returns: + Two dictionaries, the source and target dictionary. 
+ + Examples: + + .. code-block:: python + + from paddle.text.datasets import WMT14 + wmt14 = WMT14(mode='train', dict_size=50) + src_dict, trg_dict = wmt14.get_dict() + """ + src_dict, trg_dict = self.src_dict, self.trg_dict if reverse: src_dict = {v: k for k, v in six.iteritems(src_dict)} trg_dict = {v: k for k, v in six.iteritems(trg_dict)} diff --git a/python/paddle/incubate/hapi/datasets/wmt16.py b/python/paddle/text/datasets/wmt16.py similarity index 79% rename from python/paddle/incubate/hapi/datasets/wmt16.py rename to python/paddle/text/datasets/wmt16.py index 6d3cb8bfacadd15f6c0f973a09dbf544bbc396c0..03a62e9347035101f77cec971c32164b97dd844f 100644 --- a/python/paddle/incubate/hapi/datasets/wmt16.py +++ b/python/paddle/text/datasets/wmt16.py @@ -25,7 +25,7 @@ from collections import defaultdict import paddle from paddle.io import Dataset import paddle.compat as cpt -from .utils import _check_exists_and_download +from paddle.dataset.common import _check_exists_and_download __all__ = ['WMT16'] @@ -77,29 +77,29 @@ class WMT16(Dataset): .. code-block:: python - import paddle - from paddle.incubate.hapi.datasets import WMT16 + import paddle + from paddle.text.datasets import WMT16 - class SimpleNet(paddle.nn.Layer): - def __init__(self): - super(SimpleNet, self).__init__() + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() - def forward(self, src_ids, trg_ids, trg_ids_next): - return paddle.sum(src_ids), paddle.sum(trg_ids), paddle.sum(trg_ids_next) + def forward(self, src_ids, trg_ids, trg_ids_next): + return paddle.sum(src_ids), paddle.sum(trg_ids), paddle.sum(trg_ids_next) - paddle.disable_static() + paddle.disable_static() - wmt16 = WMT16(mode='train', src_dict_size=50, trg_dict_size=50) + wmt16 = WMT16(mode='train', src_dict_size=50, trg_dict_size=50) - for i in range(10): - src_ids, trg_ids, trg_ids_next = wmt16[i] - src_ids = paddle.to_tensor(src_ids) - trg_ids = paddle.to_tensor(trg_ids) - trg_ids_next = paddle.to_tensor(trg_ids_next) + for i in range(10): + src_ids, trg_ids, trg_ids_next = wmt16[i] + src_ids = paddle.to_tensor(src_ids) + trg_ids = paddle.to_tensor(trg_ids) + trg_ids_next = paddle.to_tensor(trg_ids_next) - model = SimpleNet() - src_ids, trg_ids, trg_ids_next = model(src_ids, trg_ids, trg_ids_next) - print(src_ids.numpy(), trg_ids.numpy(), trg_ids_next.numpy()) + model = SimpleNet() + src_ids, trg_ids, trg_ids_next = model(src_ids, trg_ids, trg_ids_next) + print(src_ids.numpy(), trg_ids.numpy(), trg_ids_next.numpy()) """ @@ -222,21 +222,29 @@ class WMT16(Dataset): def get_dict(self, lang, reverse=False): """ - return the word dictionary for the specified language. - - Args: - lang(string): A string indicating which language is the source - language. Available options are: "en" for English - and "de" for Germany. - reverse(bool): If reverse is set to False, the returned python - dictionary will use word as key and use index as value. - If reverse is set to True, the returned python - dictionary will use index as key and word as value. - - Returns: - dict: The word dictionary for the specific language. - """ + return the word dictionary for the specified language. + + Args: + lang(string): A string indicating which language is the source + language. Available options are: "en" for English + and "de" for Germany. + reverse(bool): If reverse is set to False, the returned python + dictionary will use word as key and use index as value. 
+ If reverse is set to True, the returned python + dictionary will use index as key and word as value. + + Returns: + dict: The word dictionary for the specific language. + + Examples: + + .. code-block:: python + + from paddle.text.datasets import WMT16 + wmt16 = WMT16(mode='train', src_dict_size=50, trg_dict_size=50) + en_dict = wmt16.get_dict('en') + """ dict_size = self.src_dict_size if lang == self.lang else self.trg_dict_size dict_path = os.path.join(paddle.dataset.common.DATA_HOME, @@ -244,4 +252,4 @@ class WMT16(Dataset): assert os.path.exists(dict_path), "Word dictionary does not exist. " "Please invoke paddle.dataset.wmt16.train/test/validation first " "to build the dictionary." - return _load_dict(lang, dict_size) + return self._load_dict(lang, dict_size) diff --git a/python/paddle/incubate/hapi/text/text.py b/python/paddle/text/text.py similarity index 98% rename from python/paddle/incubate/hapi/text/text.py rename to python/paddle/text/text.py index a2940fbe6cf483bce905c596a4b50294129fab54..a0fa4791c5b1ca3dd5cfe85b03f6db9353803ba9 100644 --- a/python/paddle/incubate/hapi/text/text.py +++ b/python/paddle/text/text.py @@ -227,7 +227,7 @@ class BasicLSTMCell(RNNCell): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import BasicLSTMCell, RNN + from paddle.text import BasicLSTMCell, RNN inputs = paddle.rand((2, 4, 32)) cell = BasicLSTMCell(input_size=32, hidden_size=64) @@ -358,7 +358,7 @@ class BasicGRUCell(RNNCell): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import BasicGRUCell, RNN + from paddle.text import BasicGRUCell, RNN inputs = paddle.rand((2, 4, 32)) cell = BasicGRUCell(input_size=32, hidden_size=64) @@ -495,7 +495,7 @@ class RNN(Layer): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import StackedLSTMCell, RNN + from paddle.text import StackedLSTMCell, RNN inputs = paddle.rand((2, 4, 32)) cell = StackedLSTMCell(input_size=32, hidden_size=64) @@ -648,7 +648,7 @@ class StackedRNNCell(RNNCell): .. code-block:: python - from paddle.incubate.hapi.text import BasicLSTMCell, StackedRNNCell + from paddle.text import BasicLSTMCell, StackedRNNCell cells = [BasicLSTMCell(32, 32), BasicLSTMCell(32, 32)] stack_rnn = StackedRNNCell(cells) @@ -789,7 +789,7 @@ class StackedLSTMCell(RNNCell): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import StackedLSTMCell, RNN + from paddle.text import StackedLSTMCell, RNN inputs = paddle.rand((2, 4, 32)) cell = StackedLSTMCell(input_size=32, hidden_size=64) @@ -948,7 +948,7 @@ class LSTM(Layer): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import LSTM + from paddle.text import LSTM inputs = paddle.rand((2, 4, 32)) lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) @@ -1023,7 +1023,7 @@ class BidirectionalRNN(Layer): .. 
code-block:: python import paddle - from paddle.incubate.hapi.text import StackedLSTMCell, BidirectionalRNN + from paddle.text import StackedLSTMCell, BidirectionalRNN inputs = paddle.rand((2, 4, 32)) cell_fw = StackedLSTMCell(32, 64) @@ -1215,7 +1215,7 @@ class BidirectionalLSTM(Layer): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import BidirectionalLSTM + from paddle.text import BidirectionalLSTM inputs = paddle.rand((2, 4, 32)) bi_lstm = BidirectionalLSTM(input_size=32, hidden_size=64, num_layers=2) @@ -1384,7 +1384,7 @@ class StackedGRUCell(RNNCell): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import StackedGRUCell, RNN + from paddle.text import StackedGRUCell, RNN inputs = paddle.rand((2, 4, 32)) cell = StackedGRUCell(input_size=32, hidden_size=64) @@ -1524,7 +1524,7 @@ class GRU(Layer): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import GRU + from paddle.text import GRU inputs = paddle.rand((2, 4, 32)) gru = GRU(input_size=32, hidden_size=64, num_layers=2) @@ -1644,7 +1644,7 @@ class BidirectionalGRU(Layer): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import BidirectionalGRU + from paddle.text import BidirectionalGRU inputs = paddle.rand((2, 4, 32)) bi_gru = BidirectionalGRU(input_size=32, hidden_size=64, num_layers=2) @@ -1802,7 +1802,7 @@ class DynamicDecode(Layer): import paddle import paddle.fluid as fluid from paddle.fluid.layers import BeamSearchDecoder - from paddle.incubate.hapi.text import StackedLSTMCell, DynamicDecode + from paddle.text import StackedLSTMCell, DynamicDecode paddle.disable_static() @@ -2033,7 +2033,7 @@ class Conv1dPoolLayer(Layer): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import Conv1dPoolLayer + from paddle.text import Conv1dPoolLayer # input: [batch_size, num_channels, sequence_length] input = paddle.rand((2, 32, 4)) @@ -2162,7 +2162,7 @@ class CNNEncoder(Layer): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import CNNEncoder + from paddle.text import CNNEncoder # input: [batch_size, num_channels, sequence_length] input = paddle.rand((2, 32, 8)) @@ -2273,10 +2273,10 @@ class TransformerCell(RNNCell): import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import Embedding, Linear - from paddle.incubate.hapi.text import TransformerDecoder - from paddle.incubate.hapi.text import TransformerCell - from paddle.incubate.hapi.text import TransformerBeamSearchDecoder - from paddle.incubate.hapi.text import DynamicDecode + from paddle.text import TransformerDecoder + from paddle.text import TransformerCell + from paddle.text import TransformerBeamSearchDecoder + from paddle.text import DynamicDecode paddle.disable_static() @@ -2440,10 +2440,10 @@ class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import Embedding, Linear - from paddle.incubate.hapi.text import TransformerDecoder - from paddle.incubate.hapi.text import TransformerCell - from paddle.incubate.hapi.text import TransformerBeamSearchDecoder - from paddle.incubate.hapi.text import DynamicDecode + from paddle.text import TransformerDecoder + from paddle.text import TransformerCell + from paddle.text import TransformerBeamSearchDecoder + from paddle.text import DynamicDecode paddle.disable_static() @@ -2627,7 +2627,7 @@ class PrePostProcessLayer(Layer): import paddle import paddle.fluid as fluid - from 
paddle.incubate.hapi.text import PrePostProcessLayer + from paddle.text import PrePostProcessLayer # input: [batch_size, sequence_length, d_model] x = paddle.rand((2, 4, 32)) @@ -2709,7 +2709,7 @@ class MultiHeadAttention(Layer): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import MultiHeadAttention + from paddle.text import MultiHeadAttention # encoder input: [batch_size, sequence_length, d_model] query = paddle.rand((2, 4, 128)) @@ -2917,7 +2917,7 @@ class FFN(Layer): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import FFN + from paddle.text import FFN # input: [batch_size, sequence_length, d_model] x = paddle.rand((2, 4, 32)) @@ -2992,7 +2992,7 @@ class TransformerEncoderLayer(Layer): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import TransformerEncoderLayer + from paddle.text import TransformerEncoderLayer # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) @@ -3095,7 +3095,7 @@ class TransformerEncoder(Layer): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import TransformerEncoder + from paddle.text import TransformerEncoder # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) @@ -3206,7 +3206,7 @@ class TransformerDecoderLayer(Layer): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import TransformerDecoderLayer + from paddle.text import TransformerDecoderLayer # decoder input: [batch_size, trg_len, d_model] dec_input = paddle.rand((2, 4, 128)) @@ -3348,7 +3348,7 @@ class TransformerDecoder(Layer): import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import TransformerDecoder + from paddle.text import TransformerDecoder # decoder input: [batch_size, trg_len, d_model] dec_input = paddle.rand((2, 4, 128)) @@ -3561,7 +3561,7 @@ class LinearChainCRF(Layer): import numpy as np import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import LinearChainCRF + from paddle.text import LinearChainCRF # emission: [batch_size, sequence_length, num_tags] emission = paddle.rand((2, 8, 5)) @@ -3689,7 +3689,7 @@ class CRFDecoding(Layer): import numpy as np import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import CRFDecoding + from paddle.text import CRFDecoding # emission: [batch_size, sequence_length, num_tags] emission = paddle.rand((2, 8, 5)) @@ -3858,7 +3858,7 @@ class SequenceTagging(Layer): import numpy as np import paddle import paddle.fluid as fluid - from paddle.incubate.hapi.text import SequenceTagging + from paddle.text import SequenceTagging # word: [batch_size, sequence_length] # dummy input just for example diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index 85d0e133fa406df414977c0d69e0537ab5833891..f6299980b3e5c0bd0c7551b6b51c9b067d7960b5 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -17,8 +17,9 @@ from .profiler import ProfilerOptions from .profiler import Profiler from .profiler import get_profiler from .deprecated import deprecated +from . 
import download -__all__ = ['dump_config', 'Ploter', 'deprecated'] +__all__ = ['dump_config', 'Ploter', 'deprecated', 'download'] #TODO: define new api under this directory # __all__ = ['unique_name', diff --git a/python/paddle/incubate/hapi/download.py b/python/paddle/utils/download.py similarity index 99% rename from python/paddle/incubate/hapi/download.py rename to python/paddle/utils/download.py index 9d935e48995742ca8dfadce79cb2ce7395051a29..d8c0a2fc8c28450108a01c57fb1d2c1f7303101c 100644 --- a/python/paddle/incubate/hapi/download.py +++ b/python/paddle/utils/download.py @@ -26,7 +26,6 @@ import tarfile import zipfile import time from collections import OrderedDict -from paddle.fluid.dygraph.parallel import ParallelEnv try: from tqdm import tqdm @@ -156,6 +155,9 @@ def get_path_from_url(url, root_dir, md5sum=None, check_exist=True): Returns: str: a local path to save downloaded models & weights & datasets. """ + + from paddle.fluid.dygraph.parallel import ParallelEnv + assert is_url(url), "downloading from {} not a url".format(url) # parse path after download to decompress under root_dir fullpath = _map_path(url, root_dir) diff --git a/python/paddle/utils/plotcurve.py b/python/paddle/utils/plotcurve.py deleted file mode 100644 index 9c298acf01db66459ca163bf1297f8c7d2be6cb0..0000000000000000000000000000000000000000 --- a/python/paddle/utils/plotcurve.py +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/python -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Plot training and testing curve from paddle log. - -It takes input from a file or stdin, and output to a file or stdout. - -Note: must have numpy and matplotlib installed in order to use this tool. - -usage: Plot training and testing curves from paddle log file. - [-h] [-i INPUT] [-o OUTPUT] [--format FORMAT] [key [key ...]] - -positional arguments: - key keys of scores to plot, the default will be AvgCost - -optional arguments: - -h, --help show this help message and exit - -i INPUT, --input INPUT - input filename of paddle log, default will be standard - input - -o OUTPUT, --output OUTPUT - output filename of figure, default will be standard - output - --format FORMAT figure format(png|pdf|ps|eps|svg) - - -The keys must be in the order of paddle output(!!!). - -For example, paddle.INFO contains the following log - I0406 21:26:21.325584 3832 Trainer.cpp:601] Pass=0 Batch=7771 AvgCost=0.624935 Eval: error=0.260972 - -To use this script to generate plot for AvgCost, error: - python plotcurve.py -i paddle.INFO -o figure.png AvgCost error -""" - -import six -import sys -import matplotlib -# the following line is added immediately after import matplotlib -# and before import pylot. The purpose is to ensure the plotting -# works even under remote login (i.e. 
headless display) -matplotlib.use('Agg') -from matplotlib import cm -import matplotlib.pyplot as pyplot -import numpy -import argparse -import re -import os - - -def plot_paddle_curve(keys, inputfile, outputfile, format='png', - show_fig=False): - """Plot curves from paddle log and save to outputfile. - - :param keys: a list of strings to be plotted, e.g. AvgCost - :param inputfile: a file object for input - :param outputfile: a file object for output - :return: None - """ - pass_pattern = r"Pass=([0-9]*)" - test_pattern = r"Test samples=([0-9]*)" - if not keys: - keys = ['AvgCost'] - for k in keys: - pass_pattern += r".*?%s=([0-9e\-\.]*)" % k - test_pattern += r".*?%s=([0-9e\-\.]*)" % k - data = [] - test_data = [] - compiled_pattern = re.compile(pass_pattern) - compiled_test_pattern = re.compile(test_pattern) - for line in inputfile: - found = compiled_pattern.search(line) - found_test = compiled_test_pattern.search(line) - if found: - data.append([float(x) for x in found.groups()]) - if found_test: - test_data.append([float(x) for x in found_test.groups()]) - x = numpy.array(data) - x_test = numpy.array(test_data) - if x.shape[0] <= 0: - sys.stderr.write("No data to plot. Exiting!\n") - return - m = len(keys) + 1 - for i in six.moves.xrange(1, m): - pyplot.plot( - x[:, 0], - x[:, i], - color=cm.jet(1.0 * (i - 1) / (2 * m)), - label=keys[i - 1]) - if (x_test.shape[0] > 0): - pyplot.plot( - x[:, 0], - x_test[:, i], - color=cm.jet(1.0 - 1.0 * (i - 1) / (2 * m)), - label="Test " + keys[i - 1]) - pyplot.xlabel('number of epoch') - pyplot.legend(loc='best') - if show_fig: - pyplot.show() - pyplot.savefig(outputfile, bbox_inches='tight') - pyplot.clf() - - -def main(argv): - """ - main method of plotting curves. - """ - cmdparser = argparse.ArgumentParser( - "Plot training and testing curves from paddle log file.") - cmdparser.add_argument( - 'key', nargs='*', help='keys of scores to plot, the default is AvgCost') - cmdparser.add_argument( - '-i', - '--input', - help='input filename of paddle log, ' - 'default will be standard input') - cmdparser.add_argument( - '-o', - '--output', - help='output filename of figure, ' - 'default will be standard output') - cmdparser.add_argument('--format', help='figure format(png|pdf|ps|eps|svg)') - args = cmdparser.parse_args(argv) - keys = args.key - if args.input: - inputfile = open(args.input) - else: - inputfile = sys.stdin - format = args.format - if args.output: - outputfile = open(args.output, 'wb') - if not format: - format = os.path.splitext(args.output)[1] - if not format: - format = 'png' - else: - outputfile = sys.stdout - plot_paddle_curve(keys, inputfile, outputfile, format) - inputfile.close() - outputfile.close() - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/python/paddle/utils/preprocess_img.py b/python/paddle/utils/preprocess_img.py deleted file mode 100644 index e54393fa4a029a510699e3e2bafef9f4d78c51e0..0000000000000000000000000000000000000000 --- a/python/paddle/utils/preprocess_img.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import os -import random -import numpy as np -import PIL.Image as Image -from six.moves import cStringIO as StringIO -from . import preprocess_util -from .image_util import crop_img - - -def resize_image(img, target_size): - """ - Resize an image so that the shorter edge has length target_size. - img: the input image to be resized. - target_size: the target resized image size. - """ - percent = (target_size / float(min(img.size[0], img.size[1]))) - resized_size = int(round(img.size[0] * percent)),\ - int(round(img.size[1] * percent)) - img = img.resize(resized_size, Image.ANTIALIAS) - return img - - -class DiskImage: - """ - A class of image data on disk. - """ - - def __init__(self, path, target_size): - """ - path: path of the image. - target_size: target resize size. - """ - self.path = path - self.target_size = target_size - self.img = None - pass - - def read_image(self): - if self.img is None: - print("reading: " + self.path) - image = resize_image(Image.open(self.path), self.target_size) - self.img = image - - def convert_to_array(self): - self.read_image() - np_array = np.array(self.img) - if len(np_array.shape) == 3: - np_array = np.swapaxes(np_array, 1, 2) - np_array = np.swapaxes(np_array, 1, 0) - return np_array - - def convert_to_paddle_format(self): - """ - convert the image into the paddle batch format. - """ - self.read_image() - output = StringIO() - self.img.save(output, "jpeg") - contents = output.getvalue() - return contents - - -class ImageClassificationDatasetCreater(preprocess_util.DatasetCreater): - """ - A class to process data for image classification. - """ - - def __init__(self, data_path, target_size, color=True): - """ - data_path: the path to store the training data and batches. - target_size: processed image size in a batch. - color: whether to use color images. - """ - preprocess_util.DatasetCreater.__init__(self, data_path) - self.target_size = target_size - self.color = color - self.keys = ["images", "labels"] - self.permute_key = "labels" - - def create_meta_file(self, data): - """ - Create a meta file for image classification. - The meta file contains the meam image, as well as some configs. - data: the training Dataaet. 
- """ - output_path = os.path.join(self.data_path, self.batch_dir_name, - self.meta_filename) - if self.color: - mean_img = np.zeros((3, self.target_size, self.target_size)) - else: - mean_img = np.zeros((self.target_size, self.target_size)) - for d in data.data: - img = d[0].convert_to_array() - cropped_img = crop_img(img, self.target_size, self.color) - mean_img += cropped_img - mean_img /= len(data.data) - mean_img = mean_img.astype('int32').flatten() - preprocess_util.save_file({ - "data_mean": mean_img, - "image_size": self.target_size, - "mean_image_size": self.target_size, - "num_classes": self.num_classes, - "color": self.color - }, output_path) - pass - - def create_dataset_from_list(self, path): - data = [] - label_set = [] - for line in open(path): - items = line.rstrip.split() - image_path = items[0] - label_name = items[1] - if not label_name in label_set: - label_set[label_name] = len(list(label_set.keys())) - img = DiskImage(path=image_path, target_size=self.target_size) - label = preprocess_util.Lablel( - label=label_set[label_name], name=label_name) - return preprocess_util.Dataset(data, self.keys), label_set - - def create_dataset_from_dir(self, path): - """ - Create a Dataset object for image classification. - Each folder in the path directory corresponds to a set of images of - this label, and the name of the folder is the name of the - path: the path of the image dataset. - """ - if self.from_list: - return self.create_dataset_from_list(path) - label_set = preprocess_util.get_label_set_from_dir(path) - data = [] - for l_name in list(label_set.keys()): - image_paths = preprocess_util.list_images( - os.path.join(path, l_name)) - for p in image_paths: - img = DiskImage(path=p, target_size=self.target_size) - label = preprocess_util.Label( - label=label_set[l_name], name=l_name) - data.append((img, label)) - random.shuffle(data) - return preprocess_util.Dataset(data, self.keys), label_set diff --git a/python/paddle/utils/preprocess_util.py b/python/paddle/utils/preprocess_util.py deleted file mode 100644 index 471cb07c84bc31a34d659e9ccc8bdd57442b8489..0000000000000000000000000000000000000000 --- a/python/paddle/utils/preprocess_util.py +++ /dev/null @@ -1,362 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import math -import six.moves.cPickle as pickle -import random -import collections - - -def save_file(data, filename): - """ - Save data into pickle format. - data: the data to save. - filename: the output filename. - """ - pickle.dump(data, open(filename, 'wb'), protocol=pickle.HIGHEST_PROTOCOL) - - -def save_list(l, outfile): - """ - Save a list of string into a text file. There is one line for each string. - l: the list of string to save - outfile: the output file - """ - open(outfile, "w").write("\n".join(l)) - - -def exclude_pattern(f): - """ - Return whether f is in the exclude pattern. - Exclude the files that starts with . or ends with ~. 
- """ - return f.startswith(".") or f.endswith("~") - - -def list_dirs(path): - """ - Return a list of directories in path. Exclude all the directories that - start with '.'. - path: the base directory to search over. - """ - return [ - os.path.join(path, d) for d in next(os.walk(path))[1] - if not exclude_pattern(d) - ] - - -def list_images(path, exts=set(["jpg", "png", "bmp", "jpeg"])): - """ - Return a list of images in path. - path: the base directory to search over. - exts: the extensions of the images to find. - """ - return [os.path.join(path, d) for d in os.listdir(path) \ - if os.path.isfile(os.path.join(path, d)) and not exclude_pattern(d)\ - and os.path.splitext(d)[-1][1:] in exts] - - -def list_files(path): - """ - Return a list of files in path. - path: the base directory to search over. - exts: the extensions of the images to find. - """ - return [os.path.join(path, d) for d in os.listdir(path) \ - if os.path.isfile(os.path.join(path, d)) and not exclude_pattern(d)] - - -def get_label_set_from_dir(path): - """ - Return a dictionary of the labels and label ids from a path. - Assume each directory in the path corresponds to a unique label. - The keys of the dictionary is the label name. - The values of the dictionary is the label id. - """ - dirs = list_dirs(path) - return dict([(os.path.basename(d), i) for i, d in enumerate(sorted(dirs))]) - - -class Label: - """ - A class of label data. - """ - - def __init__(self, label, name): - """ - label: the id of the label. - name: the name of the label. - """ - self.label = label - self.name = name - - def convert_to_paddle_format(self): - """ - convert the image into the paddle batch format. - """ - return int(self.label) - - def __hash__(self): - return hash((self.label)) - - -class Dataset: - """ - A class to represent a dataset. A dataset contains a set of items. - Each item contains multiple slots of data. - For example: in image classification dataset, each item contains two slot, - The first slot is an image, and the second slot is a label. - """ - - def __init__(self, data, keys): - """ - data: a list of data. - Each data is a tuple containing multiple slots of data. - Each slot is an object with convert_to_paddle_format function. - keys: contains a list of keys for all the slots. - """ - self.data = data - self.keys = keys - - def check_valid(self): - for d in self.data: - assert (len(d) == len(self.keys)) - - def permute(self, key_id, num_per_batch): - """ - Permuate data for batching. It supports two types now: - 1. if key_id == None, the batching process is completely random. - 2. if key_id is not None. The batching process Permuate the data so that the key specified by key_id are - uniformly distributed in batches. See the comments of permute_by_key for details. - """ - if key_id is None: - self.uniform_permute() - else: - self.permute_by_key(key_id, num_per_batch) - - def uniform_permute(self): - """ - Permuate the data randomly. - """ - random.shuffle(self.data) - - def permute_by_key(self, key_id, num_per_batch): - """ - Permuate the data so that the key specified by key_id are - uniformly distributed in batches. - For example: if we have three labels, and the number of data - for each label are 100, 200, and 300, respectively. The number of batches is 4. - Then, the number of data for these labels is 25, 50, and 75. - """ - # Store the indices of the data that has the key value - # specified by key_id. 
- keyvalue_indices = collections.defaultdict(list) - for idx in range(len(self.data)): - keyvalue_indices[self.data[idx][key_id].label].append(idx) - for k in keyvalue_indices: - random.shuffle(keyvalue_indices[k]) - - num_data_per_key_batch = \ - math.ceil(num_per_batch / float(len(list(keyvalue_indices.keys())))) - - if num_data_per_key_batch < 2: - raise Exception("The number of data in a batch is too small") - - permuted_data = [] - keyvalue_readpointer = collections.defaultdict(int) - while len(permuted_data) < len(self.data): - for k in keyvalue_indices: - begin_idx = keyvalue_readpointer[k] - end_idx = int( - min(begin_idx + num_data_per_key_batch, - len(keyvalue_indices[k]))) - print("begin_idx, end_idx") - print(begin_idx, end_idx) - for idx in range(begin_idx, end_idx): - permuted_data.append(self.data[keyvalue_indices[k][idx]]) - keyvalue_readpointer[k] = end_idx - self.data = permuted_data - - -class DataBatcher: - """ - A class that is used to create batches for both training and testing - datasets. - """ - - def __init__(self, train_data, test_data, label_set): - """ - train_data, test_data: Each one is a dataset object representing - training and testing data, respectively. - label_set: a dictionary storing the mapping from label name to label id. - """ - self.train_data = train_data - self.test_data = test_data - self.label_set = label_set - self.num_per_batch = 5000 - assert (self.train_data.keys == self.test_data.keys) - - def create_batches_and_list(self, output_path, train_list_name, - test_list_name, label_set_name): - """ - Create batches for both training and testing objects. - It also create train.list and test.list to indicate the list - of the batch files for training and testing data, respectively. - """ - train_list = self.create_batches(self.train_data, output_path, "train_", - self.num_per_batch) - test_list = self.create_batches(self.test_data, output_path, "test_", - self.num_per_batch) - save_list(train_list, os.path.join(output_path, train_list_name)) - save_list(test_list, os.path.join(output_path, test_list_name)) - save_file(self.label_set, os.path.join(output_path, label_set_name)) - - def create_batches(self, - data, - output_path, - prefix="", - num_data_per_batch=5000): - """ - Create batches for a Dataset object. - data: the Dataset object to process. - output_path: the output path of the batches. - prefix: the prefix of each batch. - num_data_per_batch: number of data in each batch. - """ - num_batches = int(math.ceil(len(data.data) / float(num_data_per_batch))) - batch_names = [] - data.check_valid() - num_slots = len(data.keys) - for i in range(num_batches): - batch_name = os.path.join(output_path, prefix + "batch_%03d" % i) - out_data = dict([(k, []) for k in data.keys]) - begin_idx = i * num_data_per_batch - end_idx = min((i + 1) * num_data_per_batch, len(data.data)) - for j in range(begin_idx, end_idx): - for slot_id in range(num_slots): - out_data[data.keys[slot_id]].\ - append(data.data[j][slot_id].convert_to_paddle_format()) - save_file(out_data, batch_name) - batch_names.append(batch_name) - return batch_names - - -class DatasetCreater(object): - """ - A virtual class for creating datasets. - The derived class needs to implement the following methods: - - create_dataset() - - create_meta_file() - """ - - def __init__(self, data_path): - """ - data_path: the path to store the training data and batches. - train_dir_name: relative training data directory. - test_dir_name: relative testing data directory. 
- batch_dir_name: relative batch directory. - num_per_batch: the number of data in a batch. - meta_filename: the filename of the meta file. - train_list_name: training batch list name. - test_list_name: testing batch list name. - label_set: label set name. - overwrite: whether to overwrite the files if the batches are already in - the given path. - """ - self.data_path = data_path - self.train_dir_name = 'train' - self.test_dir_name = 'test' - self.batch_dir_name = 'batches' - self.num_per_batch = 50000 - self.meta_filename = "batches.meta" - self.train_list_name = "train.list" - self.test_list_name = "test.list" - self.label_set_name = "labels.pkl" - self.output_path = os.path.join(self.data_path, self.batch_dir_name) - self.overwrite = False - self.permutate_key = "labels" - self.from_list = False - - def create_meta_file(self, data): - """ - Create a meta file from training data. - data: training data given in a Dataset format. - """ - raise NotImplementedError - - def create_dataset(self, path): - """ - Create a data set object from a path. - It will use directory structure or a file list to determine dataset if - self.from_list is True. Otherwise, it will uses a file list to - determine the dataset. - path: the path of the dataset. - return a tuple of Dataset object, and a mapping from label set - to label id. - """ - if self.from_list: - return self.create_dataset_from_list(path) - else: - return self.create_dataset_from_dir(path) - - def create_dataset_from_list(self, path): - """ - Create a data set object from a path. - It will uses a file list to determine the dataset. - path: the path of the dataset. - return a tuple of Dataset object, and a mapping from label set - to label id - """ - raise NotImplementedError - - def create_dataset_from_dir(self, path): - """ - Create a data set object from a path. - It will use directory structure or a file list to determine dataset if - self.from_list is True. - path: the path of the dataset. - return a tuple of Dataset object, and a mapping from label set - to label id - """ - raise NotImplementedError - - def create_batches(self): - """ - create batches and meta file. - """ - train_path = os.path.join(self.data_path, self.train_dir_name) - test_path = os.path.join(self.data_path, self.test_dir_name) - out_path = os.path.join(self.data_path, self.batch_dir_name) - if not os.path.exists(out_path): - os.makedirs(out_path) - if (self.overwrite or not os.path.exists( - os.path.join(out_path, self.train_list_name))): - train_data, train_label_set = \ - self.create_dataset(train_path) - test_data, test_label_set = \ - self.create_dataset(test_path) - - train_data.permute( - self.keys.index(self.permutate_key), self.num_per_batch) - - assert (train_label_set == test_label_set) - data_batcher = DataBatcher(train_data, test_data, train_label_set) - data_batcher.num_per_batch = self.num_per_batch - data_batcher.create_batches_and_list( - self.output_path, self.train_list_name, self.test_list_name, - self.label_set_name) - self.num_classes = len(list(train_label_set.keys())) - self.create_meta_file(train_data) - return out_path diff --git a/python/paddle/utils/show_pb.py b/python/paddle/utils/show_pb.py deleted file mode 100644 index da7a71a665aea4d93d366e8508f438a9aba88e94..0000000000000000000000000000000000000000 --- a/python/paddle/utils/show_pb.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Show the content of proto buffer data file of PADDLE -""" - -from __future__ import print_function - -import os -import sys -from google.protobuf.internal.decoder import _DecodeVarint -import paddle.proto.DataFormat_pb2 as DataFormat - - -def read_proto(file, message): - """ - read a protobuffer struct from file, the length of the struct is stored as - a varint, then followed by the actual struct data. - @return True success, False for end of file - """ - - buf = file.read(8) - if not buf: - return False - result, pos = _DecodeVarint(buf, 0) - buf = buf[pos:] + file.read(result - len(buf) + pos) - message.ParseFromString(buf) - - return True - - -def usage(): - print("Usage: python show_pb.py PROTO_DATA_FILE", file=sys.stderr) - exit(1) - - -if __name__ == '__main__': - if len(sys.argv) < 2: - usage() - - f = open(sys.argv[1]) - header = DataFormat.DataHeader() - read_proto(f, header) - print(header) - - sample = DataFormat.DataSample() - while read_proto(f, sample): - print(sample) diff --git a/python/paddle/utils/torch2paddle.py b/python/paddle/utils/torch2paddle.py deleted file mode 100644 index 398d3aa4e02cc74b7885f7e676937d7fd254bc5e..0000000000000000000000000000000000000000 --- a/python/paddle/utils/torch2paddle.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert torch parameter file to paddle model files. - -Note: must have torchfile installed in order to use this tool. - -Usage: python torch2paddle.py -i torchfile.t7 -l layers.txt -o path/to/paddle_model -""" - -import os -import sys -import struct -import numpy as np -import torchfile -import six.moves.cPickle as pickle -import argparse - - -# save parameters -def save_layer_parameters(outfile, feats): - version = 0 - value_size = 4 - ret = "" - for feat in feats: - ret += feat.tostring() - size = len(ret) / 4 - fo = open(outfile, 'wb') - fo.write(struct.pack('iIQ', version, value_size, size)) - fo.write(ret) - fo.close() - - -def save_net_parameters(layers, params, output_path): - for i in range(len(layers)): - weight = params[i * 2] - biases = params[i * 2 + 1] - weight_file = os.path.join(output_path, '_%s.w0' % layers[i]) - biases_file = os.path.join(output_path, '_%s.wbias' % layers[i]) - print("Saving for layer %s." 
% layers[i]) - save_layer_parameters(weight_file, [weight]) - save_layer_parameters(biases_file, biases) - - -def load_layer_parameters(filename): - fn = open(filename, 'rb') - version, = struct.unpack('i', fn.read(4)) - value_length, = struct.unpack("I", fn.read(4)) - dtype = 'float32' if value_length == 4 else 'float64' - param_size, = struct.unpack("L", fn.read(8)) - value = np.fromfile(fn, dtype) - return value - - -def main(argv): - """ - main method of converting torch to paddle files. - :param argv: - :return: - """ - cmdparser = argparse.ArgumentParser( - "Convert torch parameter file to paddle model files.") - cmdparser.add_argument( - '-i', '--input', help='input filename of torch parameters') - cmdparser.add_argument('-l', '--layers', help='list of layer names') - cmdparser.add_argument( - '-o', '--output', help='output file path of paddle model') - - args = cmdparser.parse_args(argv) - if args.input and args.layers and args.output: - params = torchfile.load(args.input) - layers = [line.strip() for line in open(args.layers, 'r')] - save_net_parameters(layers, params, args.output) - else: - print( - 'Usage: python torch2paddle.py -i torchfile.t7 -l layers.txt -o path/to/paddle_model' - ) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/python/paddle/incubate/hapi/vision/__init__.py b/python/paddle/vision/__init__.py similarity index 87% rename from python/paddle/incubate/hapi/vision/__init__.py rename to python/paddle/vision/__init__.py index c9d65db18653bf842f5d95ccf472686d5b08c84d..7d28d567cefa2f0cf3ab4f7077d71ea27edc936a 100644 --- a/python/paddle/incubate/hapi/vision/__init__.py +++ b/python/paddle/vision/__init__.py @@ -13,9 +13,14 @@ # limitations under the License. from . import models -from . import transforms from .models import * + +from . import transforms from .transforms import * +from . import datasets +from .datasets import * + __all__ = models.__all__ \ - + transforms.__all__ + + transforms.__all__ \ + + datasets.__all__ diff --git a/python/paddle/incubate/hapi/datasets/utils.py b/python/paddle/vision/datasets/__init__.py similarity index 60% rename from python/paddle/incubate/hapi/datasets/utils.py rename to python/paddle/vision/datasets/__init__.py index 171f794ba9df4270727a23cc6cd039a9faa81970..6703aa4197603be2d82d930e3cd2622ff6b4cd77 100644 --- a/python/paddle/incubate/hapi/datasets/utils.py +++ b/python/paddle/vision/datasets/__init__.py @@ -12,18 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function +from . import folder +from . import mnist +from . import flowers +from . import cifar +from . 
import voc2012 -import os -import paddle.dataset.common +from .folder import * +from .mnist import * +from .flowers import * +from .cifar import * +from .voc2012 import * - -def _check_exists_and_download(path, url, md5, module_name, download=True): - if path and os.path.exists(path): - return path - - if download: - return paddle.dataset.common.download(url, module_name, md5) - else: - raise ValueError('{} not exists and auto download disabled'.format( - path)) +__all__ = folder.__all__ \ + + mnist.__all__ \ + + flowers.__all__ \ + + cifar.__all__ \ + + voc2012.__all__ diff --git a/python/paddle/incubate/hapi/datasets/cifar.py b/python/paddle/vision/datasets/cifar.py similarity index 67% rename from python/paddle/incubate/hapi/datasets/cifar.py rename to python/paddle/vision/datasets/cifar.py index adfa786e615368ba90dab154924678de79104b55..1193be26da56780058beadfe15640bc76533114a 100644 --- a/python/paddle/incubate/hapi/datasets/cifar.py +++ b/python/paddle/vision/datasets/cifar.py @@ -20,7 +20,7 @@ import six from six.moves import cPickle as pickle from paddle.io import Dataset -from .utils import _check_exists_and_download +from paddle.dataset.common import _check_exists_and_download __all__ = ['Cifar10', 'Cifar100'] @@ -58,33 +58,36 @@ class Cifar10(Dataset): .. code-block:: python - import paddle - from paddle.incubate.hapi.datasets import Cifar10 - from paddle.incubate.hapi.vision.transforms import Normalize + import paddle + import paddle.nn as nn + from paddle.vision.datasets import Cifar10 + from paddle.vision.transforms import Normalize - class SimpleNet(paddle.nn.Layer): - def __init__(self): - super(SimpleNet, self).__init__() - self.fc = paddle.nn.Linear(3072, 10, act='softmax') + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + self.fc = nn.Sequential( + nn.Linear(3072, 10), + nn.Softmax()) - def forward(self, image, label): - image = paddle.reshape(image, (3, -1)) - return self.fc(image), label + def forward(self, image, label): + image = paddle.reshape(image, (3, -1)) + return self.fc(image), label - paddle.disable_static() + paddle.disable_static() - normalize = Normalize(mean=[0.5, 0.5, 0.5], - std=[0.5, 0.5, 0.5]) - cifar10 = Cifar10(mode='train', transform=normalize) + normalize = Normalize(mean=[0.5, 0.5, 0.5], + std=[0.5, 0.5, 0.5]) + cifar10 = Cifar10(mode='train', transform=normalize) - for i in range(10): - image, label = cifar10[i] - image = paddle.to_tensor(image) - label = paddle.to_tensor(label) + for i in range(10): + image, label = cifar10[i] + image = paddle.to_tensor(image) + label = paddle.to_tensor(label) - model = SimpleNet() - image, label = model(image, label) - print(image.numpy().shape, label.numpy().shape) + model = SimpleNet() + image, label = model(image, label) + print(image.numpy().shape, label.numpy().shape) """ @@ -164,33 +167,36 @@ class Cifar100(Cifar10): .. 
code-block:: python - import paddle - from paddle.incubate.hapi.datasets import Cifar100 - from paddle.incubate.hapi.vision.transforms import Normalize + import paddle + import paddle.nn as nn + from paddle.vision.datasets import Cifar100 + from paddle.vision.transforms import Normalize - class SimpleNet(paddle.nn.Layer): - def __init__(self): - super(SimpleNet, self).__init__() - self.fc = paddle.nn.Linear(3072, 100, act='softmax') + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + self.fc = nn.Sequential( + nn.Linear(3072, 100), + nn.Softmax()) - def forward(self, image, label): - image = paddle.reshape(image, (3, -1)) - return self.fc(image), label + def forward(self, image, label): + image = paddle.reshape(image, (3, -1)) + return self.fc(image), label - paddle.disable_static() + paddle.disable_static() - normalize = Normalize(mean=[0.5, 0.5, 0.5], - std=[0.5, 0.5, 0.5]) - cifar100 = Cifar100(mode='train', transform=normalize) + normalize = Normalize(mean=[0.5, 0.5, 0.5], + std=[0.5, 0.5, 0.5]) + cifar100 = Cifar100(mode='train', transform=normalize) - for i in range(10): - image, label = cifar100[i] - image = paddle.to_tensor(image) - label = paddle.to_tensor(label) + for i in range(10): + image, label = cifar100[i] + image = paddle.to_tensor(image) + label = paddle.to_tensor(label) - model = SimpleNet() - image, label = model(image, label) - print(image.numpy().shape, label.numpy().shape) + model = SimpleNet() + image, label = model(image, label) + print(image.numpy().shape, label.numpy().shape) """ diff --git a/python/paddle/incubate/hapi/datasets/flowers.py b/python/paddle/vision/datasets/flowers.py similarity index 97% rename from python/paddle/incubate/hapi/datasets/flowers.py rename to python/paddle/vision/datasets/flowers.py index 141d2a53b577b8c9be9ac153a36c5b2fa51ded77..1c0f41123e2313d9db6f5e846d133ecdebc7f1af 100644 --- a/python/paddle/incubate/hapi/datasets/flowers.py +++ b/python/paddle/vision/datasets/flowers.py @@ -22,7 +22,7 @@ import scipy.io as scio from PIL import Image from paddle.io import Dataset -from .utils import _check_exists_and_download +from paddle.dataset.common import _check_exists_and_download __all__ = ["Flowers"] @@ -60,7 +60,7 @@ class Flowers(Dataset): ..
code-block:: python - from paddle.incubate.hapi.datasets import Flowers + from paddle.vision.datasets import Flowers flowers = Flowers(mode='test') diff --git a/python/paddle/incubate/hapi/datasets/folder.py b/python/paddle/vision/datasets/folder.py similarity index 98% rename from python/paddle/incubate/hapi/datasets/folder.py rename to python/paddle/vision/datasets/folder.py index 358e7681eb8e64364600732f0399e6b97f0d64e0..725fd9acafbab7b6adaf07139d02da8e2c9aaada 100644 --- a/python/paddle/incubate/hapi/datasets/folder.py +++ b/python/paddle/vision/datasets/folder.py @@ -94,7 +94,7 @@ class DatasetFolder(Dataset): import tempfile import shutil import numpy as np - from paddle.incubate.hapi.datasets import DatasetFolder + from paddle.vision.datasets import DatasetFolder def make_fake_dir(): data_dir = tempfile.mkdtemp() @@ -224,7 +224,7 @@ class ImageFolder(Dataset): import tempfile import shutil import numpy as np - from paddle.incubate.hapi.datasets import ImageFolder + from paddle.vision.datasets import ImageFolder def make_fake_dir(): data_dir = tempfile.mkdtemp() diff --git a/python/paddle/incubate/hapi/datasets/mnist.py b/python/paddle/vision/datasets/mnist.py similarity index 97% rename from python/paddle/incubate/hapi/datasets/mnist.py rename to python/paddle/vision/datasets/mnist.py index ed046e5a1d9bbcc33f3148c6ecde8a349e478cb0..a98561333921d182c0b3a3f486c90a94e79b6a3d 100644 --- a/python/paddle/incubate/hapi/datasets/mnist.py +++ b/python/paddle/vision/datasets/mnist.py @@ -19,9 +19,8 @@ import gzip import struct import numpy as np -import paddle.dataset.common from paddle.io import Dataset -from .utils import _check_exists_and_download +from paddle.dataset.common import _check_exists_and_download __all__ = ["MNIST"] @@ -58,7 +57,7 @@ class MNIST(Dataset): .. code-block:: python - from paddle.incubate.hapi.datasets import MNIST + from paddle.vision.datasets import MNIST mnist = MNIST(mode='test') diff --git a/python/paddle/incubate/hapi/datasets/voc2012.py b/python/paddle/vision/datasets/voc2012.py similarity index 82% rename from python/paddle/incubate/hapi/datasets/voc2012.py rename to python/paddle/vision/datasets/voc2012.py index 1811c455db530710a0559c077975ab08d6a94ac3..ae14ea3016363c828d17ba34aca8e1a6663ecf76 100644 --- a/python/paddle/incubate/hapi/datasets/voc2012.py +++ b/python/paddle/vision/datasets/voc2012.py @@ -20,7 +20,7 @@ import numpy as np from PIL import Image from paddle.io import Dataset -from .utils import _check_exists_and_download +from paddle.dataset.common import _check_exists_and_download __all__ = ["VOC2012"] @@ -52,28 +52,28 @@ class VOC2012(Dataset): .. 
code-block:: python - import paddle - from paddle.incubate.hapi.datasets import VOC2012 + import paddle + from paddle.vision.datasets import VOC2012 - class SimpleNet(paddle.nn.Layer): - def __init__(self): - super(SimpleNet, self).__init__() + class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() - def forward(self, image, label): - return paddle.sum(image), label + def forward(self, image, label): + return paddle.sum(image), label - paddle.disable_static() + paddle.disable_static() - voc2012 = VOC2012(mode='train') + voc2012 = VOC2012(mode='train') - for i in range(10): - image, label= voc2012[i] - image = paddle.cast(paddle.to_tensor(image), 'float32') - label = paddle.to_tensor(label) + for i in range(10): + image, label= voc2012[i] + image = paddle.cast(paddle.to_tensor(image), 'float32') + label = paddle.to_tensor(label) - model = SimpleNet() - image, label= model(image, label) - print(image.numpy().shape, label.numpy().shape) + model = SimpleNet() + image, label= model(image, label) + print(image.numpy().shape, label.numpy().shape) """ diff --git a/python/paddle/incubate/hapi/vision/models/__init__.py b/python/paddle/vision/models/__init__.py similarity index 100% rename from python/paddle/incubate/hapi/vision/models/__init__.py rename to python/paddle/vision/models/__init__.py diff --git a/python/paddle/incubate/hapi/vision/models/lenet.py b/python/paddle/vision/models/lenet.py similarity index 96% rename from python/paddle/incubate/hapi/vision/models/lenet.py rename to python/paddle/vision/models/lenet.py index 169f70562f6edfe1773a1c8d75004c25831cedcb..c2d4be7cda10d580af44154e6a03e0871ec20706 100644 --- a/python/paddle/incubate/hapi/vision/models/lenet.py +++ b/python/paddle/vision/models/lenet.py @@ -30,7 +30,7 @@ class LeNet(fluid.dygraph.Layer): Examples: .. code-block:: python - from paddle.incubate.hapi.vision.models import LeNet + from paddle.vision.models import LeNet model = LeNet() """ diff --git a/python/paddle/incubate/hapi/vision/models/mobilenetv1.py b/python/paddle/vision/models/mobilenetv1.py similarity index 98% rename from python/paddle/incubate/hapi/vision/models/mobilenetv1.py rename to python/paddle/vision/models/mobilenetv1.py index 5022a065a597553bc870b5da6cd5107e24b4ef0a..10defbf593dca642386e73b65094612f93dce9dc 100644 --- a/python/paddle/incubate/hapi/vision/models/mobilenetv1.py +++ b/python/paddle/vision/models/mobilenetv1.py @@ -17,7 +17,7 @@ from paddle.fluid.initializer import MSRA from paddle.fluid.param_attr import ParamAttr from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear -from ...download import get_weights_path_from_url +from paddle.utils.download import get_weights_path_from_url __all__ = ['MobileNetV1', 'mobilenet_v1'] @@ -116,7 +116,7 @@ class MobileNetV1(fluid.dygraph.Layer): Examples: .. code-block:: python - from paddle.incubate.hapi.vision.models import MobileNetV1 + from paddle.vision.models import MobileNetV1 model = MobileNetV1() """ @@ -291,7 +291,7 @@ def mobilenet_v1(pretrained=False, scale=1.0, **kwargs): Examples: .. 
code-block:: python - from paddle.incubate.hapi.vision.models import mobilenet_v1 + from paddle.vision.models import mobilenet_v1 # build model model = mobilenet_v1() diff --git a/python/paddle/incubate/hapi/vision/models/mobilenetv2.py b/python/paddle/vision/models/mobilenetv2.py similarity index 97% rename from python/paddle/incubate/hapi/vision/models/mobilenetv2.py rename to python/paddle/vision/models/mobilenetv2.py index d5cbfc7b96114dd9a3c122d646f47ca26afcb743..c08fb88f8bdb234fec99ed139aa7eb6249965c79 100644 --- a/python/paddle/incubate/hapi/vision/models/mobilenetv2.py +++ b/python/paddle/vision/models/mobilenetv2.py @@ -18,7 +18,7 @@ import paddle.fluid as fluid from paddle.fluid.param_attr import ParamAttr from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear -from ...download import get_weights_path_from_url +from paddle.utils.download import get_weights_path_from_url __all__ = ['MobileNetV2', 'mobilenet_v2'] @@ -163,7 +163,7 @@ class MobileNetV2(fluid.dygraph.Layer): Examples: .. code-block:: python - from paddle.incubate.hapi.vision.models import MobileNetV2 + from paddle.vision.models import MobileNetV2 model = MobileNetV2() """ @@ -267,7 +267,7 @@ def mobilenet_v2(pretrained=False, scale=1.0, **kwargs): Examples: .. code-block:: python - from paddle.incubate.hapi.vision.models import mobilenet_v2 + from paddle.vision.models import mobilenet_v2 # build model model = mobilenet_v2() diff --git a/python/paddle/incubate/hapi/vision/models/resnet.py b/python/paddle/vision/models/resnet.py similarity index 95% rename from python/paddle/incubate/hapi/vision/models/resnet.py rename to python/paddle/vision/models/resnet.py index 858934e1c179fa75b5d3510e0e9b6c53bca8e608..da0c3e9eb3f67f0aad67cdef3c5527cb2275e844 100644 --- a/python/paddle/incubate/hapi/vision/models/resnet.py +++ b/python/paddle/vision/models/resnet.py @@ -21,7 +21,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear from paddle.fluid.dygraph.container import Sequential -from ...download import get_weights_path_from_url +from paddle.utils.download import get_weights_path_from_url __all__ = [ 'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152' @@ -180,8 +180,8 @@ class ResNet(fluid.dygraph.Layer): Examples: .. code-block:: python - from paddle.incubate.hapi.vision.models import ResNet - from paddle.incubate.hapi.vision.models.resnet import BottleneckBlock, BasicBlock + from paddle.vision.models import ResNet + from paddle.vision.models.resnet import BottleneckBlock, BasicBlock resnet50 = ResNet(BottleneckBlock, 50) @@ -292,7 +292,7 @@ def resnet18(pretrained=False, **kwargs): Examples: .. code-block:: python - from paddle.incubate.hapi.vision.models import resnet18 + from paddle.vision.models import resnet18 # build model model = resnet18() @@ -312,7 +312,7 @@ def resnet34(pretrained=False, **kwargs): Examples: .. code-block:: python - from paddle.incubate.hapi.vision.models import resnet34 + from paddle.vision.models import resnet34 # build model model = resnet34() @@ -332,7 +332,7 @@ def resnet50(pretrained=False, **kwargs): Examples: .. code-block:: python - from paddle.incubate.hapi.vision.models import resnet50 + from paddle.vision.models import resnet50 # build model model = resnet50() @@ -352,7 +352,7 @@ def resnet101(pretrained=False, **kwargs): Examples: .. 
code-block:: python - from paddle.incubate.hapi.vision.models import resnet101 + from paddle.vision.models import resnet101 # build model model = resnet101() @@ -372,7 +372,7 @@ def resnet152(pretrained=False, **kwargs): Examples: .. code-block:: python - from paddle.incubate.hapi.vision.models import resnet152 + from paddle.vision.models import resnet152 # build model model = resnet152() diff --git a/python/paddle/incubate/hapi/vision/models/vgg.py b/python/paddle/vision/models/vgg.py similarity index 94% rename from python/paddle/incubate/hapi/vision/models/vgg.py rename to python/paddle/vision/models/vgg.py index 4352a768eb7206ca30acead580a64a7d04b7701b..8bfacda2476d0e24e549513b379181bf47e40d45 100644 --- a/python/paddle/incubate/hapi/vision/models/vgg.py +++ b/python/paddle/vision/models/vgg.py @@ -16,7 +16,7 @@ import paddle.fluid as fluid from paddle.nn import Conv2d, Pool2D, BatchNorm, Linear, ReLU, Softmax from paddle.fluid.dygraph.container import Sequential -from ...download import get_weights_path_from_url +from paddle.utils.download import get_weights_path_from_url __all__ = [ 'VGG', @@ -65,8 +65,8 @@ class VGG(fluid.dygraph.Layer): Examples: .. code-block:: python - from paddle.incubate.hapi.vision.models import VGG - from paddle.incubate.hapi.vision.models.vgg import make_layers + from paddle.vision.models import VGG + from paddle.vision.models.vgg import make_layers vgg11_cfg = [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'] @@ -160,7 +160,7 @@ def vgg11(pretrained=False, batch_norm=False, **kwargs): Examples: .. code-block:: python - from paddle.incubate.hapi.vision.models import vgg11 + from paddle.vision.models import vgg11 # build model model = vgg11() @@ -184,7 +184,7 @@ def vgg13(pretrained=False, batch_norm=False, **kwargs): Examples: .. code-block:: python - from paddle.incubate.hapi.vision.models import vgg13 + from paddle.vision.models import vgg13 # build model model = vgg13() @@ -208,7 +208,7 @@ def vgg16(pretrained=False, batch_norm=False, **kwargs): Examples: .. code-block:: python - from paddle.incubate.hapi.vision.models import vgg16 + from paddle.vision.models import vgg16 # build model model = vgg16() @@ -232,7 +232,7 @@ def vgg19(pretrained=False, batch_norm=False, **kwargs): Examples: .. code-block:: python - from paddle.incubate.hapi.vision.models import vgg19 + from paddle.vision.models import vgg19 # build model model = vgg19() diff --git a/python/paddle/incubate/hapi/vision/transforms/__init__.py b/python/paddle/vision/transforms/__init__.py similarity index 100% rename from python/paddle/incubate/hapi/vision/transforms/__init__.py rename to python/paddle/vision/transforms/__init__.py diff --git a/python/paddle/incubate/hapi/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py similarity index 96% rename from python/paddle/incubate/hapi/vision/transforms/functional.py rename to python/paddle/vision/transforms/functional.py index b118ee3fc7553dc7d02028ae273be33166829635..b5668fa8c7d6812664512a58faf836b5d9f09300 100644 --- a/python/paddle/incubate/hapi/vision/transforms/functional.py +++ b/python/paddle/vision/transforms/functional.py @@ -64,7 +64,7 @@ def flip(image, code): .. code-block:: python import numpy as np - from paddle.incubate.hapi.vision.transforms import functional as F + from paddle.vision.transforms import functional as F fake_img = np.random.rand(224, 224, 3) @@ -94,7 +94,7 @@ def resize(img, size, interpolation=cv2.INTER_LINEAR): .. 
code-block:: python import numpy as np - from paddle.incubate.hapi.vision.transforms import functional as F + from paddle.vision.transforms import functional as F fake_img = np.random.rand(256, 256, 3) @@ -155,7 +155,7 @@ def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'): import numpy as np - from paddle.incubate.hapi.vision.transforms.functional import pad + from paddle.vision.transforms.functional import pad fake_img = np.random.rand(500, 500, 3).astype('float32') @@ -243,7 +243,7 @@ def rotate(img, import numpy as np - from paddle.incubate.hapi.vision.transforms.functional import rotate + from paddle.vision.transforms.functional import rotate fake_img = np.random.rand(500, 500, 3).astype('float32') @@ -305,7 +305,7 @@ def to_grayscale(img, num_output_channels=1): import numpy as np - from paddle.incubate.hapi.vision.transforms.functional import to_grayscale + from paddle.vision.transforms.functional import to_grayscale fake_img = np.random.rand(500, 500, 3).astype('float32') diff --git a/python/paddle/incubate/hapi/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py similarity index 94% rename from python/paddle/incubate/hapi/vision/transforms/transforms.py rename to python/paddle/vision/transforms/transforms.py index d46faa0685aa13790be217e0c99ab407790dd2ca..14809e0c1acaa1b6d5a494e6e3df1801e1c8f61b 100644 --- a/python/paddle/incubate/hapi/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -76,8 +76,8 @@ class Compose(object): .. code-block:: python - from paddle.incubate.hapi.datasets import Flowers - from paddle.incubate.hapi.vision.transforms import Compose, ColorJitter, Resize + from paddle.vision.datasets import Flowers + from paddle.vision.transforms import Compose, ColorJitter, Resize transform = Compose([ColorJitter(), Resize(size=608)]) flowers = Flowers(mode='test', transform=transform) @@ -130,9 +130,9 @@ class BatchCompose(object): import numpy as np from paddle.io import DataLoader - from paddle.incubate.hapi import set_device - from paddle.incubate.hapi.datasets import Flowers - from paddle.incubate.hapi.vision.transforms import Compose, BatchCompose, Resize + from paddle import set_device + from paddle.vision.datasets import Flowers + from paddle.vision.transforms import Compose, BatchCompose, Resize class NormalizeBatch(object): def __init__(self, @@ -222,7 +222,7 @@ class Resize(object): import numpy as np - from paddle.incubate.hapi.vision.transforms import Resize + from paddle.vision.transforms import Resize transform = Resize(size=224) @@ -259,7 +259,7 @@ class RandomResizedCrop(object): import numpy as np - from paddle.incubate.hapi.vision.transforms import RandomResizedCrop + from paddle.vision.transforms import RandomResizedCrop transform = RandomResizedCrop(224) @@ -336,7 +336,7 @@ class CenterCropResize(object): import numpy as np - from paddle.incubate.hapi.vision.transforms import CenterCropResize + from paddle.vision.transforms import CenterCropResize transform = CenterCropResize(224) @@ -380,7 +380,7 @@ class CenterCrop(object): import numpy as np - from paddle.incubate.hapi.vision.transforms import CenterCrop + from paddle.vision.transforms import CenterCrop transform = CenterCrop(224) @@ -422,7 +422,7 @@ class RandomHorizontalFlip(object): import numpy as np - from paddle.incubate.hapi.vision.transforms import RandomHorizontalFlip + from paddle.vision.transforms import RandomHorizontalFlip transform = RandomHorizontalFlip(224) @@ -453,7 +453,7 @@ class RandomVerticalFlip(object): 
            import numpy as np
-           from paddle.incubate.hapi.vision.transforms import RandomVerticalFlip
+           from paddle.vision.transforms import RandomVerticalFlip

            transform = RandomVerticalFlip(224)
@@ -488,7 +488,7 @@ class Normalize(object):

            import numpy as np
-           from paddle.incubate.hapi.vision.transforms import Normalize
+           from paddle.vision.transforms import Normalize

            normalize = Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
@@ -530,7 +530,7 @@ class Permute(object):

            import numpy as np
-           from paddle.incubate.hapi.vision.transforms import Permute
+           from paddle.vision.transforms import Permute

            transform = Permute()
@@ -569,7 +569,7 @@ class GaussianNoise(object):

            import numpy as np
-           from paddle.incubate.hapi.vision.transforms import GaussianNoise
+           from paddle.vision.transforms import GaussianNoise

            transform = GaussianNoise()
@@ -603,7 +603,7 @@ class BrightnessTransform(object):

            import numpy as np
-           from paddle.incubate.hapi.vision.transforms import BrightnessTransform
+           from paddle.vision.transforms import BrightnessTransform

            transform = BrightnessTransform(0.4)
@@ -642,7 +642,7 @@ class ContrastTransform(object):

            import numpy as np
-           from paddle.incubate.hapi.vision.transforms import ContrastTransform
+           from paddle.vision.transforms import ContrastTransform

            transform = ContrastTransform(0.4)
@@ -682,7 +682,7 @@ class SaturationTransform(object):

            import numpy as np
-           from paddle.incubate.hapi.vision.transforms import SaturationTransform
+           from paddle.vision.transforms import SaturationTransform

            transform = SaturationTransform(0.4)
@@ -723,7 +723,7 @@ class HueTransform(object):

            import numpy as np
-           from paddle.incubate.hapi.vision.transforms import HueTransform
+           from paddle.vision.transforms import HueTransform

            transform = HueTransform(0.4)
@@ -775,7 +775,7 @@ class ColorJitter(object):

            import numpy as np
-           from paddle.incubate.hapi.vision.transforms import ColorJitter
+           from paddle.vision.transforms import ColorJitter

            transform = ColorJitter(0.4)
@@ -822,7 +822,7 @@ class RandomCrop(object):

            import numpy as np
-           from paddle.incubate.hapi.vision.transforms import RandomCrop
+           from paddle.vision.transforms import RandomCrop

            transform = RandomCrop(224)
@@ -909,7 +909,7 @@ class RandomErasing(object):

            import numpy as np
-           from paddle.incubate.hapi.vision.transforms import RandomCrop
+           from paddle.vision.transforms import RandomCrop

            transform = RandomCrop(224)
@@ -995,7 +995,7 @@ class Pad(object):

            import numpy as np
-           from paddle.incubate.hapi.vision.transforms import Pad
+           from paddle.vision.transforms import Pad

            transform = Pad(2)
@@ -1051,7 +1051,7 @@ class RandomRotate(object):

            import numpy as np
-           from paddle.incubate.hapi.vision.transforms import RandomRotate
+           from paddle.vision.transforms import RandomRotate

            transform = RandomRotate(90)
@@ -1119,7 +1119,7 @@ class Grayscale(object):

            import numpy as np
-           from paddle.incubate.hapi.vision.transforms import Grayscale
+           from paddle.vision.transforms import Grayscale

            transform = Grayscale()
diff --git a/python/requirements.txt b/python/requirements.txt
index 5e81ec680897024e7c32d193bef1716e9b25b4a4..c8d3b2af1794bb0858b187d6a4c641322f50cdd1 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -15,12 +15,9 @@ rarfile
 Pillow
 graphviz
 six
-funcsigs
-pyyaml
 decorator
 prettytable
 objgraph
 astor
 pathlib
 netifaces
-psutil
diff --git a/python/setup.py.in b/python/setup.py.in
index 5b206296bd641bf909115d1c580518afe85a37b6..64ac2b9b9a4d210c59193e117c6000986bfb07a0 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -188,12 +188,13 @@ packages=['paddle',
           'paddle.fluid.incubate.fleet.parameter_server.ir',
           'paddle.fluid.incubate.fleet.collective',
           'paddle.fluid.incubate.fleet.utils',
-          'paddle.incubate.hapi',
-          'paddle.incubate.hapi.datasets',
-          'paddle.incubate.hapi.vision',
-          'paddle.incubate.hapi.vision.models',
-          'paddle.incubate.hapi.vision.transforms',
-          'paddle.incubate.hapi.text',
+          'paddle.hapi',
+          'paddle.vision',
+          'paddle.vision.models',
+          'paddle.vision.transforms',
+          'paddle.vision.datasets',
+          'paddle.text',
+          'paddle.text.datasets',
           'paddle.incubate',
           'paddle.io',
           'paddle.optimizer',
diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
index f7ee09e11ea5e3a3b5ba4ce6b3be8af4abe7cae4..b787ae625017d783a7221006ddd6867c21e238e8 100644
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@@ -53,8 +53,8 @@ fi

 op_desc_diff=`python ${PADDLE_ROOT}/tools/check_op_desc.py ${PADDLE_ROOT}/paddle/fluid/OP_DESC_DEV.spec ${PADDLE_ROOT}/paddle/fluid/OP_DESC_PR.spec`
 if [ "$op_desc_diff" != "" ]; then
-    echo_line="You must have one RD (liym27 (Recommend), zhhsplendid, Aurelius84, lanxianghit or phlrain) approval for the changes of Inputs/Output/Attrs of OPs. The changes of OPs will cause that the new version inference fails to load model trained by the old version. Please modify your code. \n For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/OP-Input-Output-Attribute-Compatibility-Modification].\n${op_desc_diff}\n"
-    check_approval 1 33742067 7913861 9301846 47554610 43953930
+    echo_line="You must have one RD (cyj1986, Superjomn) approval for the changes of Inputs/Output/Attrs of OPs. The changes of OPs will cause that the new version inference fails to load model trained by the old version. Please modify your code. \n For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/OP-Input-Output-Attribute-Compatibility-Modification].\n${op_desc_diff}\n"
+    check_approval 1 39645414 328693
 fi

 DEV_OP_USE_DEFAULT_GRAD_MAKER_SPEC=${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_maker_DEV.spec
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index 2c575e4abf1beed039d3293821b8df356d4e9295..1e5179d0282d7f35c4232d9b9783cb831e83f462 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -19,8 +19,8 @@ API_FILES=("CMakeLists.txt"
            "paddle/fluid/framework/ir/node.h"
            "paddle/fluid/framework/ir/graph.h"
            "paddle/fluid/framework/framework.proto"
-           "python/paddle/distributed/__init"
-           "python/paddle/distributed/fleet/__init__.py"
+           "python/paddle/distributed/__init"
+           "python/paddle/distributed/fleet/__init__.py"
            "python/requirements.txt"
            "python/paddle/fluid/__init__.py"
            "python/paddle/fluid/compiler.py"
@@ -39,6 +39,7 @@ API_FILES=("CMakeLists.txt"
            "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py"
            "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py"
            "tools/wlist.json"
+           "paddle/scripts/paddle_build.bat"
            )

approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`
@@ -114,17 +115,20 @@ for API_FILE in ${API_FILES[*]}; do
      echo_line="You must have one RD (luotao1 or phlrain) approval for ${API_FILE}, which manages the white list of batch size 1 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-batch_size=1-in-sequence-OP-test]. \n"
      check_approval 1 6836917 43953930
   elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py" ];then
-     echo_line="You must have one RD (Shixiaowei02 (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py, which manages the white list of no_grad_set without value in operators. For more information, please refer to[https://github.com/PaddlePaddle/Paddle/wiki/It's-recommend-to-set-no_grad_set-to-be-None].\n"
-     check_approval 1 39303645 6836917 43953930
+     echo_line="You must have one RD (Shixiaowei02 (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py, which manages the white list of no_grad_set without value in operators. For more information, please refer to[https://github.com/PaddlePaddle/Paddle/wiki/It's-recommend-to-set-no_grad_set-to-be-None].\n"
+     check_approval 1 39303645 6836917 43953930
   elif [ "${API_FILE}" == "tools/wlist.json" ];then
-     echo_line="You must have one TPM (jzhang533) approval for the api whitelist for the tools/wlist.json.\n"
-     check_approval 1 29231
+     echo_line="You must have one TPM (jzhang533) approval for the api whitelist for the tools/wlist.json.\n"
+     check_approval 1 29231
   elif [ "${API_FILE}" == "python/paddle/distributed/fleet/__init__.py" ]; then
-     echo_line="You must have (guru4elephant,raindrops2sea) approval for ${API_FILE} changes "
-     check_approval 1 35550832 38231817
+     echo_line="You must have (guru4elephant,raindrops2sea) approval for ${API_FILE} changes "
+     check_approval 1 35550832 38231817
   elif [ "${API_FILE}" == "python/paddle/distributed/__init__.py" ]; then
-     echo_line="You must have (guru4elephant,raindrops2sea) approval for ${API_FILE} changes "
-     check_approval 1 35550832 38231817
+     echo_line="You must have (guru4elephant,raindrops2sea) approval for ${API_FILE} changes "
+     check_approval 1 35550832 38231817
+  elif [ "${API_FILE}" == "paddle/scripts/paddle_build.bat" ]; then
+     echo_line="You must have one RD (zhouwei25 (Recommend), luotao1) approval for ${API_FILE} changes, which manages all Paddle CI task on Windows.\n"
+     check_approval 1 52485244 6836917
   else
      echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1) approval for ${API_FILE}, which manages the underlying code for fluid.\n"
      check_approval 1 3048612 46782768 12538138 6836917
@@ -159,7 +163,7 @@ fi

HAS_UNITTEST_SKIP=`git diff -U0 upstream/$BRANCH | grep "^+[[:space:]]\{0,\}@unittest.skip" || true`
if [ "${HAS_UNITTEST_SKIP}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), liuwei1031, or luotao1) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n"
+    echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), or luotao1) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n"
     check_approval 1 22165420 6836917 46661762
fi

diff --git a/tools/count_all_enforce.sh b/tools/enforce/count_all_enforce.sh
similarity index 100%
rename from tools/count_all_enforce.sh
rename to tools/enforce/count_all_enforce.sh
diff --git a/tools/count_enforce_by_dir.sh b/tools/enforce/count_enforce_by_dir.sh
similarity index 100%
rename from tools/count_enforce_by_dir.sh
rename to tools/enforce/count_enforce_by_dir.sh
diff --git a/tools/count_enforce_by_file.sh b/tools/enforce/count_enforce_by_file.sh
similarity index 100%
rename from tools/count_enforce_by_file.sh
rename to tools/enforce/count_enforce_by_file.sh
diff --git a/tools/grep_invalid_enforce.sh b/tools/enforce/grep_invalid_enforce.sh
similarity index 100%
rename from tools/grep_invalid_enforce.sh
rename to tools/enforce/grep_invalid_enforce.sh
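
Taken together, the renames in this diff move the high-level vision API out of paddle.incubate.hapi and into paddle.vision (with paddle.hapi and paddle.text split out in setup.py.in). The minimal sketch below, adapted from the updated Compose docstring shown above, illustrates what user code looks like against the new import paths; the indexing of the dataset at the end is an illustrative assumption about its __getitem__ output rather than something this diff specifies, and the Flowers dataset is downloaded on first use.

    import numpy as np

    # New import locations introduced by this rename; previously these lived
    # under paddle.incubate.hapi.datasets and paddle.incubate.hapi.vision.transforms.
    from paddle.vision.datasets import Flowers
    from paddle.vision.transforms import Compose, ColorJitter, Resize

    # Chain the transforms and hand the pipeline to the dataset,
    # exactly as in the updated Compose docstring.
    transform = Compose([ColorJitter(), Resize(size=608)])
    flowers = Flowers(mode='test', transform=transform)

    # Assumed usage: fetch one transformed sample and inspect the image shape.
    # The exact sample layout depends on the installed Paddle version.
    sample = flowers[0]
    print(np.array(sample[0]).shape)

Existing code that still imports from paddle.incubate.hapi.* would need to switch to the paddle.vision / paddle.text paths packaged by the setup.py.in change above.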