diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 7a94bda0f5f73e48081f68d7b2730e3df1e46232..c78fe5f6c7fbd44e0820747f200e3e8168dc3783 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -107,6 +107,9 @@ function(select_nvcc_arch_flags out_variable)
   elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
     set(cuda_arch_bin "50")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
+      add_definitions("-DSUPPORTS_CUDA_FP16")
+    endif()
     set(cuda_arch_bin "60 61")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
     if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index 96d54ec86917432837d61f681ece91da2ddcab10..aec27bd9d91e5afb6bf11037e60ff213162ad97f 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -527,6 +527,8 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
         VLOG(0) << "error: the number of ids is a negative number: " << num;
         VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
+        VLOG(0) << "Error occured when parsing " << i
+                << " th slot with total slots number: " << all_slots_.size();
         return false;
       } else if (num == 0) {
         VLOG(0)
@@ -536,42 +538,66 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
                "characters.";
         VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
+        VLOG(0) << "Error occured when parsing " << i
+                << " th slot with total slots number: " << all_slots_.size();
         return false;
       } else if (errno == ERANGE || num > INT_MAX) {
         VLOG(0) << "error: the number of ids greater than INT_MAX";
         VLOG(0) << "please check line<" << instance_cout << "> in file<"
                 << filename << ">";
+        VLOG(0) << "Error occured when parsing " << i
+                << " th slot with total slots number: " << all_slots_.size();
         return false;
       }
       if (all_slots_type_[i] == "float") {
-        for (int i = 0; i < num; ++i) {
+        for (int j = 0; j < num; ++j) {
           strtof(endptr, &endptr);
           if (errno == ERANGE) {
             VLOG(0) << "error: the value is out of the range of "
                        "representable values for float";
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
+            VLOG(0) << "Error occured when parsing " << i
+                    << " th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << " th id with total id number: " << num;
             return false;
           }
-          if (i + 1 != num && endptr - str == len) {
+          if (j + 1 != num && endptr - str == len) {
             VLOG(0) << "error: there is a wrong with the number of ids.";
+            VLOG(0) << "Error occured when parsing " << i
+                    << " th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << " th id with total id number: " << num;
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
           }
         }
       } else if (all_slots_type_[i] == "uint64") {
-        for (int i = 0; i < num; ++i) {
+        for (int j = 0; j < num; ++j) {
           strtoull(endptr, &endptr, 10);
           if (errno == ERANGE) {
             VLOG(0) << "error: the value is out of the range of "
                        "representable values for uint64_t";
+            VLOG(0) << "Error occured when parsing " << i
+                    << " th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << " th id with total id number: " << num;
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
           }
-          if (i + 1 != num && endptr - str == len) {
+          if (j + 1 != num && endptr - str == len) {
             VLOG(0) << "error: there is a wrong with the number of ids.";
+            VLOG(0) << "Error occured when parsing " << i
+                    << " th slot with total slots number: "
+                    << all_slots_.size();
+            VLOG(0) << "and in this slot: " << j
+                    << " th id with total id number: " << num;
             VLOG(0) << "please check line<" << instance_cout << "> in file<"
                     << filename << ">";
             return false;
@@ -632,8 +658,13 @@ bool MultiSlotDataFeed::ParseOneInstanceFromPipe(
               "The number of ids can not be zero, you need padding "
               "it in data generator; or if there is something wrong with "
               "the data, please check if the data contains unresolvable "
-              "characters.\nplease check this error line: %s",
-              str));
+              "characters.\nplease check this error line: %s, \n Specifically, "
+              "something wrong happened(the length of this slot's feasign is 0)"
+              "when we parse the %d th slots."
+              "Maybe something wrong around this slot",
+              "\nWe detect the feasign number of this slot is %d, "
+              "which is illegal.",
+              str, i, num));
       if (idx != -1) {
         (*instance)[idx].Init(all_slots_type_[i]);
         if ((*instance)[idx].GetType()[0] == 'f') {  // float
@@ -683,8 +714,13 @@ bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
               "The number of ids can not be zero, you need padding "
               "it in data generator; or if there is something wrong with "
               "the data, please check if the data contains unresolvable "
-              "characters.\nplease check this error line: %s.",
-              str));
+              "characters.\nplease check this error line: %s, \n Specifically, "
+              "something wrong happened(the length of this slot's feasign is 0)"
+              "when we parse the %d th slots."
+              "Maybe something wrong around this slot",
+              "\nWe detect the feasign number of this slot is %d, "
+              "which is illegal.",
+              str, i, num));
 
       if (idx != -1) {
         (*instance)[idx].Init(all_slots_type_[i]);
@@ -916,8 +952,13 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
               "The number of ids can not be zero, you need padding "
               "it in data generator; or if there is something wrong with "
               "the data, please check if the data contains unresolvable "
-              "characters.\nplease check this error line: %s.",
-              str));
+              "characters.\nplease check this error line: %s, \n Specifically, "
+              "something wrong happened(the length of this slot's feasign is 0)"
+              "when we parse the %d th slots."
+              "Maybe something wrong around this slot",
+              "\nWe detect the feasign number of this slot is %d, "
+              "which is illegal.",
+              str, i, num));
       if (idx != -1) {
         if (all_slots_type_[i][0] == 'f') {  // float
           for (int j = 0; j < num; ++j) {
@@ -982,8 +1023,13 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstance(Record* instance) {
               "The number of ids can not be zero, you need padding "
               "it in data generator; or if there is something wrong with "
               "the data, please check if the data contains unresolvable "
-              "characters.\nplease check this error line: %s.",
-              str));
+              "characters.\nplease check this error line: %s, \n Specifically, "
+              "something wrong happened(the length of this slot's feasign is 0)"
+              "when we parse the %d th slots."
+              "Maybe something wrong around this slot",
+              "\nWe detect the feasign number of this slot is %d, "
+              "which is illegal.",
+              str, i, num));
 
       if (idx != -1) {
         if (all_slots_type_[i][0] == 'f') {  // float
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index a3cc4d1721e20a72817606bd773129230a8154ce..8281ec2143890aa2bb886347ccc0eff8145c67f3 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -74,6 +74,7 @@ set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto
     eager_deletion_pass
     buffer_shared_inplace_op_pass
     buffer_shared_cross_op_memory_reuse_pass
+    inplace_addto_op_pass
     set_reader_device_info_utils
     add_reader_dependency_pass)
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index 7fc08f3e0f20f243425b351b43c124d4519753f6..939a2fc8fc9c73472ff5c25633610fa70c7cec6d 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+
 #include <algorithm>
+
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
@@ -34,14 +36,24 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<platform::Place> &places,
                                      const platform::NCCLCommunicator *ctxs)
     : NCCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) {
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        places_.size(), local_scopes_.size()));
 }
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<Scope *> &local_scopes,
                                      const std::vector<platform::Place> &places)
     : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        places_.size(), local_scopes_.size()));
 }
 #endif
 
@@ -60,13 +72,25 @@ void AllReduceOpHandle::AllReduceImpl(
     const std::vector<VarHandle *> &in_var_handles,
     const std::vector<VarHandle *> &out_var_handles) {
   size_t num_places = places_.size();
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), num_places,
-      "The NoDummyInputSize should be equal to the number of places.");
+  PADDLE_ENFORCE_EQ(in_var_handles.size(), num_places,
+                    platform::errors::InvalidArgument(
+                        "The NoDummyInputSize should be equal "
+                        "to the number of places, but got NoDummyInputSize is "
+                        "%d and the number of place is %d.",
+                        in_var_handles.size(), num_places));
   PADDLE_ENFORCE_EQ(
       in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
-  PADDLE_ENFORCE_EQ(local_exec_scopes_.size(), num_places);
+      platform::errors::InvalidArgument(
+          "The NoDummyInputSize and NoDummyOutputSize should be "
+          "equal, but got NoDummyInputSize is %d and NoDummyOutputSize is %d.",
+          in_var_handles.size(), out_var_handles.size()));
+  PADDLE_ENFORCE_EQ(
+      local_exec_scopes_.size(), num_places,
+      platform::errors::InvalidArgument(
+          "The number of local scopes should be equal "
+          "to the number of places, but got the number of local scopes is "
+          "%d and the number of place is %d.",
+          in_var_handles.size(), num_places));
 
   std::vector<const void *> lod_tensor_data;
   std::vector<platform::Place> places;
@@ -78,23 +102,36 @@ void AllReduceOpHandle::AllReduceImpl(
   for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
     auto &local_scope = local_exec_scopes_[i];
     auto var = local_scope->FindVar(in_var_handles[i]->name());
-    PADDLE_ENFORCE_NOT_NULL(var, "%s is not found int scope.",
-                            in_var_handles[i]->name());
+    PADDLE_ENFORCE_NOT_NULL(var, platform::errors::NotFound(
+                                     "Variable %s is not found in local scope.",
+                                     in_var_handles[i]->name()));
     auto &lod_tensor = var->Get<LoDTensor>();
 
     if (i == 0) {
       numel = static_cast<int64_t>(lod_tensor.numel());
       // only enforce place0, we will enforce other palce numel == place0 numel
       PADDLE_ENFORCE_GT(
-          numel, 0, platform::errors::InvalidArgument(
-                        "The numel of tensos=[%s] must > 0. But now numel=[%d]",
-                        in_var_handles[i]->name(), numel));
+          numel, 0,
+          platform::errors::PreconditionNotMet(
+              "The numel of tensor %s should be > 0, but got numel is %d.",
+              in_var_handles[i]->name(), numel));
       dtype = lod_tensor.type();
       is_gpu_place = platform::is_gpu_place(lod_tensor.place());
     }
-    PADDLE_ENFORCE_EQ(numel, static_cast<int64_t>(lod_tensor.numel()));
-    PADDLE_ENFORCE_EQ(dtype, lod_tensor.type());
-    PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()));
+    PADDLE_ENFORCE_EQ(
+        numel, static_cast<int64_t>(lod_tensor.numel()),
+        platform::errors::PreconditionNotMet(
+            "The size of tensors of the same variable in different local "
+            "scopes should be equal."));
+    PADDLE_ENFORCE_EQ(
+        dtype, lod_tensor.type(),
+        platform::errors::PreconditionNotMet(
+            "The dtype of tensors of the same variable in different local "
+            "scopes should be equal."));
+    PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()),
+                      platform::errors::PreconditionNotMet(
+                          "The place type of tensors of the same variable "
+                          "in different local scopes should be equal."));
 
     lod_tensor_data.emplace_back(lod_tensor.data<void>());
     places.emplace_back(lod_tensor.place());
@@ -102,8 +139,12 @@ void AllReduceOpHandle::AllReduceImpl(
     VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
              << ", out_name:" << out_var_handles[i]->name();
 
-    PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
-                      "The name of input and output should be equal.");
+    PADDLE_ENFORCE_EQ(
+        in_var_handles[i]->name(), out_var_handles[i]->name(),
+        platform::errors::InvalidArgument(
+            "The name of input and output of all_reduce op should be equal, "
+            "but got input is %s and output is %s.",
+            in_var_handles[i]->name(), out_var_handles[i]->name()));
   }
 
   std::vector<std::string> grad_var_names;
@@ -122,7 +163,9 @@ void AllReduceOpHandle::AllReduceFunc(
     const std::vector<std::string> &out_var_names) {
   if (is_gpu_place(places[0])) {
 #if defined(PADDLE_WITH_NCCL)
-    PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
+    PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_,
+                            platform::errors::InvalidArgument(
+                                "The nccl context should not be NULL."));
     ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype);
     std::vector<std::function<void()>> all_reduce_calls;
     for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
@@ -134,7 +177,8 @@ void AllReduceOpHandle::AllReduceFunc(
     }
     NCCLAllReduceFunc(all_reduce_calls);
 #else
-    PADDLE_THROW("Not compiled with CUDA.");
+    PADDLE_THROW(
+        platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
   } else {  // Special handle CPU only Operator's gradient. Like CRF
     auto &trg = *local_exec_scopes_[0]
diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
index d42bd0b16d7a84987517326af9567809fd29da4d..12c0d6749029c657a829e8d2b04a2113fbe8946a 100644
--- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
@@ -89,8 +89,19 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor(
       places_(std::move(places)),
       graphs_(std::move(graphs)) {
   VLOG(3) << "build AsyncSSAGraphExecutor";
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
-  PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size());
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        places_.size(), local_scopes_.size()));
+  PADDLE_ENFORCE_EQ(
+      local_scopes_.size(), local_exec_scopes_.size(),
+      platform::errors::InvalidArgument(
+          "The number of local scopes and the number of local execution scopes "
+          "should be equal, but got number of local scopes is %d and "
+          "number of local execution scopes is %d.",
+          local_scopes_.size(), local_exec_scopes_.size()));
 
   // set the correct size of thread pool to each device.
   strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 5388df6bc504203abb57237f2d23a324367ce087..01d496d4ea7f7f0d0347b552e13d988fdc68e0c7 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -19,6 +19,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "boost/optional.hpp"
 #include "paddle/fluid/framework/ir/pass_builder.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -119,6 +120,9 @@ struct BuildStrategy {
   // Turn on inplace by default.
   bool enable_inplace_{true};
 
+  // Turn off inplace addto by default.
+  bool enable_addto_{false};
+
   // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
   // num_trainers is 1, so the current fields of build_strategy doesn't tell if
   // it's distributed model.
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index e440dff2af6b5649d34f47c3b696edeb8a1ba0a2..7f1d3c9b340c9ee92c45c038bf42cf409d535158 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -12,12 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
+
 #include <deque>
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/fetch_async_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
@@ -48,7 +50,9 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
       bootstrap_ops_.emplace_back(op);
     }
   }
-  PADDLE_ENFORCE_GT(op_deps_.size(), 0, "The graph doesn't have operators.");
+  PADDLE_ENFORCE_GT(op_deps_.size(), 0,
+                    platform::errors::PreconditionNotMet(
+                        "The graph doesn't have operators."));
   PrepareAtomicOpDeps();
 }
 
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index ae69960ef78c3e35143c66226133bd0dceac8b79..aedb8db46a5d9c90f176588d1dfd206e0abaf616 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
+
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
@@ -138,8 +140,10 @@ void FetchOpHandle::RunImpl() {
     auto *var_handle = static_cast<VarHandle *>(inputs_[i]);
     auto &scope = scopes.at(var_handle->scope_idx());
     auto *var = scope->FindVar(var_handle->name());
-    PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
-                            var_handle->name());
+    PADDLE_ENFORCE_NOT_NULL(
+        var,
+        platform::errors::NotFound(
+            "Cannot find variable %s in execution scope.", var_handle->name()));
 
     if (var->IsType<LoDTensor>()) {
       auto &t = var->Get<framework::LoDTensor>();
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 35fe5d631fbaad61ce64ccf70d58d176aa3d3a20..459bcff5c0b740be0d495a6ad648da7424bd1a42 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/framework/details/op_handle_base.h"
+
 #include <map>
 #include <unordered_set>
 
@@ -88,6 +89,12 @@ void OpHandleBase::Run(bool use_cuda) {
   PADDLE_ENFORCE(!use_cuda);
 #endif
 
+  // skip running current op, used with inplace_addto_op_pass
+  if (skip_running_) {
+    VLOG(4) << "skip running: " << Name();
+    return;
+  }
+
   RunImpl();
 }
 
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index c5aa1295aad695175e53b17d729006ffc67ce3ab..097f54d5d5891390fdd479d3e6f62ae0e97cd0d4 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -18,6 +18,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/details/var_handle.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -52,6 +53,10 @@ class OpHandleBase {
 
   virtual Priority GetPriority() const { return kNormal; }
 
+  virtual bool GetSkipRunning() const { return skip_running_; }
+
+  virtual void SetSkipRunning(bool skip_runing) { skip_running_ = skip_runing; }
+
   virtual std::string Name() const = 0;
 
   void Run(bool use_cuda);
@@ -131,6 +136,7 @@ class OpHandleBase {
   std::map<platform::Place, platform::DeviceContext *> dev_ctxes_;
 
   std::vector<Scope *> local_exec_scopes_;
+  bool skip_running_ = false;
 
 #ifdef PADDLE_WITH_CUDA
   std::unordered_map<int, cudaEvent_t> events_;
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index e7d466c4af0711219c5a10a4c739ae3eb998e27d..35834fe5d7480819311a15ec54ab9412fc0a7cee 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
+
 #include <algorithm>
 #include <memory>
 #include <utility>
+
 #include "paddle/fluid/framework/ir/graph_helper.h"
 
 namespace paddle {
@@ -104,7 +106,12 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
       places_(places),
       graphs_(std::move(graphs)),
       feed_status_(places.size(), FeedStatus::kNone) {
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        places_.size(), local_scopes_.size()));
 
   PADDLE_ENFORCE_EQ(places_.size(), graphs_.size(),
                     platform::errors::InvalidArgument(
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index fe86d002ca8b33695839be3c5d2ff5fd20672952..7cc1f54131416ed454846c75c8c8a6849ec20e6c 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -13,10 +13,12 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
+
 #include <stdexcept>
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/variable_helper.h"
@@ -37,7 +39,13 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
       var_infos_(std::move(var_infos)),
       places_(std::move(places)),
       scope_monitor_(places_, local_exec_scopes_) {
-  PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size());
+  PADDLE_ENFORCE_EQ(
+      local_scopes_.size(), local_exec_scopes_.size(),
+      platform::errors::InvalidArgument(
+          "The number of local scopes and the number of local execution scopes "
+          "should be equal, but got number of local scopes is %d and "
+          "number of local execution scopes is %d.",
+          local_scopes_.size(), local_exec_scopes_.size()));
   PrepareLocalExeScopes();
 }
 
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
index 6fdec553f3d65debdf8f6d95eeeb8ebe30b4a36a..5fbaf3cbfe028638ad9219d9e1286480ae16ee6b 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
+++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
+
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -29,7 +31,8 @@ static inline const Tensor &GetTensorFromVar(const Variable *var) {
   if (var->IsType<LoDTensor>()) {
     return var->Get<LoDTensor>();
   } else {
-    PADDLE_THROW("Variable must be type of LoDTensor");
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Variable must be type of LoDTensor."));
   }
 }
 
@@ -37,20 +40,27 @@ static inline Tensor *GetMutableTensorFromVar(Variable *var) {
   if (var->IsType<LoDTensor>()) {
     return var->GetMutable<LoDTensor>();
   } else {
-    PADDLE_THROW("Variable must be type of LoDTensor");
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Variable must be type of LoDTensor."));
   }
 }
 
 ShareTensorBufferFunctor::ShareTensorBufferFunctor(
     Scope *scope, size_t scope_idx, const std::string &op_type,
     const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
-    const std::vector<std::string> &out_var_names)
+    const std::vector<std::string> &out_var_names, bool share_dims)
     : scope_(scope),
       scope_idx_(scope_idx),
       op_type_(op_type),
       in_var_infos_(in_var_infos),
-      out_var_names_(out_var_names) {
-  PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size());
+      out_var_names_(out_var_names),
+      share_dims_(share_dims) {
+  PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size(),
+                    platform::errors::PreconditionNotMet(
+                        "The number of input variables and output variables "
+                        "should be equal, but got number of input variables is "
+                        "%d and number of output variables is %d.",
+                        in_var_infos_.size(), out_var_names_.size()));
   for (size_t i = 0; i < in_var_infos_.size(); ++i) {
     AddReuseVarPair(in_var_infos_[i], out_var_names_[i]);
   }
@@ -67,32 +77,59 @@ ShareTensorBufferFunctor::ReusedVars() const {
 
 void ShareTensorBufferFunctor::AddReuseVarPair(
     const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name) {
-  PADDLE_ENFORCE_NOT_NULL(in_var_info, "in_var_info cannot be nullptr");
+  PADDLE_ENFORCE_NOT_NULL(
+      in_var_info,
+      platform::errors::InvalidArgument(
+          "The input variables to be inplaced should not be NULL."));
   PADDLE_ENFORCE_NE(in_var_info->Name(), out_var_name,
-                    "in/out cannot have same name: %s", out_var_name);
+                    platform::errors::InvalidArgument(
+                        "The input variable and output variable to be inplaced "
+                        "cannot have the same name: %s.",
+                        out_var_name));
   in_var_infos_.emplace_back(in_var_info);
   out_var_names_.emplace_back(out_var_name);
 }
 
 void ShareTensorBufferFunctor::CallOnce() {
-  PADDLE_ENFORCE(in_out_vars_.empty(), "in_out_vars_ must be initialized here");
+  PADDLE_ENFORCE(in_out_vars_.empty(),
+                 platform::errors::InvalidArgument(
+                     "The input-output variable pairs to be "
+                     "inplaced should be initialized here."));
   for (size_t i = 0; i < in_var_infos_.size(); ++i) {
     auto *in_var = exec_scope_->FindVar(in_var_infos_[i]->Name());
     auto *out_var = exec_scope_->FindVar(out_var_names_[i]);
-    PADDLE_ENFORCE_NOT_NULL(in_var);
-    PADDLE_ENFORCE_NOT_NULL(out_var);
-    PADDLE_ENFORCE_NE(in_var, out_var);
+    PADDLE_ENFORCE_NOT_NULL(
+        in_var, platform::errors::NotFound(
+                    "The input variable(%s)to be inplaced should not be NULL.",
+                    in_var_infos_[i]->Name()));
+    PADDLE_ENFORCE_NOT_NULL(
+        out_var,
+        platform::errors::NotFound(
+            "The output variable(%s) to be inplaced should not be NULL.",
+            out_var_names_[i]));
+    PADDLE_ENFORCE_NE(
+        in_var, out_var,
+        platform::errors::PreconditionNotMet(
+            "The input variable and output variable to be inplaced "
+            "cannot be the same variable(%s).",
+            out_var_names_[i]));
     in_out_vars_.emplace_back(in_var, out_var);
   }
 }
 
 void ShareTensorBufferFunctor::operator()(Scope *exec_scope) {
   if (!exec_scope_) {
-    PADDLE_ENFORCE_NOT_NULL(exec_scope);
+    PADDLE_ENFORCE_NOT_NULL(exec_scope,
+                            platform::errors::InvalidArgument(
+                                "The given execution scope should not be NULL "
+                                "if the cached scope is NULL."));
     exec_scope_ = exec_scope;
     CallOnce();
   } else {
-    PADDLE_ENFORCE(exec_scope_ == exec_scope, "Scope must be the same");
+    PADDLE_ENFORCE_EQ(exec_scope_, exec_scope,
+                      platform::errors::InvalidArgument(
+                          "The given execution scope and the cached execution "
+                          "scope should be the same."));
   }
 
   for (size_t i = 0; i < in_var_infos_.size(); ++i) {
@@ -115,6 +152,13 @@ void ShareTensorBufferFunctor::operator()(Scope *exec_scope) {
     } else {
       out_tensor->ShareBufferWith(in_tensor);
 
+      // NOTE(zhiqiu): In the case of inplace addto, if the operator of
+      // the in_out_vars is skipped during running, we should set the dims of
+      // output as the same as input.
+      if (share_dims_) {
+        out_tensor->Resize(in_tensor.dims());
+      }
+
       VLOG(2) << "Share tensor buffer when running " << op_type_ << " : "
               << in_var_info->Name() << " -> " << out_var_names_[i];
     }
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.h b/paddle/fluid/framework/details/share_tensor_buffer_functor.h
index 774dcd056e59bc8f090a5ceb916e73843c8c9df6..be49d1c432b2ab2b9741d873ba005b400e9f0829 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_functor.h
+++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.h
@@ -19,6 +19,7 @@
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
 #include "paddle/fluid/framework/scope.h"
@@ -40,11 +41,13 @@ class ShareTensorBufferFunctor {
   ShareTensorBufferFunctor(
       Scope *scope, size_t scope_idx, const std::string &op_type,
       const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
-      const std::vector<std::string> &out_var_names);
+      const std::vector<std::string> &out_var_names, bool share_dims = false);
 
   void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
                        const std::string &out_var_name);
 
+  void SetShareDims(bool share_dims) { share_dims_ = share_dims; }
+
   void operator()(Scope *exec_scope);
 
   std::unordered_map<std::string, std::string> ReusedVars() const;
@@ -66,6 +69,11 @@ class ShareTensorBufferFunctor {
   std::vector<std::string> out_var_names_;
 
   std::vector<std::pair<const Variable *, Variable *>> in_out_vars_;
+
+  // NOTE(zhiqiu): In the case of inplace addto, if the operator of
+  // the in_out_vars is skipped during running, we should set the dims of output
+  // as the same as input.
+  bool share_dims_{false};
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
index f06507257f1e9fc8b1783201adb533ec7b032c09..be3f5515a971900258ab5914b579deffe5d5b7d6 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
+++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
+
 #include <string>
 #include <unordered_set>
+
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
@@ -32,26 +34,35 @@ ComputationOpHandle *GetUniquePendingComputationOpHandle(
     for (ir::Node *pending_op : out_var->outputs) {
       auto &op = pending_op->Wrapper<OpHandleBase>();
       auto *compute_op = dynamic_cast<ComputationOpHandle *>(&op);
-      PADDLE_ENFORCE_NOT_NULL(compute_op);
+      PADDLE_ENFORCE_NOT_NULL(
+          compute_op,
+          platform::errors::PreconditionNotMet(
+              "The pending OpHandle should be ComputationOpHandle."));
 
       if (result_op == nullptr) {
         result_op = compute_op;
       } else {
-        PADDLE_ENFORCE_EQ(result_op, compute_op);
+        PADDLE_ENFORCE_EQ(
+            result_op, compute_op,
+            platform::errors::PreconditionNotMet(
+                "The pending OpHandle should be the unique one."));
       }
     }
   }
 
-  PADDLE_ENFORCE_NOT_NULL(result_op);
+  PADDLE_ENFORCE_NOT_NULL(result_op,
+                          platform::errors::PreconditionNotMet(
+                              "The pending OpHandle should not be NULL."));
   return result_op;
 }
 
 ShareTensorBufferOpHandle::ShareTensorBufferOpHandle(
     ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type,
     const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
-    const std::vector<std::string> &out_var_names)
+    const std::vector<std::string> &out_var_names, bool share_dims)
     : OpHandleBase(node),
-      functor_(scope, scope_idx, op_type, in_var_infos, out_var_names) {}
+      functor_(scope, scope_idx, op_type, in_var_infos, out_var_names,
+               share_dims) {}
 
 std::unordered_map<std::string, std::string>
 ShareTensorBufferOpHandle::ReusedVars() const {
@@ -63,6 +74,10 @@ void ShareTensorBufferOpHandle::AddReuseVarPair(
   functor_.AddReuseVarPair(in_var_info, out_var_name);
 }
 
+void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) {
+  functor_.SetShareDims(share_dims);
+}
+
 void ShareTensorBufferOpHandle::InitCUDA() {
 #ifdef PADDLE_WITH_CUDA
   int dev_id =
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h
index b22f5621fe44d887d70d82ce4dc9e26596d23f4e..a02c346485eca813f0d0f0b432b8b647e2fe4414 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h
+++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h
@@ -17,6 +17,7 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
@@ -31,7 +32,7 @@ class ShareTensorBufferOpHandle : public OpHandleBase {
       ir::Node *node, Scope *scope, size_t scope_idx,
       const std::string &op_type,
       const std::vector<const ir::MemOptVarInfo *> &in_vars_infos,
-      const std::vector<std::string> &out_var_names);
+      const std::vector<std::string> &out_var_names, bool share_dims = false);
 
   std::unordered_map<std::string, std::string> ReusedVars() const;
 
@@ -42,6 +43,8 @@ class ShareTensorBufferOpHandle : public OpHandleBase {
   void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
                        const std::string &out_var_name);
 
+  void SetShareDims(bool share_dims);
+
   const ShareTensorBufferFunctor &Functor() const { return functor_; }
 
  protected:
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc
index 71123f708e3ca149d9fd634f55652cede5a57b50..2723a46dcfae3582a9286bcacba8d2e0a4990ac5 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/ssa_graph_executor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
+
 #include "paddle/fluid/framework/details/fetch_async_op_handle.h"
 
 namespace paddle {
@@ -27,8 +28,9 @@ void ClearFetchOp(ir::Graph* graph, std::vector<OpHandleBase*>* fetch_ops) {
     PADDLE_ENFORCE_EQ(dynamic_cast<FetchOpHandle*>(op) != nullptr ||
                           dynamic_cast<FetchAsyncOpHandle*>(op) != nullptr,
                       true,
-                      "The input ops of ClearFetchOp function should be "
-                      "FetchOpHandle or FetchAsyncOpHandle.");
+                      platform::errors::PreconditionNotMet(
+                          "The input ops of ClearFetchOp function should be "
+                          "FetchOpHandle or FetchAsyncOpHandle."));
     for (auto& out_var : op->Node()->outputs) {
       graph->RemoveNode(out_var);
     }
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 92c3a0cd6b9c01497199fece0a9bdafc89f64678..2ed52b3bd94733e329ccf8270054b23b1ad29d87 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -138,7 +139,10 @@ inline FetchResultType ThreadedSSAGraphExecutor::RunImpl(
         }
       }
     }
-    PADDLE_ENFORCE(ready_ops.empty());
+    PADDLE_ENFORCE_EQ(
+        ready_ops.empty(), true,
+        platform::errors::Fatal("After the execution of computation graph, "
+                                "there are unexecuted operators left."));
   }
 
   // Wait FetchOps.
@@ -165,9 +169,8 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
     FetchResultType *fetch_data, bool return_merged) {
   std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
   std::unordered_set<VarHandleBase *> local_ready_vars;
-  std::unordered_set<std::string> fetch_tensor_set(fetch_tensors.begin(),
-                                                   fetch_tensors.end());
-  for (auto &fetch_var_name : fetch_tensor_set) {
+
+  for (auto &fetch_var_name : fetch_tensors) {
     for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
       auto it = var_map.find(fetch_var_name);
       if (it != var_map.end()) {
@@ -231,7 +234,11 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
       ready_ops->insert(static_cast<OpHandleBase *>(op));
     }
   }
-  PADDLE_ENFORCE_EQ(local_ready_vars.size(), 0);
+  PADDLE_ENFORCE_EQ(
+      local_ready_vars.size(), 0,
+      platform::errors::Fatal(
+          "The number of ready variables should be 0, but got %d.",
+          local_ready_vars.size()));
 }
 
 void ThreadedSSAGraphExecutor::InsertPendingOp(
@@ -277,7 +284,9 @@ void ThreadedSSAGraphExecutor::PrepareOpDeps() {
     }
   }
   op_deps_->num_ops_ = ready_ops.size() + pending_ops.size();
-  PADDLE_ENFORCE_GT(op_deps_->num_ops_, 0, "The graph doesn't have operators.");
+  PADDLE_ENFORCE_GT(
+      op_deps_->num_ops_, 0,
+      platform::errors::InvalidArgument("The graph doesn't have operators."));
 
   for (auto ready_var : ready_vars) {
     pending_vars.erase(ready_var);
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index b8b584f27200bd3f89efcc20be2c6a3435274a56..45fa3adbf14080317fe004a7113b58d34145447d 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <ThreadPool.h>  // ThreadPool in thrird party
+
 #include <deque>
 #include <functional>
 #include <list>
@@ -24,8 +26,6 @@
 #include <utility>
 #include <vector>
 
-#include <ThreadPool.h>  // ThreadPool in thrird party
-
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index 86428f8b7613760f59a1166189c61f3217d8017d..bb38424d3ae2d74f6f0a48e11df95b60dbf432f3 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -54,8 +54,10 @@ struct VarHandleBase {
 
   void AddOutput(OpHandleBase* out, ir::Node* node) {
     if (pending_ops_.find(out) == pending_ops_.end()) {
-      PADDLE_ENFORCE(out != nullptr, "The output of %s should not be nullptr",
-                     this->Node()->Name());
+      PADDLE_ENFORCE_NOT_NULL(out,
+                              platform::errors::InvalidArgument(
+                                  "The output added to VarHandle %s is NULL.",
+                                  this->Node()->Name()));
       pending_ops_.insert(out);
       node_->outputs.push_back(node);
     }
@@ -120,7 +122,10 @@ struct VarHandle : public VarHandleBase {
   bool HasEvent() { return has_event_; }
 
   const cudaEvent_t& GetEvent() {
-    PADDLE_ENFORCE(HasEvent(), "The event is not set.");
+    PADDLE_ENFORCE_EQ(
+        HasEvent(), true,
+        platform::errors::PreconditionNotMet(
+            "The cuda event is not set, maybe InitCUDA() is not called."));
     return event_;
   }
 
diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc
index 134f759081a0778194c20785e215420d6e2bb622..fba0c1bf463ee0b9a434c350474af4be0c589e30 100644
--- a/paddle/fluid/framework/details/variable_visitor.cc
+++ b/paddle/fluid/framework/details/variable_visitor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/details/variable_visitor.h"
+
 #include "paddle/fluid/framework/selected_rows.h"
 namespace paddle {
 namespace framework {
@@ -24,7 +25,9 @@ static void VisitVariable(Variable* var, Func* func) {
   } else if (var->IsType<SelectedRows>()) {
     (*func)(var->GetMutable<SelectedRows>());
   } else {
-    PADDLE_THROW("Not supported type %s", ToTypeName(var->Type()));
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "VisitVariable is not supported for type %s.",
+        ToTypeName(var->Type())));
   }
 }
 
@@ -35,7 +38,8 @@ static void VisitVariable(const Variable& var, Func* func) {
   } else if (var.IsType<SelectedRows>()) {
     (*func)(var.Get<SelectedRows>());
   } else {
-    PADDLE_THROW("Not supported type %s", ToTypeName(var.Type()));
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "VisitVariable is not supported for type %s.", ToTypeName(var.Type())));
   }
 }
 
@@ -50,7 +54,8 @@ struct TensorVisitor {
 
   template <typename T>
   void operator()() {
-    PADDLE_THROW("Not Support to get LoDTensor from %s", typeid(T).name());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Getting tensor from type %s is not supported.", typeid(T).name()));
   }
 };
 
@@ -78,8 +83,8 @@ struct ShareDimsAndLoDVisitor {
 
   template <typename T>
   void operator()(const T&) {
-    PADDLE_ENFORCE("ShareDimsAndLoD is not supported by type %s",
-                   typeid(T).name());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "ShareDimsAndLoD is not supported for type %s.", typeid(T).name()));
   }
 };
 
@@ -89,42 +94,54 @@ void VariableVisitor::ShareDimsAndLoD(const Variable& src, Variable* trg) {
 }
 
 struct EnforceShapeAndDTypeEQVisitor {
-  const Variable* trg_;
+  const Variable* dst_;
 
   void operator()(const LoDTensor& src) {
-    auto& tensor = trg_->Get<LoDTensor>();
-    PADDLE_ENFORCE_EQ(
-        src.place().which(), tensor.place().which(),
-        "The Places of the two Variable must be all on CPU or all on GPU.");
+    auto& tensor = dst_->Get<LoDTensor>();
+    PADDLE_ENFORCE_EQ(src.place().which(), tensor.place().which(),
+                      platform::errors::PreconditionNotMet(
+                          "The place type of the two variables is not equal."));
     PADDLE_ENFORCE_EQ(src.type(), tensor.type(),
-                      "The dtype of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.dims(), tensor.dims(),
-                      "The dims of the two Variable is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The dtype of the two variables is not equal."));
+    PADDLE_ENFORCE_EQ(
+        src.dims(), tensor.dims(),
+        platform::errors::PreconditionNotMet(
+            "The layout of the two variables' tensors is not equal."));
     PADDLE_ENFORCE_EQ(src.lod(), tensor.lod(),
-                      "The lod of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.layout(), tensor.layout(),
-                      "The layout of the two Variable's tensor is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The lod of the two variable is not equal."));
+    PADDLE_ENFORCE_EQ(
+        src.layout(), tensor.layout(),
+        platform::errors::PreconditionNotMet(
+            "The layout of the two variables' tensors tensor is not equal."));
   }
 
   void operator()(const SelectedRows& src) {
-    auto& selected_rows = trg_->Get<SelectedRows>();
-    PADDLE_ENFORCE_EQ(
-        src.place().which(), selected_rows.place().which(),
-        "The Places of the two Variable must be all on CPU or all on GPU.");
+    auto& selected_rows = dst_->Get<SelectedRows>();
+    PADDLE_ENFORCE_EQ(src.place().which(), selected_rows.place().which(),
+                      platform::errors::PreconditionNotMet(
+                          "The place type of the two variables is not equal."));
     PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(),
-                      "The dtype of the two Variable is not equal.");
-    PADDLE_ENFORCE_EQ(src.value().layout(), selected_rows.value().layout(),
-                      "The layout of the two Variable's tensor is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The dtype of the two variables is not equal."));
+    PADDLE_ENFORCE_EQ(
+        src.value().layout(), selected_rows.value().layout(),
+        platform::errors::PreconditionNotMet(
+            "The layout of the two variables' tensors is not equal."));
     PADDLE_ENFORCE_EQ(src.height(), selected_rows.height(),
-                      "The height of the two Variable is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The height of the two variables is not equal."));
     PADDLE_ENFORCE_EQ(src.GetCompleteDims(), selected_rows.GetCompleteDims(),
-                      "The dims of the two Variable is not equal.");
+                      platform::errors::PreconditionNotMet(
+                          "The dims of the two variables is not equal."));
   }
 
   template <typename T>
   void operator()(const T&) {
-    PADDLE_ENFORCE("EnforceShapeAndDTypeEQ is not supported by type %s",
-                   typeid(T).name());
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "EnforceShapeAndDTypeEQ is not supported for type %s.",
+        typeid(T).name()));
   }
 };
 
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 04befbe1ca01d4bfec5872a63565f21d110a6c67..efe6fa1b2daffcbbaa2b7945e7139ac83f689bcd 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -441,6 +441,7 @@ class SectionWorker : public DeviceWorker {
     skip_vars_ = skip_vars;
   }
   static void ResetBatchId() { batch_id_ = 0; }
+  static void ResetThreadCompletedFlag() { threads_completed = false; }
 
   static std::atomic<int> cpu_id_;
 
diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc
index bb958f1ac015bfd1a71b3ccd530406a33e4e37cb..f195dde40843c8c4ee5168d11ad0b8eac8199f4e 100644
--- a/paddle/fluid/framework/fleet/gloo_wrapper.cc
+++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc
@@ -19,6 +19,8 @@ limitations under the License. */
 namespace gloo {
 namespace rendezvous {
 
+constexpr int kNodeSize = 136;
+
 HdfsStore::HdfsStore(const std::string& path) {
   path_ = path;
   wait_sleep_ms_ = 10000;
@@ -213,12 +215,14 @@ void ParallelConnectContext::connectFullMesh(
   storeKey << rank;
   store.set(storeKey.str(), allBytes);
 
+  auto total_add_size = kNodeSize * (size - 1);
+
   std::vector<std::shared_ptr<std::thread>> connect_threads(thread_num_);
   // Connect every pair
   for (uint32_t i = 0; i < connect_threads.size(); ++i) {
     connect_threads[i].reset(new std::thread(
-        [&store, &transportContext, this](size_t thread_idx,
-                                          size_t thread_num) -> void {
+        [&store, &transportContext, total_add_size, this](
+            size_t thread_idx, size_t thread_num) -> void {
           for (int i = thread_idx; i < size; i += thread_num) {
             if (i == rank) {
               continue;
@@ -226,8 +230,23 @@ void ParallelConnectContext::connectFullMesh(
             // Wait for address of other side of this pair to become available
             std::string key = std::to_string(i);
             store.wait({key}, getTimeout());
+
+            std::vector<char> allAddrs;
+            auto max_retry_times = 5;
             // Connect to other side of this pair
-            auto allAddrs = store.get(key);
+
+            while (max_retry_times > 0) {
+              allAddrs = store.get(key);
+
+              VLOG(3) << "store get all address size: " << allAddrs.size()
+                      << " except: " << total_add_size;
+              if (allAddrs.size() == static_cast<size_t>(total_add_size)) {
+                break;
+              }
+
+              --max_retry_times;
+            }
+
             auto addr = extractAddress(allAddrs, i);
             transportContext->getPair(i)->connect(addr);
           }
diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
index b50b4f37caecd8d8d5c393ee3a5c5b76c1f406be..fd8b55a6b7deb9bf4685b27f8849a49ab77f64e9 100644
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
@@ -18,6 +18,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -225,3 +226,14 @@ REGISTER_PASS(conv_affine_channel_fuse_pass,
               paddle::framework::ir::ConvAffineChannelFusePass);
 REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass,
               paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass);
+REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("affine_channel", 0));
+REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("affine_channel", 0));
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
index 9d3e0806ac79d838765ca5a4bbf61d0f67ab6ed5..fb787e08814429781bf64efda2f1eb915f185f63 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -18,6 +18,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -372,3 +373,14 @@ REGISTER_PASS(depthwise_conv_bn_fuse_pass,
               paddle::framework::ir::DepthwiseConvBNFusePass);
 REGISTER_PASS(depthwise_conv_eltwiseadd_bn_fuse_pass,
               paddle::framework::ir::DepthwiseConvEltwiseAddBNFusePass);
+REGISTER_PASS_CAPABILITY(conv_bn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("batch_norm", 0));
+REGISTER_PASS_CAPABILITY(conv_eltwiseadd_bn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("batch_norm", 0));
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
index 2627da7dc40f19a9df22d2f44a4b1032df5cea01..ad6af69ae02e4f6262ee8760dbda90e0b5833feb 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
@@ -11,9 +11,9 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
 #include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h"
 #include <string>
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -116,3 +116,10 @@ void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(conv_elementwise_add2_act_fuse_pass,
               paddle::framework::ir::ConvElementwiseAdd2ActFusePass);
+REGISTER_PASS_CAPABILITY(conv_elementwise_add2_act_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("relu", 0)
+            .EQ("identity", 0));
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
index 0b454a0407e48fcf2693975b00c60ee5448786e4..c5fa47ec55fe9a15caca493a4b0c72c22f2cf5c7 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h"
 #include <string>
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -102,3 +103,10 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(conv_elementwise_add_act_fuse_pass,
               paddle::framework::ir::ConvElementwiseAddActFusePass);
+REGISTER_PASS_CAPABILITY(conv_elementwise_add_act_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("relu", 0)
+            .EQ("identity", 0));
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
index 007770cf57d278d155650c00996413e3bc8e7b53..38c0b773ddeddddea68ecefd6c8525449c52d7a6 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <string>
-
 #include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h"
+#include <string>
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -89,3 +89,8 @@ void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(conv_elementwise_add_fuse_pass,
               paddle::framework::ir::ConvElementwiseAddFusePass);
+REGISTER_PASS_CAPABILITY(conv_elementwise_add_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0));
diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
index 7612df9ab915011001b57b37e4fd559d393302a7..3f88a460d140f6d7389194a29e37128f5ba5b458 100644
--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc
@@ -19,6 +19,7 @@
 #include <vector>
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -334,3 +335,8 @@ void EmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const {
 
 REGISTER_PASS(embedding_eltwise_layernorm_fuse_pass,
               paddle::framework::ir::EmbeddingEltwiseLayerNormFusePass);
+REGISTER_PASS_CAPABILITY(embedding_eltwise_layernorm_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("lookup_table", 0)
+            .EQ("elementweise_add", 0));
diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc
index 71c9dbae1a46af1ecae0aaff3fde52de8142d4bb..727e42629f9fab9183668ae0cc84ae54eb01982c 100644
--- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc
@@ -16,12 +16,13 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
-TEST(SkipLayerNormFusePass, basic) {
+TEST(EmbeddingElewiseLayernormFusePass, basic) {
   // inputs                           operator            output
   // --------------------------------------------------------------------
   // (x, y)                       elementwise_add    -> elementwise_out
@@ -91,6 +92,12 @@ TEST(SkipLayerNormFusePass, basic) {
           "The number of fusion nodes does not meet expectations after fuse"));
 }
 
+TEST(EmbeddingElewiseLayernormFusePass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("embedding_eltwise_layernorm_fuse_pass"));
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
index c50b7476c6a9616a784646b3ef6a43140ac2d401..02e3e2542f6e8dea47c53fd298c7ae7512a72c36 100644
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
@@ -23,6 +23,8 @@
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/platform/cpu_info.h"
 
+#include "paddle/fluid/framework/op_version_registry.h"
+
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -34,7 +36,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
 
   // Build pattern
   PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x"))
-                  ->assert_is_op_input("lookup_table")
+                  ->assert_is_op_input("lookup_table_v2")
                   ->assert_var_not_persistable();
   patterns::Embedding embedding_pattern(pattern, name_scope);
   // TODO(jczaja): Intermediate can only be for val that are not used anywhere
@@ -256,3 +258,11 @@ void EmbeddingFCLSTMFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(embedding_fc_lstm_fuse_pass,
               paddle::framework::ir::EmbeddingFCLSTMFusePass);
+REGISTER_PASS_CAPABILITY(embedding_fc_lstm_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("lookup_table_v2", 0)
+            .EQ("mul", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("lstm", 0)
+            .EQ("fused_embedding_fc_lstm", 0));
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc
index 066a8fb975740ad5e45b4840a7404160d086b6f0..d60510a4074997a028cd914ca7a0e76335801c80 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -18,6 +18,7 @@
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -182,3 +183,10 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const {
 
 REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass)
     .RequirePassAttr("use_gpu");
+REGISTER_PASS_CAPABILITY(fc_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("mul", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("relu", 0)
+            .EQ("fc", 0));
diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
index a2185cdc5593cc36ed6ceda839fb13c28b45600c..f5fea90ac2fcee8e9c48ca21203b3b60cd7f7166 100644
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
@@ -16,6 +16,7 @@
 #include <string>
 #include <unordered_set>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -125,7 +126,6 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
     auto* x_n = subgraph.at(x);
     GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, gru_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(gru, gru, gru_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, gru_pattern);
@@ -136,10 +136,17 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
                               gru_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(BatchHidden, BatchHidden, gru_pattern);
 
+    // TODO(wilber): Support origin_mode=True.
+    if (gru->Op()->GetAttrIfExists<bool>("origin_mode") == true) {
+      LOG(INFO) << "fc_gru_fuse_pass not supported when origin_mode=True.";
+      return;
+    }
+
     if (with_fc_bias) {
       GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
       GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
       GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
+      GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern);
 
       gru_creater(gru, x_n, w, Weight, Bias, Hidden, fc_bias);
       // Remove unneeded nodes.
@@ -188,3 +195,16 @@ void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass);
 REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass);
+REGISTER_PASS_CAPABILITY(mul_gru_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("mul", 0)
+            .EQ("gru", 0)
+            .EQ("fusion_gru", 0));
+REGISTER_PASS_CAPABILITY(fc_gru_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("mul", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("gru", 0)
+            .EQ("fusion_gru", 0));
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index 12c7fc051e23a946ec9049e061499056f009bfa3..a3c57e14e1aedbed1e4cf462d4883cd83bf2fa10 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -16,6 +16,7 @@
 #include <string>
 #include <unordered_set>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -196,3 +197,17 @@ void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass);
 REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass);
+
+REGISTER_PASS_CAPABILITY(fc_lstm_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("mul", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("lstm", 0)
+            .EQ("fusion_lstm", 0));
+REGISTER_PASS_CAPABILITY(mul_lstm_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("mul", 0)
+            .EQ("lstm", 0)
+            .EQ("fusion_lstm", 0));
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
index 726a2d90fcf03c3e2023485e983ea64f93231f73..a8c0973cac488ceb96249a898e819af7565c6c7a 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt
@@ -13,4 +13,6 @@ cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handl
 cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass)
 cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_memory_reuse_pass.cc DEPS memory_reuse_pass) 
 
+cc_library(inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass)
+
 cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op)
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
index 0b42f2ebd5555a5c73527d9819ff254411a399d4..ce7f27d27559c70cf164f6bb641fa0ee6f02a2a0 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
@@ -16,6 +16,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
@@ -141,11 +142,12 @@ void BufferSharedInplaceOpPass::Run(Graph *graph) const {
         VLOG(4) << "Inplace performed in op " << op_type << ": "
                 << in_var_handle_ptr->Name() << " -> "
                 << out_var_handle_ptr->Name()
-                << ". Debug String is: " << op->GetOp()->DebugString();
+                << ". Debug String is: " << op->GetOp()->DebugString()
+                << ". ReuseType: " << ReuseType();
       } else {
         VLOG(3) << "Inplace failed in op " << op_type << ": "
                 << in_var_handle_ptr->Name() << " -> "
-                << out_var_handle_ptr->Name();
+                << out_var_handle_ptr->Name() << ". ReuseType: " << ReuseType();
       }
     }
   }
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..81c63f46bda453ec8705cf4bc93dd9e3acf844ec
--- /dev/null
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc
@@ -0,0 +1,221 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
+#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
+#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h"
+#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class InplaceAddToOpPass : public MemoryReusePass {
+ protected:
+  std::string ReuseType() const override { return "inplace_addto"; }
+
+  void Run(Graph *graph) const override;
+
+ private:
+  // 1. Add last living op of in_var, add any last living op of out_var
+  // 2. Set reference count of in_var to be 2
+  void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
+                             details::VarHandle *in_var,
+                             details::VarHandle *out_var) const override {
+    size_t scope_idx = op->GetScopeIdx();
+    auto *last_live_ops_of_vars_ =
+        &Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
+    auto *var_infos_ = &(Get<MemOptVarInfoMapList>(kMemOptVarInfoMapList));
+    auto out_var_op_iter =
+        (*last_live_ops_of_vars_)[scope_idx].find(out_var->Name());
+
+    // In Reduce mode, some output variable(gradient of parameter) does not have
+    // last live ops
+    details::ComputationOpHandle *last_live_op_of_in_var = nullptr;
+    if (out_var_op_iter == (*last_live_ops_of_vars_)[scope_idx].end()) {
+      last_live_op_of_in_var = op;
+    } else {
+      PADDLE_ENFORCE_EQ(
+          out_var_op_iter->second.ops().empty(), false,
+          platform::errors::InvalidArgument(
+              "Var(%s)'s last live op should not empty.", out_var->Name()));
+      last_live_op_of_in_var = *(out_var_op_iter->second.ops().begin());
+    }
+
+    auto *last_live_ops_of_in_var =
+        (*last_live_ops_of_vars_)[scope_idx][in_var->Name()].mutable_ops();
+    // last_live_ops_of_in_var->clear();
+    last_live_ops_of_in_var->insert(last_live_op_of_in_var);
+
+    auto in_var_info_iter = (*var_infos_)[scope_idx].find(in_var->Name());
+    PADDLE_ENFORCE_NE(
+        in_var_info_iter, (*var_infos_)[scope_idx].end(),
+        platform::errors::NotFound("Cannot find variable %s.", in_var->Name()));
+
+    in_var_info_iter->second->SetRefCnt(2);  // before inplace, it is 1
+  }
+};
+
+void InplaceAddToOpPass::Run(Graph *graph) const {
+  const auto &last_live_ops =
+      Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
+
+  bool use_cuda = Get<bool>(kUseCuda);
+
+  // Currently, only perform InplaceAddToOpPass on cuda place
+  if (!use_cuda) {
+    return;
+  }
+
+  // Step 1: Build a reverse map of last_live_ops
+  // i.e.: op -> vars
+  std::unordered_map<details::ComputationOpHandle *,
+                     std::unordered_map<std::string, ir::Node *>>
+      candidate_ops;
+  for (auto &each_scope_ops : last_live_ops) {
+    for (auto &pair : each_scope_ops) {
+      // If variable has more than 1 last lived ops, this variable cannot
+      // be inplaced.
+      if (pair.second.ops().size() != 1) {
+        continue;
+      }
+
+      auto *op = *(pair.second.ops().begin());
+      const std::string &op_type = op->GetOp()->Type();
+      const framework::OpDesc *op_desc = op->Node()->Op();
+      PADDLE_ENFORCE_NOT_NULL(
+          op_desc, platform::errors::NotFound("Op(%s) can not find opdesc.",
+                                              op->Name()));
+
+      // only grad op should be processed.
+      if (op_type != "grad_add") {
+        continue;
+      }
+
+      const std::string &var_name = pair.first;
+      auto in_nodes = this->FindNodesByName(var_name, op->Node()->inputs);
+      if (in_nodes.size() == 1) {
+        candidate_ops[op][var_name] = *in_nodes.begin();
+      }
+      VLOG(4) << "Find op " << op_type << " with input(" << var_name
+              << ") that can do inplace add to";
+    }
+  }
+
+  // Step 2: Check which vars can be inplaced indeed
+  for (auto &op_vars_pair : candidate_ops) {
+    auto *op = op_vars_pair.first;
+
+    // The original gradient accumulation is g = sum(g_0, g_1,..., g_n), and it
+    // could be changed as follws if inplace addto is enabled:
+    // g_sum_0 = g_0
+    // g_sum_1 = grad_add(g_sum_0, g_1)
+    // g_sum_2 = grad_add(g_sum_1, g_2)
+    // ...
+    // g_sum_n = grad_add(g_sum_n-1, g_n)
+
+    // here we will add inplace for each grad_add, for example, for the first
+    // grad_add, g_sum_0 -> g1, g_sum_1 -> g1, and set grad_add as skipped.
+
+    const std::string &op_type = op->GetOp()->Type();
+
+    PADDLE_ENFORCE_EQ(op->Node()->inputs.size(), 2,
+                      platform::errors::InvalidArgument(
+                          "The size of inputs of %s should be 2, but got %d",
+                          op_type, op->Node()->inputs.size()));
+
+    PADDLE_ENFORCE_EQ(op->Node()->outputs.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "The size of outputs of %s should be 1, but got %d",
+                          op_type, op->Node()->outputs.size()));
+
+    auto *left_var_ptr = dynamic_cast<details::VarHandle *>(
+        &(op->Node()->inputs[0]->Wrapper<details::VarHandleBase>()));
+    auto *right_var_ptr = dynamic_cast<details::VarHandle *>(
+        &(op->Node()->inputs[1]->Wrapper<details::VarHandleBase>()));
+    auto *out_var_ptr = dynamic_cast<details::VarHandle *>(
+        &(op->Node()->outputs[0]->Wrapper<details::VarHandleBase>()));
+
+    if (left_var_ptr == nullptr || right_var_ptr == nullptr ||
+        out_var_ptr == nullptr) {
+      continue;
+    }
+
+    // auto *left_generated_op = dynamic_cast<details::ComputationOpHandle *>(
+    //     left_var_ptr->GeneratedOp());
+
+    auto *right_generated_op = dynamic_cast<details::ComputationOpHandle *>(
+        right_var_ptr->GeneratedOp());
+
+    auto *out_generated_op = dynamic_cast<details::ComputationOpHandle *>(
+        out_var_ptr->GeneratedOp());
+
+    // NOTE(zhiqiu): currently, only conv2d_grad supports addto strategy
+    if (right_generated_op->Name() != "conv2d_grad") {
+      continue;
+    }
+
+    // NOTE(zhiqiu): Normally, if we inplace a->b, we should let a generated
+    // before b. However, in the situation of inplace addto, we do not care
+    // the order, since a+b is equal to b+a. Is there any exception for that?
+
+    // AddDependencyVar(right_generated_op, left_generated_op);
+    // no need, as discussed above.
+
+    // step (a): inplace right_var->left_var of grad_add
+
+    this->AddReuseVar(right_generated_op, left_var_ptr, right_var_ptr);
+    UpdateLastLiveOpOfVar(right_generated_op, left_var_ptr, right_var_ptr);
+    VLOG(4) << "Inplace performed in op " << right_generated_op->GetOp()->Type()
+            << ": " << left_var_ptr->Name() << " -> " << right_var_ptr->Name()
+            << ". Debug String is: "
+            << right_generated_op->GetOp()->DebugString()
+            << ". ReuseType: " << ReuseType();
+
+    // step (b): inplace out -> right_var of grad_add
+
+    this->AddReuseVar(out_generated_op, right_var_ptr, out_var_ptr, true);
+
+    VLOG(4) << "Inplace performed in op " << op_type << ": "
+            << left_var_ptr->Name() << " -> " << out_var_ptr->Name()
+            << ". Debug String is: " << op->GetOp()->DebugString()
+            << ". ReuseType: " << ReuseType();
+
+    // step (c): make right_var cannot inplace afterwards. canbe done
+    // aotomatically since CollectReusedVars is called before any reuse.
+
+    // step (d): make right_var's generated op use addto
+    right_generated_op->GetOp()->SetAttr("use_addto", true);
+
+    // step (e): make grad_add skip running
+    op->SetSkipRunning(true);
+  }
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(inplace_addto_op_pass, paddle::framework::ir::InplaceAddToOpPass)
+    .RequirePassAttr(paddle::framework::ir::kMemOptVarInfoMapList)
+    .RequirePassAttr(paddle::framework::ir::kLastLiveOpsOfVars)
+    .RequirePassAttr(paddle::framework::ir::kUseCuda);
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
index 221b0a76e7ef5b01d87c63fb466a9b980f1e69b4..3e3b9864a7b408267ac73de053c1692628e9a14c 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h"
+
 #include <functional>
 #include <map>
 #include <string>
@@ -73,6 +74,7 @@ bool MemoryReusePass::TryReuseVar(details::VarHandle *in_var,
           out_var->Name()));
   if (IsVarPairReusable(*in_var, *out_var)) {
     AddReuseVar(op, in_var, out_var);
+    UpdateLastLiveOpOfVar(op, in_var, out_var);
     return true;
   } else {
     return false;
@@ -324,7 +326,8 @@ bool MemoryReusePass::IsVarPairReusable(
 
 void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op,
                                   details::VarHandle *in_var,
-                                  details::VarHandle *out_var) const {
+                                  details::VarHandle *out_var,
+                                  bool share_dims) const {
   PADDLE_ENFORCE_GT(
       (*var_infos_)[op->GetScopeIdx()].count(in_var->Name()), 0,
       platform::errors::NotFound("Var(%s) does not in mem opt var infos.",
@@ -344,13 +347,15 @@ void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op,
     share_buffer_op->AddInput(in_var);
   }
 
+  if (share_dims) {
+    share_buffer_op->SetShareDims(true);
+  }
+
   share_buffer_op->AddReuseVarPair(
       (*var_infos_)[op->GetScopeIdx()].at(in_var->Name()).get(),
       out_var->Name());
   reused_in_var_names_[op->GetScopeIdx()].insert(in_var->Name());
   reused_out_var_names_[op->GetScopeIdx()].insert(out_var->Name());
-
-  UpdateLastLiveOpOfVar(op, in_var, out_var);
 }
 
 // 1. Set last living op of in_var to be any last living op of out_var
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h
index 822744191847586dc429b6896ff6f490381c5901..1c0c6ae60205b14f97bd15bceeb126d0eb54f654 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h
@@ -18,6 +18,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
@@ -92,6 +93,12 @@ class MemoryReusePass : public Pass {
 
   int64_t GetMemorySize(const details::VarHandle &var) const;
 
+  void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var,
+                   details::VarHandle *out_var, bool share_dims = false) const;
+  virtual void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
+                                     details::VarHandle *in_var,
+                                     details::VarHandle *out_var) const;
+
  private:
   VarDesc *GetVarDesc(const details::VarHandle &var) const;
 
@@ -109,13 +116,6 @@ class MemoryReusePass : public Pass {
 
   void CollectReusedVars() const;
 
-  void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var,
-                   details::VarHandle *out_var) const;
-
-  void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
-                             details::VarHandle *in_var,
-                             details::VarHandle *out_var) const;
-
  private:
   mutable Graph *graph_;
   mutable bool use_cuda_;
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
index 82e0af3c198750296032769f2f3b04658871adb7..f7a8e3e3f6c3c77e978c57eeb7515d8cfce86471 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -84,6 +85,19 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
       VLOG(3) << "do not perform " + type() + "+bias fuse";
       return;
     }
+    if (conv->Op()->HasAttr("dilations")) {
+      auto dilations =
+          BOOST_GET_CONST(std::vector<int>, conv->Op()->GetAttr("dilations"));
+      for (const auto& d : dilations) {
+        if (d != 1) {
+          LOG(WARNING)
+              << "dilation conv not supported in MKLDNN, fuse not apply "
+              << "and set conv attribute use_mkldnn = false";
+          conv->Op()->SetAttr("use_mkldnn", false);
+          return;
+        }
+      }
+    }
 
     auto* eltwise_bias_tensor =
         scope->FindVar(eltwise_bias->Name())->GetMutable<LoDTensor>();
@@ -151,3 +165,8 @@ REGISTER_PASS(conv_transpose_bias_mkldnn_fuse_pass,
               paddle::framework::ir::Conv2DTransposeBiasFusePass);
 REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass,
               paddle::framework::ir::Conv3DBiasFusePass);
+REGISTER_PASS_CAPABILITY(conv_bias_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0));
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
index 88aac001a93ae836d62fe3bf3fc502960eebe70f..455350d2f703c52a9ef3e5714a60573408310080 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
@@ -18,6 +18,7 @@
 #include "paddle/fluid/platform/place.h"
 
 #include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/imperative/type_defs.h"
 
 namespace paddle {
@@ -149,6 +150,12 @@ TEST(ConvBiasFusePass, conv2d_transpose) {
   ASSERT_EQ(pass.type(), std::string("conv2d_transpose"));
 }
 
+TEST(ConvBiasFusePass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("conv_bias_mkldnn_fuse_pass"));
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
index af2b1308e084ee937f26bf90caf2df6fb44e044b..2fb131aceaad28a365e8202dca35cfe53f8f54da 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc
@@ -19,6 +19,7 @@
 #include <memory>
 #include <tuple>
 #include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -341,3 +342,8 @@ void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const {
 
 REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass,
               paddle::framework::ir::ResidualConnectionMKLDNNFusePass);
+REGISTER_PASS_CAPABILITY(conv_elementwise_add_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("elementwise_add", 0));
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
index 8a13596cd50087475bf12b6cfa5920b82e24de31..fd4910fc8e95cd98fe9feaba51e70d5a143ad443 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc
@@ -17,6 +17,7 @@
 
 #include "paddle/fluid/framework/ir/graph_traits.h"
 #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -267,6 +268,12 @@ TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) {
   AssertOpsCount(graph, 2, 1);
 }
 
+TEST(ConvElementwiseAddMKLDNNFusePass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("conv_elementwise_add_mkldnn_fuse_pass"));
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
index c5965701a53d4312d89f1e09f17840b09f1bd5f5..df5ba3314e637fefe930d4c45f431314dd7d8493 100644
--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -57,3 +58,7 @@ void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(depthwise_conv_mkldnn_pass,
               paddle::framework::ir::DepthwiseConvMKLDNNPass);
+REGISTER_PASS_CAPABILITY(depthwise_conv_mkldnn_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination().EQ(
+            "depthwise_conv2d", 0));
diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
index a37565236cd440bd803184d038ad4deb3c0b6150..c6c72ba33d6295d90c502ab88d7d712d76a11aad 100644
--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc
@@ -16,6 +16,8 @@
 
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/framework/op_version_registry.h"
+
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -70,6 +72,12 @@ ProgramDesc BuildProgramDesc() {
   return prog;
 }
 
+TEST(DepthwiseConvMKLDNNPass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("depthwise_conv_mkldnn_pass"));
+}
+
 TEST(DepthwiseConvMKLDNNPass, basic) {
   auto prog = BuildProgramDesc();
 
diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
index 198107ea082dc86d9e65a926bf9befe2fc4abfa4..9d2b4ebaf8ccf33e175e46c08657e7eeed467055 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -19,6 +19,7 @@
 #include <vector>
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/errors.h"
 
 namespace paddle {
@@ -707,3 +708,13 @@ REGISTER_PASS(multihead_matmul_fuse_pass,
 
 REGISTER_PASS(multihead_matmul_fuse_pass_v2,
               paddle::framework::ir::MultiHeadMatmulV2FusePass);
+REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("mul", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("reshape2", 0)
+            .EQ("transpose2", 0)
+            .EQ("scale", 0)
+            .EQ("matmul", 0)
+            .EQ("softmax", 0));
diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc
index d8a06b037bdefbe8776c9b95b36be80afb988393..2eda643d4e53aa061908f02c9d31b765241c318b 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc
@@ -12,6 +12,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h"  // NOLINT
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -133,6 +134,12 @@ TEST(MultiHeadMatmulFusePass, basic) {
                         num_fused_nodes_after));
 }
 
+TEST(MultiHeadMatmulFusePass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("multihead_matmul_fuse_pass_v2"));
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
index 2396a7f3c4f84f70c2f350e2121c4044c56b141a..23f794c11c239225b31cea8a7e7f11f576c87081 100644
--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 #define MAX_NUM_FC 10
 
@@ -174,6 +175,10 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern,
             if (x->outputs.size() <= 0 || x->inputs.size() <= 0U) {
               return false;
             }
+            if (x->IsVar() && x->Var() && x->Var()->GetShape().size() > 2) {
+              LOG(WARNING) << "repeated fc relu only supports input dims = 2";
+              return false;
+            }
             int fc_idx = FindFCIdx(x);
             if (fc_idx < 0) {
               return false;
@@ -384,3 +389,8 @@ void RepeatedFCReluFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(repeated_fc_relu_fuse_pass,
               paddle::framework::ir::RepeatedFCReluFusePass);
+REGISTER_PASS_CAPABILITY(repeated_fc_relu_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("fc", 0)
+            .EQ("relu", 0));
diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
index 1485a84d001acef8542a9dda5436cfeb57518d69..75ab04f1b9130dccd42cea39dc0e074e2e2838eb 100644
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
@@ -16,6 +16,7 @@
 #include <string>
 #include <unordered_set>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -98,3 +99,9 @@ void SeqConvEltAddReluFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(seqconv_eltadd_relu_fuse_pass,
               paddle::framework::ir::SeqConvEltAddReluFusePass);
+REGISTER_PASS_CAPABILITY(seqconv_eltadd_relu_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("sequence_conv", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("relu", 0));
diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
index d9a65e71592ff464a2e6beaa2219a39103f6cae1..74ba0093a17beb5d30cd0234faf948d8a7dd620d 100644
--- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
+++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
@@ -16,6 +16,7 @@
 
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -34,6 +35,8 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
   const std::string pattern_name = "shufflechannel_pattern";
   FusePassBase::Init(pattern_name, graph);
 
+  LOG(WARNING) << "There is fluid.layers.shuffle_channel API already, you can "
+                  "use it instead of (reshape + transpose +reshape)";
   GraphPatternDetector gpd;
   auto* x = gpd.mutable_pattern()
                 ->NewNode("x")
@@ -93,3 +96,8 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(shuffle_channel_detect_pass,
               paddle::framework::ir::ShuffleChannelDetectPass);
+REGISTER_PASS_CAPABILITY(shuffle_channel_detect_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("reshape2", 0)
+            .EQ("transpose2", 0));
diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
index 9dddc9154f8fc39144b38535824999b933a92106..2e3cd16d5ce49fdd6186f98c72d77c75c4053559 100644
--- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -180,3 +181,8 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
 
 REGISTER_PASS(skip_layernorm_fuse_pass,
               paddle::framework::ir::SkipLayerNormFusePass);
+REGISTER_PASS_CAPABILITY(skip_layernorm_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("elementwise_add", 0)
+            .EQ("layer_norm", 0));
diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc
index d2d7469872857a070294520a589fee4ca383f065..eff5dcddf54ee49be5b14a7bdfa609079f925036 100644
--- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/ir/pass_tester_helper.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -54,6 +55,12 @@ TEST(SkipLayerNormFusePass, basic) {
           "The number of fusion nodes does not meet expectations after fuse"));
 }
 
+TEST(SkipLayerNormFusePass, pass_op_version_check) {
+  ASSERT_TRUE(
+      paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance()
+          .IsPassCompatible("skip_layernorm_fuse_pass"));
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
index 035b198bdcc51800be62acce58a538145413e92f..d74843611cdd238f1fb78153e6b946ae8a1c8473 100644
--- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
@@ -17,6 +17,7 @@
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -77,7 +78,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
   };
 
   auto is_fusion_input_var = [=](Node* x, const std::string& arg_name) {
-    bool basic = var_is_op_input(x, "matmul", arg_name) &&
+    bool basic = (var_is_op_input(x, "matmul_v2", arg_name) ||
+                  var_is_op_input(x, "matmul", arg_name)) &&
                  var_is_op_input(x, "square", "X");
     if (!basic) {
       return false;
@@ -88,7 +90,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
     }
     auto* squared_x = squared_x_op->outputs[0];
     bool next_is_matmul_from_arg =
-        var_is_op_input(squared_x, "matmul", arg_name) &&
+        (var_is_op_input(squared_x, "matmul_v2", arg_name) ||
+         var_is_op_input(squared_x, "matmul", arg_name)) &&
         squared_x->outputs.size() == 1 &&
         squared_x->outputs[0]->outputs.size() == 1;
     if (!next_is_matmul_from_arg) {
@@ -103,7 +106,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
   auto is_fusion_first_mul_out = [=](Node* x) -> bool {
     bool input_is_matmul_op = x && x->inputs.size() == 1 &&
                               x->inputs[0]->IsOp() &&
-                              x->inputs[0]->Op()->Type() == "matmul";
+                              (x->inputs[0]->Op()->Type() == "matmul_v2" ||
+                               x->inputs[0]->Op()->Type() == "matmul");
     if (!input_is_matmul_op) {
       return false;
     }
@@ -167,7 +171,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
 
   auto* matmul_xy_op = pattern->NewNode(
       [=](Node* x) {
-        return x && x->IsOp() && x->Op()->Type() == "matmul" &&
+        return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" ||
+                                  x->Op()->Type() == "matmul") &&
                is_fusion_first_mul_out(x->outputs[0]);
       },
       name_scope + "/matmul_xy_op");
@@ -189,7 +194,9 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
 
   auto is_fusion_mat_squared_x_y_op_out = [=](Node* x) -> bool {
     bool basic = x && x->IsVar() && x->inputs.size() == 1 &&
-                 x->inputs[0]->IsOp() && x->inputs[0]->Op()->Type() == "matmul";
+                 x->inputs[0]->IsOp() &&
+                 (x->inputs[0]->Op()->Type() == "matmul_v2" ||
+                  x->inputs[0]->Op()->Type() == "matmul");
     if (!basic) {
       return false;
     }
@@ -206,7 +213,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
 
   auto* matmul_squared_x_y_op = pattern->NewNode(
       [=](Node* x) {
-        return x && x->IsOp() && x->Op()->Type() == "matmul" &&
+        return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" ||
+                                  x->Op()->Type() == "matmul") &&
                is_fusion_mat_squared_x_y_op_out(x->outputs[0]);
       },
       name_scope + "/matmul_squared_x_y_op");
@@ -378,3 +386,13 @@ void SquaredMatSubFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(squared_mat_sub_fuse_pass,
               paddle::framework::ir::SquaredMatSubFusePass);
+REGISTER_PASS_CAPABILITY(squared_mat_sub_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("matmul", 0)
+            .EQ("matmul_v2", 0)
+            .EQ("square", 0)
+            .EQ("elementwise_mul", 0)
+            .EQ("elementwise_sub", 0)
+            .EQ("fill_constant", 0)
+            .EQ("fusion_squared_mat_sub", 0));
diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
index b6165a512acdb9b6e3bdbf49196692ef83edb58f..56b7ec9b84314bd3634c406c31e20dd421f7fa92 100644
--- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
+++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
@@ -24,7 +24,7 @@ namespace framework {
 namespace ir {
 
 /**
- * Fuse ( (A.^2 * B.^2) - (A * B).^2 ) .* scalar
+ * Fuse ( (A * B).^2 - (A.^2 * B.^2) ) .* scalar
  */
 class SquaredMatSubFusePass : public FusePassBase {
  public:
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index ebecbf0498c384a55627e2b5cb31304d098a444c..bd52d7ffef5040f596bfb5ca9521a6e1062bb5aa 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -157,6 +157,14 @@ class OperatorBase {
         platform::errors::NotFound("(%s) is not found in AttributeMap.", name));
     return BOOST_GET_CONST(T, attrs_.at(name));
   }
+  void SetAttr(const std::string& name, const Attribute& v) {
+    PADDLE_ENFORCE_EQ(
+        HasAttr(name), true,
+        platform::errors::NotFound(
+            "The attribute %s is not found in operator %s", name, Type()));
+
+    attrs_[name] = v;
+  }
   const AttributeMap& Attrs() const { return attrs_; }
 
   const VariableNameMap& Inputs() const { return inputs_; }
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 12e0f97f1262ca0f6bf8fc70ab5b482fb0bdd305..535ec9cd7d950588fd7877d0913e3e851f8fe8dc 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/parallel_executor.h"
+
 #include <algorithm>
 #include <memory>
 #include <string>
 #include <tuple>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/details/async_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
@@ -108,6 +110,11 @@ class ParallelExecutorPrivate {
    *                                       them.
    */
   inline void SetSkipMemoryReuse(size_t scope_idx, const std::string &name) {
+    if (mem_opt_var_infos_.size() == 0) {
+      VLOG(4) << "The mem_opt_var_infos_ is empty, maybe no memory "
+                 "optimization strategy is enabled";
+      return;
+    }
     auto iter = mem_opt_var_infos_[scope_idx].find(name);
     if (iter != mem_opt_var_infos_[scope_idx].end()) {
       iter->second->SetSkipMemoryReuse(true);
@@ -308,6 +315,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
   }
 
   bool need_mem_opt = build_strategy_.enable_inplace_ ||
+                      build_strategy_.enable_addto_ ||
                       build_strategy_.memory_optimize_.get() || is_gc_enabled;
 
   if (!need_mem_opt) return graph;
@@ -320,6 +328,16 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
   graph = ref_cnt_pass->Apply(graph);
   VLOG(10) << "ReferenceCountPass Applied";
 
+  if (build_strategy_.enable_addto_) {
+    auto addto_pass = ir::PassRegistry::Instance().Get("inplace_addto_op_pass");
+    addto_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_);
+    addto_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars);
+    addto_pass->SetNotOwned(ir::kUseCuda, &use_cuda_);
+    VLOG(10) << "Start to apply inplace_addto_op_pass";
+    graph = addto_pass->Apply(graph);
+    VLOG(10) << "inplace_addto_op_pass Applied";
+  }
+
   if (build_strategy_.enable_inplace_) {
     auto inplace_pass =
         ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass");
@@ -1068,3 +1086,4 @@ USE_PASS(reference_count_pass);
 USE_PASS(eager_deletion_pass);
 USE_PASS(buffer_shared_inplace_pass);
 USE_PASS(buffer_shared_cross_op_memory_reuse_pass);
+USE_PASS(inplace_addto_op_pass);
diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc
index 758b728fd9cffff6867a46a6c22c86e496103b84..d7506edbf4ca74976f067d879dad7df4923f946f 100644
--- a/paddle/fluid/framework/pipeline_trainer.cc
+++ b/paddle/fluid/framework/pipeline_trainer.cc
@@ -251,6 +251,7 @@ void PipelineTrainer::Finalize() {
   }
   root_scope_->DropKids();
   SectionWorker::ResetBatchId();
+  SectionWorker::ResetThreadCompletedFlag();
 }
 
 Scope* PipelineTrainer::GetWorkerScope(int thread_id) {
diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index 03b7afbb8771fadbe07a352497fa69a299928cf7..b9a3cac0ec4c9863f4e0dea102965eff8c47fe8d 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -196,7 +196,6 @@ void SectionWorker::TrainFiles() {
         if (threads_completed) {
           VLOG(3) << "thread " << thread_id_ << " completed.";
           lk.unlock();
-          threads_completed = false;
           return;
         }
         lk.unlock();
@@ -459,7 +458,6 @@ void SectionWorker::TrainFilesWithProfiler() {
                     << ", mean_time: " << op_total_time[i] / op_count[i];
           }
           VLOG(0) << "================================";
-          threads_completed = false;
           return;
         }
         lk.unlock();
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index c19e77d2714bcfc18c2cf2a98511d31a97295daa..19f52422b441faf45204f47adbcf4e6aae30f6f1 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -156,7 +156,8 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
                   // "seqpool_concat_fuse_pass",    //
                   "seqpool_cvm_concat_fuse_pass",  //
                   // "embedding_fc_lstm_fuse_pass", //
-                  "fc_lstm_fuse_pass",                       //
+                  // TODO(wilber): fix correctness problem.
+                  // "fc_lstm_fuse_pass",                       //
                   "mul_lstm_fuse_pass",                      //
                   "fc_gru_fuse_pass",                        //
                   "mul_gru_fuse_pass",                       //
diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
index cdc0e415d46739c646cc2a26dfd6ec5333973b25..9fff558c583596215c191a31e95b4e9b2aad058b 100644
--- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
@@ -80,10 +80,10 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
     nvinfer1::ILayer* layer = nullptr;
 
     if (engine_->with_dynamic_shape()) {
-      plugin::DynamicPluginTensorRT* plugin = nullptr;
-      plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>(
+      auto use_fp16 = engine_->WithFp16();
+      auto plugin = new plugin::EmbEltwiseLayernormPluginDynamic(
           input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
-          eps);
+          eps, use_fp16);
       layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin);
     } else {
       PADDLE_THROW(platform::errors::Fatal(
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
index 5e43be90de3dbbfef3c7d3def7e37904bb644380..873631fea614cc18cdc2b2b2f27d2480aa71d50b 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
@@ -32,13 +32,34 @@ namespace plugin {
 #if IS_TRT_VERSION_GE(6000)
 
 template <typename T>
-int EmbEltwiseLayernormPluginDynamic<T>::initialize() {
+EmbEltwiseLayernormPluginDynamicImpl<
+    T>::~EmbEltwiseLayernormPluginDynamicImpl() {
+  this->terminate();
+}
+
+inline half fp32tofp16(float x) { return static_cast<half>(x); }
+
+template <typename T>
+int EmbEltwiseLayernormPluginDynamicImpl<T>::initialize() {
   embs_gpu_.resize(embs_.size());
   for (int i = 0; i < embs_.size(); i++) {
     if (embs_[i]) {
-      cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
-      cudaMemcpy(embs_gpu_[i], embs_[i], emb_sizes_[i] * sizeof(float),
+      T *host_ptr;
+      auto size = emb_sizes_[i];
+
+      if (std::is_same<T, half>::value) {
+        host_ptr = new T[size];
+        std::transform(embs_[i], (embs_[i] + size), host_ptr, fp32tofp16);
+      } else {
+        host_ptr = reinterpret_cast<T *>(embs_[i]);
+      }
+
+      cudaMalloc(&embs_gpu_[i], sizeof(T) * size);
+      cudaMemcpy(embs_gpu_[i], host_ptr, size * sizeof(T),
                  cudaMemcpyHostToDevice);
+      if (std::is_same<T, half>::value) {
+        delete[] host_ptr;
+      }
     }
   }
 
@@ -53,11 +74,105 @@ int EmbEltwiseLayernormPluginDynamic<T>::initialize() {
                cudaMemcpyHostToDevice);
   }
 
+  int input_num = embs_.size();
+  in_ptr_tensor_.Resize({input_num});
+  emb_ptr_tensor_.Resize({input_num});
+
+  cudaGetDevice(&device_id_);
+  auto emb_ptr_gpu_d =
+      emb_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
+  cudaMemcpy(emb_ptr_gpu_d, embs_gpu_.data(), sizeof(uintptr_t) * input_num,
+             cudaMemcpyHostToDevice);
+
   return 0;
 }
 
 template <typename T>
-nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
+void EmbEltwiseLayernormPluginDynamicImpl<T>::terminate() {
+  for (int i = 0; i < embs_gpu_.size(); ++i) {
+    if (embs_gpu_[i]) {
+      cudaFree(embs_gpu_[i]);
+      embs_gpu_[i] = nullptr;
+    }
+  }
+
+  if (bias_gpu_) {
+    cudaFree(bias_gpu_);
+    bias_gpu_ = nullptr;
+  }
+
+  if (scale_gpu_) {
+    cudaFree(scale_gpu_);
+    scale_gpu_ = nullptr;
+  }
+}
+
+template <typename T>
+int EmbEltwiseLayernormPluginDynamicImpl<T>::enqueue(
+    const nvinfer1::PluginTensorDesc *input_desc,
+    const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
+    void *const *outputs, void *workspace, cudaStream_t stream) {
+  auto id_dims = input_desc[0].dims;
+  int batch = id_dims.d[0];
+  int seq_len = id_dims.d[1];
+  int input_num = embs_.size();
+
+  auto in_ptr_gpu_d =
+      in_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
+  auto emb_ptr_gpu_d =
+      emb_ptr_tensor_.mutable_data<int64_t>(platform::CUDAPlace(device_id_));
+
+  auto new_input_ptr = reinterpret_cast<uintptr_t>(inputs[0]);
+
+  if (old_input_ptr_ != new_input_ptr) {
+    old_input_ptr_ = new_input_ptr;
+
+    cudaMemcpyAsync(in_ptr_gpu_d, reinterpret_cast<const void *>(inputs),
+                    sizeof(uintptr_t) * input_num, cudaMemcpyHostToDevice,
+                    stream);
+  }
+
+  auto out_type = output_desc[0].type;
+
+  if (std::is_same<T, float>::value) {
+    PADDLE_ENFORCE_EQ(
+        out_type == nvinfer1::DataType::kFLOAT, true,
+        platform::errors::InvalidArgument(
+            "The EmbEltwiseLayernorm Plugin only support fp32 input."));
+  } else if (std::is_same<T, half>::value) {
+    PADDLE_ENFORCE_EQ(
+        out_type == nvinfer1::DataType::kHALF, true,
+        platform::errors::InvalidArgument(
+            "The EmbEltwiseLayernorm Plugin only support fp16 input."));
+  } else {
+    PADDLE_THROW(platform::errors::Fatal(
+        "Unsupport data type, the out type of EmbEltwiseLayernorm should be "
+        "float or half."));
+  }
+
+  auto *output_d = reinterpret_cast<T *>(outputs[0]);
+
+  operators::math::EmbEltwiseLayerNormFunctor<T> emb_eltwise_layernorm_func;
+  emb_eltwise_layernorm_func(batch, seq_len, hidden_size_, in_ptr_gpu_d,
+                             scale_gpu_, bias_gpu_, emb_ptr_gpu_d, output_d,
+                             eps_, input_num, stream);
+  return cudaGetLastError() != cudaSuccess;
+}
+
+template class EmbEltwiseLayernormPluginDynamicImpl<float>;
+#ifdef SUPPORTS_CUDA_FP16
+template class EmbEltwiseLayernormPluginDynamicImpl<half>;
+#endif  // SUPPORTS_CUDA_FP16
+
+int EmbEltwiseLayernormPluginDynamic::initialize() {
+  impl_->initialize();
+
+  return 0;
+}
+
+void EmbEltwiseLayernormPluginDynamic::terminate() { impl_->terminate(); }
+
+nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions(
     int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
     nvinfer1::IExprBuilder &expr_builder) {  // NOLINT
   PADDLE_ENFORCE_EQ(output_index, 0,
@@ -76,18 +191,7 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
   return ret;
 }
 
-template <typename T>
-void EmbEltwiseLayernormPluginDynamic<T>::terminate() {
-  for (auto ptr : embs_gpu_) {
-    if (ptr) cudaFree(ptr);
-  }
-
-  if (bias_gpu_) cudaFree(bias_gpu_);
-  if (scale_gpu_) cudaFree(scale_gpu_);
-}
-
-template <typename T>
-bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
+bool EmbEltwiseLayernormPluginDynamic::supportsFormatCombination(
     int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
     int nb_outputs) {
   PADDLE_ENFORCE_NOT_NULL(
@@ -98,6 +202,11 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
                         "The EmbEltwiseLayerNorm's output should be one"
                         "but it's (%d) outputs.",
                         nb_outputs));
+  PADDLE_ENFORCE_EQ(nb_outputs, 1,
+                    platform::errors::InvalidArgument(
+                        "The EmbEltwiseLayerNorm's output should be one"
+                        "but it's (%d) outputs.",
+                        nb_outputs));
   PADDLE_ENFORCE_LT(
       pos, nb_inputs + nb_outputs,
       platform::errors::InvalidArgument("The pos(%d) should be less than the "
@@ -122,7 +231,7 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
   }
 
   if (pos == all_nums - 1) {
-    if (sizeof(T) == sizeof(float)) {
+    if (with_fp16_ == false) {
       return desc.type == nvinfer1::DataType::kFLOAT;
     } else {
       return desc.type == nvinfer1::DataType::kHALF;
@@ -131,84 +240,27 @@ bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
   return false;
 }
 
-template <typename T>
-nvinfer1::DataType EmbEltwiseLayernormPluginDynamic<T>::getOutputDataType(
+nvinfer1::DataType EmbEltwiseLayernormPluginDynamic::getOutputDataType(
     int index, const nvinfer1::DataType *input_types, int nb_inputs) const {
   PADDLE_ENFORCE_EQ(
       index, 0, platform::errors::InvalidArgument(
                     "The EmbEltwiseLayernorm Plugin only has one input, so the "
                     "index value should be 0, but get %d.",
                     index));
-  return nvinfer1::DataType::kFLOAT;
+  if (with_fp16_)
+    return nvinfer1::DataType::kHALF;
+  else
+    return nvinfer1::DataType::kFLOAT;
 }
 
-template <typename T>
-int EmbEltwiseLayernormPluginDynamic<T>::enqueue(
+int EmbEltwiseLayernormPluginDynamic::enqueue(
     const nvinfer1::PluginTensorDesc *input_desc,
     const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
     void *const *outputs, void *workspace, cudaStream_t stream) {
-  auto id_dims = input_desc[0].dims;
-  int batch = id_dims.d[0];
-  int seq_len = id_dims.d[1];
-  int input_num = embs_.size();
-
-  framework::Tensor in_ptr_tensor, emb_ptr_tensor;
-  int device_id;
-  cudaGetDevice(&device_id);
-
-  in_ptr_tensor.Resize({input_num});
-  emb_ptr_tensor.Resize({input_num});
-  int64_t *in_ptr_gpu_d =
-      in_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
-  int64_t *emb_ptr_gpu_d =
-      emb_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
-
-  std::vector<uintptr_t> in_ptr, emb_ptr;
-  for (int i = 0; i < input_num; i++) {
-    in_ptr.push_back(reinterpret_cast<uintptr_t>(inputs[i]));
-    emb_ptr.push_back(reinterpret_cast<uintptr_t>(embs_gpu_[i]));
-  }
-
-  cudaMemcpyAsync(in_ptr_gpu_d, in_ptr.data(), sizeof(int64_t) * input_num,
-                  cudaMemcpyHostToDevice, stream);
-  cudaMemcpyAsync(emb_ptr_gpu_d, emb_ptr.data(), sizeof(int64_t) * input_num,
-                  cudaMemcpyHostToDevice, stream);
-
-  auto out_type = output_desc[0].type;
-
-  const unsigned tpb = 256;
-  const dim3 grid(seq_len, batch, 1);
-  const dim3 block(tpb, 1, 1);
-  if (sizeof(T) == sizeof(float)) {
-    PADDLE_ENFORCE_EQ(
-        out_type == nvinfer1::DataType::kFLOAT, true,
-        platform::errors::InvalidArgument(
-            "The EmbEltwiseLayernorm Plugin only support fp32 input."));
-  } else if (sizeof(T) == sizeof(int16_t)) {
-    PADDLE_ENFORCE_EQ(
-        out_type == nvinfer1::DataType::kHALF, true,
-        platform::errors::InvalidArgument(
-            "The EmbEltwiseLayernorm Plugin only support fp16 input."));
-  } else {
-    PADDLE_THROW(platform::errors::Fatal(
-        "Unsupport data type, the out type of EmbEltwiseLayernorm should be "
-        "float or half."));
-  }
-
-  T *output_d = static_cast<T *>(outputs[0]);
-
-  operators::math::EmbEltwiseLayerNormFunctor<T> emb_eltwise_layernorm_func;
-  emb_eltwise_layernorm_func(batch, seq_len, hidden_size_, in_ptr_gpu_d,
-                             scale_gpu_, bias_gpu_, emb_ptr_gpu_d, output_d,
-                             eps_, input_num, stream);
+  impl_->enqueue(input_desc, output_desc, inputs, outputs, workspace, stream);
   return cudaGetLastError() != cudaSuccess;
 }
 
-template class EmbEltwiseLayernormPluginDynamic<float>;
-#ifdef SUPPORTS_CUDA_FP16
-template class EmbEltwiseLayernormPluginDynamic<half>;
-#endif  // SUPPORTS_CUDA_FP16
-
 #endif
 
 }  // namespace plugin
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
index 5babd87db0602e973452efa613fcaf9810d29afa..24ca853104e35c26a2f9add57fd2f8bc025646c2 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h
@@ -27,14 +27,76 @@ namespace tensorrt {
 namespace plugin {
 
 #if IS_TRT_VERSION_GE(6000)
+
+class EmbEltwiseLayernormPluginDynamicImplBase {
+ public:
+  EmbEltwiseLayernormPluginDynamicImplBase() {}
+  virtual ~EmbEltwiseLayernormPluginDynamicImplBase() {}
+
+  virtual int initialize() = 0;
+  virtual void terminate() = 0;
+  virtual int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+                      const nvinfer1::PluginTensorDesc* outputDesc,
+                      const void* const* inputs, void* const* outputs,
+                      void* workspace, cudaStream_t stream) = 0;
+};
+
 template <typename T>
+class EmbEltwiseLayernormPluginDynamicImpl
+    : public EmbEltwiseLayernormPluginDynamicImplBase {
+ public:
+  explicit EmbEltwiseLayernormPluginDynamicImpl(std::vector<float*> input_embs,
+                                                float* bias, float* scale,
+                                                std::vector<int> emb_sizes,
+                                                int bias_size, int scale_size,
+                                                int hidden_size, float eps)
+      : embs_(input_embs),
+        bias_(bias),
+        scale_(scale),
+        emb_sizes_(emb_sizes),
+        bias_size_(bias_size),
+        scale_size_(scale_size),
+        hidden_size_(hidden_size),
+        eps_(eps) {}
+
+  ~EmbEltwiseLayernormPluginDynamicImpl();
+
+  int initialize();
+  void terminate();
+  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+              const nvinfer1::PluginTensorDesc* outputDesc,
+              const void* const* inputs, void* const* outputs, void* workspace,
+              cudaStream_t stream);
+
+ private:
+  std::vector<float*> embs_;
+  float* bias_{nullptr};
+  float* scale_{nullptr};
+
+  // data on devices
+  float* bias_gpu_{nullptr};
+  float* scale_gpu_{nullptr};
+  std::vector<T*> embs_gpu_;
+
+  std::vector<int> emb_sizes_;
+  int bias_size_;
+  int scale_size_;
+  int hidden_size_;
+  float eps_;
+
+  framework::Tensor in_ptr_tensor_, emb_ptr_tensor_;
+  int device_id_{0};
+  uintptr_t old_input_ptr_{0};
+};
+
 class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
  public:
   explicit EmbEltwiseLayernormPluginDynamic(std::vector<float*> input_embs,
                                             float* bias, float* scale,
                                             std::vector<int> emb_sizes,
                                             int bias_size, int scale_size,
-                                            int hidden_size, float eps)
+                                            int hidden_size, float eps,
+                                            bool with_fp16)
       : embs_(input_embs),
         bias_(bias),
         scale_(scale),
@@ -42,51 +104,81 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
         bias_size_(bias_size),
         scale_size_(scale_size),
         hidden_size_(hidden_size),
-        eps_(eps) {}
+        eps_(eps),
+        with_fp16_(with_fp16),
+        own_host_buff_(false) {
+    if (with_fp16) {
+#ifdef SUPPORTS_CUDA_FP16
+      impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(
+          embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
+          hidden_size_, eps_);
+#else
+      PADDLE_THROW(platform::errors::Fatal(
+          "Unsupported data type, current GPU doesn't support half."));
+#endif  // SUPPORTS_CUDA_FP16
+    } else {
+      impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(
+          embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
+          hidden_size_, eps_);
+    }
+  }
 
   EmbEltwiseLayernormPluginDynamic(void const* serial_data,
-                                   size_t serial_length) {
+                                   size_t serial_length)
+      : own_host_buff_(true) {
     DeserializeValue(&serial_data, &serial_length, &emb_sizes_);
 
-    embs_gpu_.resize(emb_sizes_.size());
     embs_.resize(emb_sizes_.size());
     for (size_t i = 0; i < emb_sizes_.size(); i++) {
-      cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
-      cudaMemcpy(embs_gpu_[i], serial_data, emb_sizes_[i] * sizeof(float),
-                 cudaMemcpyHostToDevice);
+      auto size = emb_sizes_[i];
+      auto ptr = new float[size];
+      memcpy(ptr, serial_data, sizeof(float) * size);
+      embs_[i] = ptr;
       reinterpret_cast<char const*&>(serial_data) +=
           emb_sizes_[i] * sizeof(float);
       serial_length -= emb_sizes_[i] * sizeof(float);
-      embs_[i] = nullptr;
     }
     DeserializeValue(&serial_data, &serial_length, &bias_size_);
     DeserializeValue(&serial_data, &serial_length, &scale_size_);
 
-    cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
-    cudaMemcpy(bias_gpu_, serial_data, bias_size_ * sizeof(float),
-               cudaMemcpyHostToDevice);
-    bias_ = nullptr;
+    if (bias_size_) {
+      bias_ = new float[bias_size_];
+      memcpy(bias_, serial_data, sizeof(float) * bias_size_);
+    }
     reinterpret_cast<char const*&>(serial_data) += bias_size_ * sizeof(float);
     serial_length -= bias_size_ * sizeof(float);
 
-    cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
-    cudaMemcpy(scale_gpu_, serial_data, scale_size_ * sizeof(float),
-               cudaMemcpyHostToDevice);
-    scale_ = nullptr;
+    if (scale_size_) {
+      scale_ = new float[scale_size_];
+      memcpy(scale_, serial_data, sizeof(float) * scale_size_);
+    }
     reinterpret_cast<char const*&>(serial_data) += scale_size_ * sizeof(float);
     serial_length -= scale_size_ * sizeof(float);
 
     DeserializeValue(&serial_data, &serial_length, &hidden_size_);
     DeserializeValue(&serial_data, &serial_length, &eps_);
+    DeserializeValue(&serial_data, &serial_length, &with_fp16_);
+
+    if (with_fp16_) {
+#ifdef SUPPORTS_CUDA_FP16
+      impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(
+          embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
+          hidden_size_, eps_);
+#else
+      PADDLE_THROW(platform::errors::Fatal(
+          "Unsupported data type, current GPU doesn't support half."));
+#endif  // SUPPORTS_CUDA_FP16
+    } else {
+      impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(
+          embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
+          hidden_size_, eps_);
+    }
   }
 
   nvinfer1::IPluginV2DynamicExt* clone() const override {
     auto ptr = new EmbEltwiseLayernormPluginDynamic(
         embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_,
-        eps_);
-    ptr->embs_gpu_ = embs_gpu_;
-    ptr->bias_gpu_ = bias_gpu_;
-    ptr->scale_gpu_ = scale_gpu_;
+        eps_, with_fp16_);
     return ptr;
   }
 
@@ -95,6 +187,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
   }
   int getNbOutputs() const override { return 1; }
   int initialize() override;
+  void terminate() override;
 
   size_t getSerializationSize() const override {
     int sum_num = 0;
@@ -110,24 +203,32 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
     sum_num += (bias_size_ + scale_size_) * sizeof(float);
     sum_num += SerializedSize(hidden_size_);
     sum_num += SerializedSize(eps_);
-    // sum_num += SerializedSize(with_fp16_);
+    sum_num += SerializedSize(with_fp16_);
 
     return sum_num;
   }
 
-  void terminate() override;
   void serialize(void* buffer) const override {
-    // SerializeValue(&buffer, with_fp16_);
     SerializeValue(&buffer, emb_sizes_);
     for (size_t i = 0; i < emb_sizes_.size(); i++) {
-      SerializeCudaPointer(&buffer, embs_gpu_[i], emb_sizes_[i]);
+      auto size = emb_sizes_[i];
+      for (int j = 0; j < size; ++j) {
+        SerializeValue(&buffer, embs_[i][j]);
+      }
     }
     SerializeValue(&buffer, bias_size_);
     SerializeValue(&buffer, scale_size_);
-    SerializeCudaPointer(&buffer, bias_gpu_, bias_size_);
-    SerializeCudaPointer(&buffer, scale_gpu_, scale_size_);
+    for (int i = 0; i < bias_size_; ++i) {
+      SerializeValue(&buffer, bias_[i]);
+    }
+
+    for (int i = 0; i < scale_size_; ++i) {
+      SerializeValue(&buffer, scale_[i]);
+    }
+
     SerializeValue(&buffer, hidden_size_);
     SerializeValue(&buffer, eps_);
+    SerializeValue(&buffer, with_fp16_);
   }
 
   nvinfer1::DimsExprs getOutputDimensions(
@@ -158,23 +259,33 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
                                        const nvinfer1::DataType* input_types,
                                        int nb_inputs) const override;
 
-  void destroy() override { delete this; }
+  void destroy() override {
+    if (own_host_buff_) {
+      for (auto ptr : embs_) {
+        delete[] ptr;
+      }
+      delete[] bias_;
+      delete[] scale_;
+    }
+
+    delete impl_;
+    delete this;
+  }
 
  private:
   std::vector<float*> embs_;
   float* bias_;
   float* scale_;
 
-  // data on devices
-  float* bias_gpu_;
-  float* scale_gpu_;
-  std::vector<float*> embs_gpu_;
-
   std::vector<int> emb_sizes_;
   int bias_size_;
   int scale_size_;
   int hidden_size_;
   float eps_;
+
+  bool with_fp16_;
+  bool own_host_buff_{false};
+  EmbEltwiseLayernormPluginDynamicImplBase* impl_{nullptr};
 };
 
 class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator {
@@ -198,8 +309,7 @@ class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator {
   nvinfer1::IPluginV2* deserializePlugin(const char* name,
                                          const void* serial_data,
                                          size_t serial_length) override {
-    return new EmbEltwiseLayernormPluginDynamic<float>(serial_data,
-                                                       serial_length);
+    return new EmbEltwiseLayernormPluginDynamic(serial_data, serial_length);
   }
 
   void setPluginNamespace(const char* lib_namespace) override {
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
index 685f7b6600e4d73731860135469a3072d8ce7f9a..d49f83b9d38a3d16099c4bc698c47f18a4280da0 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc
@@ -151,7 +151,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
   run(config, &out_data);         // serialize
   run(*config_deser, &out_data);  // deserialize
   for (size_t i = 0; i < out_data.size(); i++) {
-    EXPECT_NEAR(result[i], out_data[i], 1e-6);
+    EXPECT_NEAR(result[i], out_data[i], 1e-2);
   }
 }
 
@@ -159,13 +159,11 @@ TEST(AnalysisPredictor, no_fp16) {
   std::vector<float> result = {0.597841, 0.219972, 0.182187};
   trt_ernie(false, result);
 }
-
-TEST(AnalysisPredictor, fp16) {
 #ifdef SUPPORTS_CUDA_FP16
-  std::vector<float> result = {0.598336, 0.219558, 0.182106};
+TEST(AnalysisPredictor, fp16) {
+  std::vector<float> result = {0.59923654, 0.21923761, 0.18152587};
   trt_ernie(true, result);
-#endif
 }
-
+#endif  // SUPPORTS_CUDA_FP16
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h
index 3958d3f685470f2505abf0e8bfd269d3834970ae..338e46111fca83230aca1c7877578e557cef5a31 100644
--- a/paddle/fluid/operators/average_accumulates_op.h
+++ b/paddle/fluid/operators/average_accumulates_op.h
@@ -54,9 +54,13 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> {
     float average_window = ctx.Attr<float>("average_window");
     int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
     int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
-    PADDLE_ENFORCE_LE(min_average_window, max_average_window,
-                      "min_average_window shouldn't be larger than "
-                      "max_average_window");
+    PADDLE_ENFORCE_LE(
+        min_average_window, max_average_window,
+        platform::errors::InvalidArgument(
+            "The min_average_window > "
+            "max_average_window is not right, min_average_window is %ld, "
+            "max_average_window is %ld.",
+            min_average_window, max_average_window));
 
     // Get inputs
     auto* param = ctx.Input<Tensor>("param");
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu
index 7f705755915924de4ca6ab4c698e46a437bb649c..00af724ac7fce64b9a210bf43a150acf20f34dce 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu
+++ b/paddle/fluid/operators/conv_cudnn_op.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -287,7 +288,9 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
 #endif
 
     // ------------------- cudnn conv forward ---------------------
-    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
+    ScalingParamType<T> alpha = 1.0f;
+    ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
+    VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
     for (int i = 0; i < groups; i++) {
       workspace_handle.RunFunc(
           [&](void* workspace_ptr) {
@@ -609,9 +612,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     }
 
     // ------------------- cudnn conv backward data ---------------------
-    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
+    ScalingParamType<T> alpha = 1.0f;
+    ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
+    VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr<bool>("use_addto");
+
     if (input_grad) {
-      // Because beta is zero, it is unnecessary to reset input_grad.
+      // When beta is 0, it is unnecessary to reset input_grad.
+      // When beta is 1, the output cannot be reset since addt strategy used.
       for (int i = 0; i < groups; i++) {
         workspace_handle.RunFunc(
             [&](void* cudnn_workspace_ptr) {
@@ -653,6 +660,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
             ctx, &transformed_input_grad_channel, input_grad);
       }
     }
+
+    // filter_grad do not use inplace addto.
+    ScalingParamType<T> beta_filter = 0.0f;
     // ------------------- cudnn conv backward filter ---------------------
     if (filter_grad) {
       // Because beta is zero, it is unnecessary to reset filter_grad.
@@ -665,7 +675,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                       input_data + i * group_offset_in, args2.odesc.desc(),
                       output_grad_data + i * group_offset_out,
                       args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr,
-                      workspace_size, &beta, args2.wdesc.desc(),
+                      workspace_size, &beta_filter, args2.wdesc.desc(),
                       filter_grad_data + i * group_offset_filter));
             },
             workspace_size);
@@ -1017,7 +1027,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
     int group_offset_out = o_c / groups * o_h * o_w * o_d;
     int group_offset_filter = W->numel() / groups;
 
-    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
+    ScalingParamType<T> alpha = 1.0f;
+    ScalingParamType<T> beta = 0.0f;
+
+    // NOTE(zhiqiu): inplace addto is not supportted in double grad yet.
+    // ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
+    // 0.0f;
+    // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr<bool>("use_addto");
+
     auto wkspace_handle = dev_ctx.cudnn_workspace_handle();
 
     if (ddO) {
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 9ed169fe3502e0c34b9f37d6520edc1a3fbfa91c..bf97b9d03c455182a8d95b6987896b9a580c84fe 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -305,6 +305,11 @@ void Conv2DOpMaker::Make() {
       .SetDefault(0.0f);
   AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
       .SetDefault(0.0f);
+  AddAttr<bool>(
+      "use_addto",
+      "(bool, default false) If use addto strategy or not, only used in "
+      "cudnn kernel")
+      .SetDefault(false);
   AddAttr<bool>("fuse_residual_connection",
                 "(bool, default false) Only used in mkldnn kernel. Used "
                 "whenever convolution output is as an input to residual "
@@ -460,6 +465,11 @@ void Conv3DOpMaker::Make() {
       .SetDefault(0.0f);
   AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
       .SetDefault(0.0f);
+  AddAttr<bool>(
+      "use_addto",
+      "(bool, default false) If use addto strategy or not, only used in "
+      "cudnn kernel")
+      .SetDefault(false);
   AddAttr<bool>("fuse_residual_connection",
                 "(bool, default false) Only used in mkldnn kernel. Used "
                 "whenever convolution output is as an input to residual "
diff --git a/paddle/fluid/operators/distributed_ops/fake_init_op.cc b/paddle/fluid/operators/distributed_ops/fake_init_op.cc
index 1da164175e1daf8e872964d8fad21c4f3dd614e7..cb27dc75eb2faf15746e596265d1b1e4b3717e52 100644
--- a/paddle/fluid/operators/distributed_ops/fake_init_op.cc
+++ b/paddle/fluid/operators/distributed_ops/fake_init_op.cc
@@ -43,9 +43,9 @@ class FakeInitOp : public framework::OperatorBase {
       tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
       tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
     } else {
-      PADDLE_THROW(
+      PADDLE_THROW(platform::errors::InvalidArgument(
           "fake init op's output only"
-          "supports SelectedRows and LoDTensor");
+          "supports SelectedRows and LoDTensor"));
     }
   }
 };
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
index 5e1e408eb2c28239fded0d0cf037c94783828b50..43de8488a0e4ac24f7e261671488cd88563a805d 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
@@ -134,7 +134,10 @@ void ListenAndServOp::RunSyncLoop(
   auto optimize_blocks =
       Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
   PADDLE_ENFORCE_GE(num_blocks, 2,
-                    "server program should have at least 2 blocks");
+                    platform::errors::PreconditionNotMet(
+                        "Invalid number of blocks in server program. Expected "
+                        "equal or greater than 2. Recieved %zu",
+                        num_blocks));
 
   // Prepare all the server block
   std::vector<int> optimize_blocks_list;
@@ -218,7 +221,8 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
       VLOG(3) << "reset sparse var: " << varname;
       var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
     } else {
-      PADDLE_THROW("The type of sparse var should be SelectedRows");
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
+          "The type of sparse var should be SelectedRows"));
     }
   }
   if (UNLIKELY(reset_all)) {
@@ -235,7 +239,8 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
         math::set_constant(*dev_ctx, var->GetMutable<framework::Tensor>(),
                            static_cast<float>(0));
       } else {
-        PADDLE_THROW("The type of dense var should be in [LoDTensor, Tensor]");
+        PADDLE_THROW(platform::errors::PreconditionNotMet(
+            "The type of dense var should be in [LoDTensor, Tensor]"));
       }
     }
   }
@@ -254,8 +259,15 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
     std::vector<std::string> pieces;
     split(grad_and_id, ':', &pieces);
     VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1];
-    PADDLE_ENFORCE_EQ(pieces.size(), 2);
-    PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0);
+    PADDLE_ENFORCE_EQ(pieces.size(), 2,
+                      platform::errors::PreconditionNotMet(
+                          "Invalid format of grad_and_id argument. "
+                          "Expected \"grad:block_id\". Recieved %s",
+                          grad_and_id.c_str()));
+    PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0,
+                      platform::errors::AlreadyExists(
+                          "The gradient name %s has already existed in out_map",
+                          pieces[0].c_str()));
 
     int block_id = std::stoi(pieces[1]);
     (*out_map)[pieces[0]] = block_id;
@@ -267,7 +279,10 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
 
   size_t num_blocks = program->Size();
   PADDLE_ENFORCE_GE(num_blocks, 2,
-                    "server program should have at least 2 blocks");
+                    platform::errors::PreconditionNotMet(
+                        "Invalid number of blocks in server program. Expected "
+                        "equal or greater than 2. Recieved %zu",
+                        num_blocks));
   std::vector<int> block_list;
   for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
     block_list.push_back(blkid);
@@ -342,9 +357,9 @@ void ListenAndServOp::CacheVarsType(const std::vector<std::string> &varnames,
                var->IsType<framework::Tensor>()) {
       dense_vars_.push_back(varname);
     } else {
-      PADDLE_THROW(
+      PADDLE_THROW(platform::errors::PreconditionNotMet(
           "The type of received var should be in [SelectedRows, LoDTensor, "
-          "Tensor].");
+          "Tensor]."));
     }
   }
 }
@@ -450,7 +465,12 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
     split(prefetch_var_name_and_id, ':', &pieces);
     VLOG(3) << "after split, prefetch_var = " << pieces[0]
             << ", id=" << pieces[1];
-    PADDLE_ENFORCE_EQ(pieces.size(), 2);
+    PADDLE_ENFORCE_EQ(
+        pieces.size(), 2,
+        platform::errors::PreconditionNotMet(
+            "Invalid format of prefetch_var_name_and_id argument. "
+            "Expected \"xxx:xxx\". Recieved %s",
+            prefetch_var_name_and_id.c_str()));
 
     int block_id = std::stoi(pieces[1]);
     prefetch_block_id_list.push_back(block_id);
@@ -476,7 +496,12 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
        sparse_grad_name_to_param_name_str) {
     std::vector<std::string> pieces;
     split(sparse_grad_name_and_param_name, ':', &pieces);
-    PADDLE_ENFORCE_EQ(pieces.size(), 2);
+    PADDLE_ENFORCE_EQ(
+        pieces.size(), 2,
+        platform::errors::PreconditionNotMet(
+            "Invalid format of sparse_grad_name_and_param_name argument. "
+            "Expected \"xxx:xxx\". Recieved %s",
+            sparse_grad_name_and_param_name.c_str()));
     VLOG(3) << "after split, sparse_grad_name = " << pieces[0]
             << ", param_name = " << pieces[1];
     sparse_grad_name_to_param_name[pieces[0]] = pieces[1];
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
index 534a19bd94a231f0522dd15d2510917be8c71a4b..97624944ca109f27322f151f0742c72447fd5c39 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
+
 #include <memory>
 #include <string>
+
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 
 namespace paddle {
@@ -129,3 +132,18 @@ REGISTER_OP_CPU_KERNEL(
                                         int>,
     ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
                                         int64_t>);
+
+// A specialization elementwise_add operator, used in gradient accumulation with
+// inplace addto.
+REGISTER_OPERATOR(
+    grad_add, paddle::operators::ElementwiseOp,
+    paddle::operators::ElementwiseAddOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+
+REGISTER_OP_CPU_KERNEL(
+    grad_add,
+    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
index 71019872802eaca964373fd58a7ccc6445d9c489..a4cbd14388b4dd5ceab6417db79fafeeff41ccb7 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -111,3 +111,10 @@ REGISTER_OP_CUDA_KERNEL(
     ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, int64_t>,
     ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext,
                                         plat::float16>);
+
+REGISTER_OP_CUDA_KERNEL(
+    grad_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int64_t>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
index 5dc93740949e6e7c25be564927c8fcffde1a18d6..721c23e38307fddb63d41baddc395afa94f40336 100644
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
@@ -61,8 +61,15 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx,
                            const framework::Tensor *x,
                            const framework::Tensor *y, framework::Tensor *z) {
   int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
-      ctx, x, y, axis, FloorDivFunctor<T>(), z);
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+  if (x_dims.size() >= y_dims.size()) {
+    ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
+        ctx, x, y, axis, FloorDivFunctor<T>(), z);
+  } else {
+    ElementwiseComputeEx<InverseFloorDivFunctor<T>, DeviceContext, T>(
+        ctx, x, y, axis, InverseFloorDivFunctor<T>(), z);
+  }
 }
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc
index f539e2e6f6d2d6faa084d1e62ec894b4b65e96bf..3d28ca90a5a15fd53a57034a4722a21842dc4b1c 100644
--- a/paddle/fluid/operators/empty_op.cc
+++ b/paddle/fluid/operators/empty_op.cc
@@ -55,31 +55,38 @@ class EmptyOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty");
 
     if (context->HasInput("ShapeTensor")) {
-      auto dims = context->GetInputDim("ShapeTensor");
+      auto shape_dims = context->GetInputDim("ShapeTensor");
       int num_ele = 1;
-      for (int i = 0; i < dims.size(); ++i) {
-        num_ele *= dims[i];
+      for (int i = 0; i < shape_dims.size(); ++i) {
+        num_ele *= shape_dims[i];
       }
-
-      context->SetOutputDim("Out", framework::make_ddim({num_ele}));
+      auto vec_dims = std::vector<int>(num_ele, -1);
+      context->SetOutputDim("Out", framework::make_ddim(vec_dims));
     } else if (context->HasInputs("ShapeTensorList")) {
       std::vector<int> out_dims;
       auto dims_list = context->GetInputsDim("ShapeTensorList");
       for (size_t i = 0; i < dims_list.size(); ++i) {
         auto& dims = dims_list[i];
-        PADDLE_ENFORCE_EQ(
-            dims, framework::make_ddim({1}),
-            "ShapeError: The shape of Tensor in list must be [1]. "
-            "But received the shape "
-            "is [%s]",
-            dims);
-
-        out_dims.push_back(dims[0]);
+        PADDLE_ENFORCE_EQ(dims, framework::make_ddim({1}),
+                          platform::errors::InvalidArgument(
+                              "The shape of Tensor in list must be [1]. "
+                              "But received the shape is [%s]",
+                              dims));
+
+        out_dims.push_back(-1);
       }
 
       context->SetOutputDim("Out", framework::make_ddim(out_dims));
     } else {
       auto& shape = context->Attrs().Get<std::vector<int64_t>>("shape");
+      for (size_t i = 0; i < shape.size(); ++i) {
+        PADDLE_ENFORCE_GE(
+            shape[i], 0,
+            platform::errors::InvalidArgument(
+                "Each value of attribute 'shape' is expected to be no less "
+                "than 0. But recieved: shape[%u] = %d; shape = [%s].",
+                i, shape[i], framework::make_ddim(shape)));
+      }
       context->SetOutputDim("Out", framework::make_ddim(shape));
     }
   }
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
index 04ac4a35208a54361a4f434e68095e9519ee12e9..e9b4c7dacf8b4493fcfa0504ecf7421bd50de90c 100644
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -174,7 +174,64 @@ struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
 
 template struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext,
                                                float>;
+template <typename T>
+struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, const int quant_axis,
+                  framework::Tensor* out) {
+    PADDLE_ENFORCE_EQ(
+        quant_axis == 0 || quant_axis == 1, true,
+        platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
+                                          "the received is %d",
+                                          quant_axis));
 
+    auto* scale_data = scale.data<T>();
+    auto* in_data = in.data<T>();
+    auto* out_data = out->mutable_data<T>(ctx.GetPlace());
+    auto in_dims = in.dims();
+    const int64_t channel = in_dims[quant_axis];
+    platform::Transform<platform::CPUDeviceContext> trans;
+    if (quant_axis == 0) {
+      const int64_t channel_size = in.numel() / channel;
+      for (int i = 0; i < channel; i++) {
+        T s = scale_data[i];
+        auto* start = in_data + i * channel_size;
+        auto* end = in_data + (i + 1) * channel_size;
+        trans(ctx, start, end, out_data + i * channel_size,
+              ClipFunctor<T>(-s, s));
+      }
+      for (int i = 0; i < channel; i++) {
+        T s = scale_data[i];
+        T inv_s = inverse(s);
+        framework::Tensor one_channel_out = out->Slice(i, i + 1);
+        auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
+        out_e.device(*ctx.eigen_device()) =
+            (bin_cnt * inv_s * out_e).round() * s / static_cast<T>(bin_cnt);
+      }
+    } else if (quant_axis == 1) {
+      const int64_t step_i = in.numel() / in_dims[0];
+      const int64_t step_j = in.numel() / (in_dims[0] * in_dims[1]);
+      for (int i = 0; i < in_dims[0]; i++) {
+        for (int j = 0; j < in_dims[1]; j++) {
+          T s = scale_data[j];
+          T inv_s = inverse(s);
+          auto* start = in_data + i * step_i + j * step_j;
+          auto* end = in_data + i * step_i + (j + 1) * step_j;
+          auto* cur_out_data = out_data + i * step_i + j * step_j;
+          trans(ctx, start, end, cur_out_data, ClipFunctor<T>(-s, s));
+          for (int k = 0; k < step_j; k++) {
+            cur_out_data[k] = std::round(bin_cnt * inv_s * cur_out_data[k]) *
+                              s / static_cast<T>(bin_cnt);
+          }
+        }
+      }
+    }
+  }
+};
+
+template struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext,
+                                                   float>;
 template <typename T>
 struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& ctx,
@@ -360,6 +417,75 @@ $$0 \leq c \lt \ the\ channel\ number\ of\ X$$
   }
 };
 
+class FakeChannelWiseQuantizeDequantizeAbsMaxOp
+    : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
+                   "FakeChannelWiseQuantizeDequantizeAbsMax");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
+                   "FakeChannelWiseQuantizeDequantizeAbsMax");
+    OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale",
+                   "FakeChannelWiseQuantizeDequantizeAbsMax");
+    int quant_axis = ctx->Attrs().Get<int>("quant_axis");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[quant_axis]});
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+class FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) Input is float data type.");
+    AddOutput("Out",
+              "(Tensor) Output of quantized and dequantized low level tensor, "
+              "saved as float data type.");
+    AddOutput("OutScale", "(Tensor) Current channel wise scale");
+    AddAttr<int>("quant_axis",
+                 "(int, default 0) The axis for quantization. "
+                 "For conv2d, depthwise_conv2d, conv2d_transpose "
+                 "and mul, the quant_axis is equal to the cout axis.")
+        .SetDefault(0)
+        .AddCustomChecker([](const int& quant_axis) {
+          PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true,
+                            platform::errors::InvalidArgument(
+                                "'quant_axis' should be 0 or 1, but "
+                                "the received is %d",
+                                quant_axis));
+        });
+    AddAttr<int>("bit_length", "(int, default 8)")
+        .SetDefault(8)
+        .AddCustomChecker([](const int& bit_length) {
+          PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true,
+                            platform::errors::InvalidArgument(
+                                "'bit_length' should be between 1 and 16, but "
+                                "the received is %d",
+                                bit_length));
+        });
+    AddComment(R"DOC(
+The scale of FakeChannelWiseQuantize operator is a vector.
+In detail, each channel of the input X has a scale value.
+
+$$scale_c = max(abs(X_c))$$
+$$range = 2^{bit\_length - 1} - 1$$
+$$Out_c = round(\frac{X_c * range} {scale_c}) * \frac{scale_c} {range}$$
+In above three formulas, the range value of c is as follow:
+$$0 \leq c \lt \ the\ channel\ number\ of\ X$$
+)DOC");
+  }
+};
+
 class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
  public:
   FakeQuantizeRangeAbsMaxOp(const std::string& type,
@@ -666,3 +792,12 @@ REGISTER_OP_CPU_KERNEL(moving_average_abs_max_scale,
 REGISTER_OPERATOR(fake_quantize_dequantize_grad, ops::FakeQuantDequantGradOp);
 REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_grad,
                        ops::FakeQuantDequantGradKernel<CPU, float>);
+
+REGISTER_OPERATOR(fake_channel_wise_quantize_dequantize_abs_max,
+                  ops::FakeChannelWiseQuantizeDequantizeAbsMaxOp,
+                  ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker,
+                  ops::FakeQuantDequantGradMaker<paddle::framework::OpDesc>,
+                  ops::FakeQuantDequantGradMaker<paddle::imperative::OpBase>);
+REGISTER_OP_CPU_KERNEL(
+    fake_channel_wise_quantize_dequantize_abs_max,
+    ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CPU, float>);
diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu
index 6ff3c7ec632f236fe4ae6c6504537df3b8a46b7a..8bc14dde8636822354bbaeaf659880ee754dc5b9 100644
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -417,8 +417,90 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
   }
 };
 
-template struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext,
-                                               float>;
+// ChannelClipAndQuantDequantKernel for quant_axis is 0
+template <typename T>
+__global__ void ChannelClipAndQuantDequantKernelQuantAxis0(
+    const T* in, const T* scale, const int bin_cnt, const int n, const int c,
+    T* out) {
+  int tid = threadIdx.x;
+
+  int channel_size = n / c;
+  const T* in_c = in + blockIdx.x * channel_size;
+  T* out_c = out + blockIdx.x * channel_size;
+
+  T s = scale[blockIdx.x];
+  T inv_s = inverse(s);
+
+  for (int i = tid; i < channel_size; i += blockDim.x) {
+    T x = in_c[i];
+    T v = x > s ? s : x;
+    v = v < -s ? -s : v;
+    v = bin_cnt * inv_s * v;
+    out_c[i] = round(v) * s / bin_cnt;
+  }
+}
+
+// ChannelClipAndQuantDequantKernel for quant_axis is 1
+template <typename T>
+__global__ void ChannelClipAndQuantDequantKernelQuantAxis1(
+    const T* in, const T* scale, const int bin_cnt, const int n, const int cin,
+    const int cout, T* out) {
+  T s = scale[blockIdx.x % cout];
+  T inv_s = inverse(s);
+
+  int wh_size = n / (cin * cout);
+  const T* in_c = in + blockIdx.x * wh_size;
+  T* out_c = out + blockIdx.x * wh_size;
+
+  for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
+    T x = in_c[i];
+    T v = x > s ? s : x;
+    v = v < -s ? -s : v;
+    v = bin_cnt * inv_s * v;
+    out_c[i] = round(v) * s / bin_cnt;
+  }
+}
+
+template <typename T>
+struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, const int quant_axis,
+                  framework::Tensor* out) {
+    // At present, channelwise quantization supports conv2d, depthwise_conv2d
+    // conv2d_transpose and mul
+    PADDLE_ENFORCE_EQ(
+        quant_axis == 0 || quant_axis == 1, true,
+        platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
+                                          "the received is %d",
+                                          quant_axis));
+
+    int num = in.numel();
+    auto in_dims = in.dims();
+
+    const T* in_data = in.data<T>();
+    const T* scale_data = scale.data<T>();
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    if (quant_axis == 0) {
+      int grid = in_dims[0];
+      int block = 1024;
+      ChannelClipAndQuantDequantKernelQuantAxis0<
+          T><<<grid, block, 0, ctx.stream()>>>(in_data, scale_data, bin_cnt,
+                                               num, in_dims[0], out_data);
+    } else if (quant_axis == 1) {
+      int grid = in_dims[0] * in_dims[1];
+      int block = 1024;
+
+      ChannelClipAndQuantDequantKernelQuantAxis1<
+          T><<<grid, block, 0, ctx.stream()>>>(
+          in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data);
+    }
+  }
+};
+
+template struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext,
+                                                   float>;
 
 }  // namespace operators
 }  // namespace paddle
@@ -443,3 +525,6 @@ REGISTER_OP_CUDA_KERNEL(
     ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CUDA, float>);
 REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_grad,
                         ops::FakeQuantDequantGradKernel<CUDA, float>);
+REGISTER_OP_CUDA_KERNEL(
+    fake_channel_wise_quantize_dequantize_abs_max,
+    ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CUDA, float>);
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
index 5c6e0b1f6e26d84462a18da910b412f03b93285d..2f5afbe0eedf98ac7219772a6705d502069f0385 100644
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -72,6 +72,13 @@ struct ChannelClipAndFakeQuantFunctor {
                   const int quant_axis, framework::Tensor* out);
 };
 
+template <typename DeviceContext, typename T>
+struct ChannelClipFakeQuantDequantFunctor {
+  void operator()(const DeviceContext& ctx, const framework::Tensor& in,
+                  const framework::Tensor& scale, const int bin_cnt,
+                  const int quant_axis, framework::Tensor* out);
+};
+
 template <typename DeviceContext, typename T>
 struct FindMovingAverageAbsMaxFunctor {
   void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum,
@@ -154,6 +161,30 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename DeviceContext, typename T>
+class FakeChannelWiseQuantizeDequantizeAbsMaxKernel
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<framework::Tensor>("X");
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto* out_scale = context.Output<framework::Tensor>("OutScale");
+    T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    out->mutable_data<T>(dev_ctx.GetPlace());
+
+    int bit_length = context.Attr<int>("bit_length");
+    int bin_cnt = std::pow(2, bit_length - 1) - 1;
+    int quant_axis = context.Attr<int>("quant_axis");
+
+    FindChannelAbsMaxFunctor<DeviceContext, T>()(dev_ctx, *in, quant_axis,
+                                                 out_scale_data);
+
+    ChannelClipFakeQuantDequantFunctor<DeviceContext, T>()(
+        dev_ctx, *in, *out_scale, bin_cnt, quant_axis, out);
+  }
+};
+
 template <typename DeviceContext, typename T>
 class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc
index 4013906609603e31b798e333d55ecccba197506a..e3776a80b316089891282136022a4e6656360c6e 100644
--- a/paddle/fluid/operators/fused/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fused/fusion_gru_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/fused/fusion_gru_op.h"
 #include <cstring>  // for memcpy
 #include <string>
+#include <vector>
 #include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/fc.h"
diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
index a6c9a137b5438d840ae283b72fc9e85903c83775..c5a291f10b2eaa32aa4b98d73004008bae89a5c9 100644
--- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc
@@ -192,6 +192,9 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
           copy_size += src_mat_w_sz;
         }
         // fill data
+        if (context_start > 0) {
+          src_data += context_start * src_mat_w;
+        }
         for (int j = 0; j < seq_len - up_pad - down_pad; ++j) {
           std::memcpy(dst_data, src_data, copy_size);
           dst_data += col_mat_w;
@@ -201,18 +204,15 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
         std::memset(dst_data, 0, down_pad * col_mat_w_sz);
         copy_size -= src_mat_w_sz;
         for (int j = 0; j < down_pad; ++j) {
+          if (copy_size < 0) {
+            copy_size = 0;
+          }
           std::memcpy(dst_data, src_data, copy_size);
           dst_data += col_mat_w;
           src_data += src_mat_w;
           copy_size -= src_mat_w_sz;
         }
       } else {
-        PADDLE_ENFORCE_GE(context_length, up_pad + down_pad + 1,
-                          platform::errors::InvalidArgument(
-                              "context length must be bigger or equal than "
-                              "up_pad + down_pad + 1, but received context "
-                              "length is: %d, up_pad is: %d, down_pad is: %d.",
-                              context_length, up_pad, down_pad));
         std::memset(dst_data, 0, seq_len * col_mat_w_sz);
         dst_data = dst_data + up_pad * src_mat_w;
         int zero_sz = up_pad * src_mat_w_sz;
@@ -226,9 +226,15 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
         // from bottom
         dst_data = col_data + ed * col_mat_w;
         src_data = x_data + st * src_mat_w;
+        if (context_start > 0) {
+          src_data += context_start * src_mat_w;
+        }
         zero_sz = down_pad * src_mat_w_sz;
         for (int j = 1; j <= std::min(down_pad, seq_len); ++j) {
           int copy_size = std::min(cur_src_sz, col_mat_w_sz - zero_sz);
+          if (copy_size < 0) {
+            copy_size = 0;
+          }
           std::memcpy(dst_data - (zero_sz + copy_size) / sizeof(T),
                       src_data + std::max(seq_len - j - up_pad, 0) * src_mat_w,
                       copy_size);
diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc
index 0155ef188ef967fbf67505d28beeeaf956bb3a70..550de1aadde2935fae34226dba78cc06d82cd1f3 100644
--- a/paddle/fluid/operators/math/beam_search.cc
+++ b/paddle/fluid/operators/math/beam_search.cc
@@ -87,7 +87,10 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
     lod[0].assign(high_level.begin(), high_level.end());
     lod[1].assign(low_level.begin(), low_level.end());
     if (!framework::CheckLoD(lod)) {
-      PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "lod %s is not right in"
+          " beam_search, please check your code.",
+          framework::LoDToString(lod)));
     }
     selected_ids->set_lod(lod);
     selected_scores->set_lod(lod);
diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu
index cf6d44c1abc531da9d00738bba22f70a4c68bbab..ed3ead47d171efb4128a294c7d7a24324c7187b7 100644
--- a/paddle/fluid/operators/math/beam_search.cu
+++ b/paddle/fluid/operators/math/beam_search.cu
@@ -400,7 +400,10 @@ class BeamSearchFunctor<platform::CUDADeviceContext, T> {
 
     context.Wait();
     if (!framework::CheckLoD(selected_lod)) {
-      PADDLE_THROW("lod %s is not right", framework::LoDToString(selected_lod));
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "lod %s is not right in"
+          " beam_search, please check your code.",
+          framework::LoDToString(selected_lod)));
     }
 
     selected_ids->set_lod(selected_lod);
diff --git a/paddle/fluid/operators/math/blas.cc b/paddle/fluid/operators/math/blas.cc
index 6a143b3c056455595fdedc131b0c5f4ee756e1e0..2a7ce83967f0f74f4c2178dd4277e6a1687b5ec7 100644
--- a/paddle/fluid/operators/math/blas.cc
+++ b/paddle/fluid/operators/math/blas.cc
@@ -20,7 +20,11 @@ namespace operators {
 namespace math {
 MatDescriptor CreateMatrixDescriptor(const framework::DDim &tensor_dim,
                                      int num_flatten_cols, bool trans) {
-  PADDLE_ENFORCE_GT(tensor_dim.size(), 1);
+  PADDLE_ENFORCE_GT(
+      tensor_dim.size(), 1,
+      platform::errors::InvalidArgument("The tensor dim size should be greater "
+                                        "than 1, but reveived dim size is %d",
+                                        tensor_dim.size()));
   MatDescriptor retv;
   if (num_flatten_cols > 1) {
     auto flatten_dim = framework::flatten_to_2d(tensor_dim, num_flatten_cols);
diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h
index d0c5f74d4efb8248b41d8b2af285e8dd7ec4d479..a0464cf70e2dcc44c42fc2ca7440680ef8a53e6e 100644
--- a/paddle/fluid/operators/math/blas_impl.cu.h
+++ b/paddle/fluid/operators/math/blas_impl.cu.h
@@ -60,7 +60,8 @@ struct CUBlas<float> {
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cublasSgemmStridedBatched(args...));
 #else
-    PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "SgemmStridedBatched is not supported on cuda <= 7.5"));
 #endif
   }
 
@@ -85,7 +86,8 @@ struct CUBlas<float> {
           beta, C, Ctype, ldc));
     });
 #else
-    PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "cublasSgemmEx is not supported on cuda <= 7.5"));
 #endif
   }
 
@@ -146,13 +148,15 @@ struct CUBlas<double> {
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cublasDgemmStridedBatched(args...));
 #else
-    PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "DgemmStridedBatched is not supported on cuda <= 7.5"));
 #endif
   }
 
   template <typename... ARGS>
   static void GEMM_EX(ARGS... args) {
-    PADDLE_THROW("Currently there are not cublasDgemmEx.");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Currently there are not cublasDgemmEx."));
   }
 
   template <typename... ARGS>
@@ -216,7 +220,8 @@ struct CUBlas<platform::float16> {
         reinterpret_cast<const __half *>(beta), reinterpret_cast<__half *>(C),
         ldc, strideC, batchCount));
 #else
-    PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "HgemmStridedBatched is not supported on cuda <= 7.5"));
 #endif
   }
 
@@ -247,7 +252,8 @@ struct CUBlas<platform::float16> {
           beta, C, Ctype, ldc, computeType, algo));
     });
 #else
-    PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "cublasGemmEx is not supported on cuda <= 7.5"));
 #endif
   }
 };
@@ -302,8 +308,12 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
 
   // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53,
-                    "cublas fp16 gemm requires GPU compute capability >= 53");
+  PADDLE_ENFORCE_GE(
+      context_.GetComputeCapability(), 53,
+      platform::errors::InvalidArgument(
+          "cublas fp16 gemm requires GPU compute capability >= 53,"
+          "but received %d",
+          context_.GetComputeCapability()));
 
   float h_alpha = static_cast<float>(alpha);
   float h_beta = static_cast<float>(beta);
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 892bf15738141bfbb7e75fa6b37c0cda53a8e098..515d6a2435e86fe07ffe1309628ef2fbeefdc6f0 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -29,7 +29,8 @@ template <>
 struct CBlas<int8_t> {
   template <typename... ARGS>
   static void VCOPY(ARGS... args) {
-    PADDLE_THROW("Blas VCOPY don't support int8_t");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Blas VCOPY do not supported on CPU, please check your code"));
   }
 };
 
@@ -347,22 +348,47 @@ struct CBlas<double> {
 
 template <>
 struct CBlas<platform::float16> {
-  static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
+  static void GEMM(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 GEMM not supported on CPU, please check your code"));
+  }
+
   static void SMM_GEMM(...) {
-    PADDLE_THROW("float16 SMM_GEMM not supported on CPU");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 SMM_GEMM not supported on CPU, please check your code"));
   }
-  static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); }
-  static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); }
-  static void VSQUARE(...) {
-    PADDLE_THROW("float16 VSQUARE not supported on CPU");
+  static void VMUL(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 VMUL not supported on CPU, please check your code"));
   }
-  static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); }
-  static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); };
-  static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); };
-  static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); };
+  static void VEXP(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 VEXP not supported on CPU, please check your code"));
+  }
+  static void VSQUARE(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 VSQUARE not supported on CPU, please check your code"));
+  }
+  static void VPOW(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 VPOW not supported on CPU, please check your code"));
+  }
+  static void DOT(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 DOT not supported on CPU, please check your code"));
+  };
+  static void SCAL(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 SCAL not supported on CPU, please check your code"));
+  };
+  static void ASUM(...) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 ASUM not supported on CPU, please check your code"));
+  };
 #ifdef PADDLE_WITH_MKLML
   static void GEMM_BATCH(...) {
-    PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "float16 GEMM_BATCH not supported on CPU, please check your code"));
   }
 #endif
 };
@@ -446,11 +472,18 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a, bool trans_a,
   auto dim_a = mat_a.dims();
   auto dim_b = mat_b.dims();
   auto dim_out = mat_out->dims();
-  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
-                 "The input and output of matmul be matrix");
-  PADDLE_ENFORCE(
-      mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(),
-      "The places of matrices must be same");
+  PADDLE_ENFORCE_EQ(
+      dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, true,
+      platform::errors::InvalidArgument(
+          "The input and output of matmul should be matrix, the dim size must "
+          "be 2,"
+          "but received dim size input_a:%d, input_b:%d, output:%d",
+          dim_a.size(), dim_b.size(), dim_out.size()));
+  PADDLE_ENFORCE_EQ(
+      mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(), true,
+      platform::errors::InvalidArgument("The places of matrices in the matmul "
+                                        "should be same, please check your "
+                                        "code."));
 
   int M = dim_out[0];
   int N = dim_out[1];
@@ -715,7 +748,13 @@ void Blas<platform::CPUDeviceContext>::BatchedGEMMWithHead(
     }
 
   } else {
-    PADDLE_ENFORCE_EQ(W1, H2);
+    PADDLE_ENFORCE_EQ(
+        W1, H2,
+        platform::errors::InvalidArgument(
+            "The fisrt matrix width should be same as second matrix height,"
+            "but received fisrt matrix width %d"
+            ", second matrix height %d",
+            W1, H2));
     int ldc = W2 * head_number;
     int sub_width = W1 / head_number;
 
@@ -785,7 +824,14 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
                                  const framework::Tensor &mat_b,
                                  const MatDescriptor &dim_b, T alpha,
                                  framework::Tensor *mat_out, T beta) const {
-  PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_);
+  PADDLE_ENFORCE_EQ(
+      dim_a.width_, dim_b.height_,
+      platform::errors::InvalidArgument(
+          "The fisrt matrix width should be same as second matrix height,"
+          "but received fisrt matrix width %d"
+          ", second matrix height %d",
+          dim_a.width_, dim_b.height_));
+
   CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
   CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
   if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
@@ -793,12 +839,14 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
                            dim_a.width_, alpha, mat_a.data<T>(),
                            mat_b.data<T>(), beta, mat_out->data<T>());
   } else {
-    PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
-                       dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0,
-                   "dim_a.batch_size should be equal to dim_b.batch_size, or "
-                   "one of dim_a.batch_size and dim_b.batch_size should be 0. "
-                   "But got dim_a.batch_size = %d, dim_b.batch_size = %d.",
-                   dim_a.batch_size_, dim_b.batch_size_);
+    PADDLE_ENFORCE_EQ(
+        dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 ||
+            dim_b.batch_size_ == 0,
+        true, platform::errors::InvalidArgument(
+                  "dim_a.batch_size should be equal to dim_b.batch_size, or "
+                  "one of dim_a.batch_size and dim_b.batch_size should be 0. "
+                  "But got dim_a.batch_size = %d, dim_b.batch_size = %d.",
+                  dim_a.batch_size_, dim_b.batch_size_));
     this->template BatchedGEMM<T>(
         transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha,
         mat_a.data<T>(), mat_b.data<T>(), beta, mat_out->data<T>(),
@@ -834,15 +882,42 @@ void Blas<DeviceContext>::MatMulWithHead(const framework::Tensor &mat_a,
                                          int head_number,
                                          framework::Tensor *mat_out, T beta,
                                          bool mat_b_split_vertical) const {
-  PADDLE_ENFORCE_EQ(dim_a.width_ % head_number, 0);
-  PADDLE_ENFORCE_GE(head_number, 1);
-  PADDLE_ENFORCE_LE(head_number, dim_a.width_);
+  PADDLE_ENFORCE_EQ(
+      dim_a.width_ % head_number, 0,
+      platform::errors::InvalidArgument(
+          "The first input width must be some times the head number"
+          "but received first input width %d"
+          ",  head_number %d",
+          dim_a.width_, head_number));
+  PADDLE_ENFORCE_GE(head_number, 1,
+                    platform::errors::InvalidArgument(
+                        "The head number should be greater equal 1,"
+                        "but received head number %d",
+                        head_number));
+  PADDLE_ENFORCE_LE(
+      head_number, dim_a.width_,
+      platform::errors::InvalidArgument(
+          "The head number should be less equal first input width,"
+          "but received first input width %d"
+          ",  head_number %d",
+          dim_a.width_, head_number));
   CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
   CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
 
   if (mat_b_split_vertical) {
-    PADDLE_ENFORCE_EQ(dim_b.height_, dim_a.width_ / head_number);
-    PADDLE_ENFORCE_EQ(dim_b.width_ % head_number, 0);
+    PADDLE_ENFORCE_EQ(
+        dim_b.height_, dim_a.width_ / head_number,
+        platform::errors::InvalidArgument(
+            "The second input height should be equal than first input width,"
+            "but received second input height %d, first input width %d",
+            dim_b.height_, dim_a.width_ / head_number));
+    PADDLE_ENFORCE_EQ(
+        dim_a.width_ % head_number, 0,
+        platform::errors::InvalidArgument(
+            "The second input width should be some times the head number"
+            "but received second input width %d"
+            ",  head_number %d",
+            dim_b.width_, head_number));
   }
 
   if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
@@ -888,9 +963,16 @@ void Blas<DeviceContext>::MatMulWithHead(const framework::Tensor &mat_a,
                              mat_out->data<T>() + sub_matC_offset, ldc);
     }
   } else {
-    PADDLE_ENFORCE_EQ((dim_a.batch_size_ == dim_b.batch_size_ ||
-                       dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0),
-                      true);
+    PADDLE_ENFORCE_EQ(
+        (dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 ||
+         dim_b.batch_size_ == 0),
+        true,
+        platform::errors::InvalidArgument(
+            "The first input batch size should be equal than second input,"
+            "either two input batch size is 0, but received first input batch "
+            "size"
+            " %d, second input batch size %d",
+            dim_a.batch_size_, dim_b.batch_size_));
 
     this->template BatchedGEMMWithHead<T>(
         transA, transB, dim_a.width_, dim_a.height_, dim_b.width_,
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index f44b33fcf2fc23f79483909046dd9e292fd8dde8..b8af5a21ca58185a10b34bca2310dd93436d340c 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -22,10 +22,12 @@ limitations under the License. */
 #include <cblas.h>
 #endif
 
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/operators/math/math_function_impl.h"
 #include "paddle/fluid/platform/float16.h"
+#include "unsupported/Eigen/CXX11/Tensor"
 
 namespace paddle {
 namespace operators {
@@ -63,6 +65,55 @@ DEFINE_CPU_TRANS(4);
 DEFINE_CPU_TRANS(5);
 DEFINE_CPU_TRANS(6);
 
+template <typename T>
+struct TransposeNormal<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& in, framework::Tensor* out,
+                  const std::vector<int>& axis) {
+    const int rank = axis.size();
+    auto in_stride = framework::stride(in.dims());
+    auto out_stride = framework::stride(out->dims());
+    const T* in_ptr = in.data<T>();
+    T* out_ptr = out->data<T>();
+
+    auto transpose_helper = [&](int64_t beg, int64_t end) {
+      for (int64_t out_idx = beg; out_idx < end; ++out_idx) {
+        int64_t in_idx = 0;
+        int64_t tmp_idx = out_idx;
+        // calculate the input index
+        for (int i = 0; i < rank; ++i) {
+          const int64_t coordinate = tmp_idx / out_stride[i];
+          tmp_idx -= coordinate * out_stride[i];
+          in_idx += coordinate * in_stride[axis[i]];
+        }
+        out_ptr[out_idx] = in_ptr[in_idx];
+      }
+    };
+    double cost_per_iteration =
+        rank * (Eigen::TensorOpCost::DivCost<int64_t>() +
+                2 * Eigen::TensorOpCost::MulCost<int64_t>() +
+                2 * Eigen::TensorOpCost::AddCost<int64_t>());
+    Eigen::TensorOpCost cost(sizeof(T), sizeof(T), cost_per_iteration);
+    auto* cpu_device = context.eigen_pool_device();
+    cpu_device->parallelFor(out->numel(), cost, std::move(transpose_helper));
+  }
+};
+
+// define transpose normal
+#define DEFINE_CPU_TRANS_NORMAL(TYPE) \
+  template struct TransposeNormal<platform::CPUDeviceContext, TYPE>
+
+DEFINE_CPU_TRANS_NORMAL(platform::float16);
+DEFINE_CPU_TRANS_NORMAL(platform::bfloat16);
+DEFINE_CPU_TRANS_NORMAL(float);
+DEFINE_CPU_TRANS_NORMAL(double);
+DEFINE_CPU_TRANS_NORMAL(int);
+DEFINE_CPU_TRANS_NORMAL(int64_t);
+DEFINE_CPU_TRANS_NORMAL(bool);
+DEFINE_CPU_TRANS_NORMAL(int16_t);
+DEFINE_CPU_TRANS_NORMAL(uint8_t);
+DEFINE_CPU_TRANS_NORMAL(int8_t);
+
 struct TensorSetConstantCPU {
   TensorSetConstantCPU(framework::Tensor* tensor, float value)
       : tensor_(tensor), value_(value) {}
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index 1c519d226ebfe5ff19876f17b79fd36aa12c4130..4d7c1a49286dd67a314f81fd706bdcc5e340e450 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -11,8 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <algorithm>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/math_function_impl.h"
@@ -23,6 +26,7 @@ namespace operators {
 namespace math {
 
 using float16 = paddle::platform::float16;
+using bfloat16 = paddle::platform::bfloat16;
 
 template struct SetConstant<platform::CUDADeviceContext, platform::float16>;
 template struct SetConstant<platform::CUDADeviceContext, float>;
@@ -31,12 +35,13 @@ template struct SetConstant<platform::CUDADeviceContext, int>;
 template struct SetConstant<platform::CUDADeviceContext, int64_t>;
 template struct SetConstant<platform::CUDADeviceContext, bool>;
 
-#define DEFINE_GPU_TRANS(RANK)                                           \
-  template struct Transpose<platform::CUDADeviceContext, float, RANK>;   \
-  template struct Transpose<platform::CUDADeviceContext, double, RANK>;  \
-  template struct Transpose<platform::CUDADeviceContext, float16, RANK>; \
-  template struct Transpose<platform::CUDADeviceContext, int8_t, RANK>;  \
-  template struct Transpose<platform::CUDADeviceContext, int32_t, RANK>; \
+#define DEFINE_GPU_TRANS(RANK)                                            \
+  template struct Transpose<platform::CUDADeviceContext, float, RANK>;    \
+  template struct Transpose<platform::CUDADeviceContext, double, RANK>;   \
+  template struct Transpose<platform::CUDADeviceContext, float16, RANK>;  \
+  template struct Transpose<platform::CUDADeviceContext, bfloat16, RANK>; \
+  template struct Transpose<platform::CUDADeviceContext, int8_t, RANK>;   \
+  template struct Transpose<platform::CUDADeviceContext, int32_t, RANK>;  \
   template struct Transpose<platform::CUDADeviceContext, int64_t, RANK>;
 
 DEFINE_GPU_TRANS(1);
@@ -46,6 +51,88 @@ DEFINE_GPU_TRANS(4);
 DEFINE_GPU_TRANS(5);
 DEFINE_GPU_TRANS(6);
 
+#define REINTERPRET(T, DST_PTR, SRC_PTR) \
+  T* DST_PTR = reinterpret_cast<T*>(SRC_PTR)
+
+template <typename T>
+__global__ void TransposeNormalKernel(const T* in_ptr, T* out_ptr,
+                                      int64_t element,
+                                      const int64_t* in_stride_ptr,
+                                      const int64_t* out_stride_ptr,
+                                      const int64_t* axis_ptr, int rank) {
+  CUDA_KERNEL_LOOP(out_idx, element) {
+    int64_t in_idx = 0;
+    int64_t tmp_idx = out_idx;
+    for (int i = 0; i < rank; ++i) {
+      const int64_t coordinate = tmp_idx / out_stride_ptr[i];
+      tmp_idx -= coordinate * out_stride_ptr[i];
+      in_idx += coordinate * in_stride_ptr[axis_ptr[i]];
+    }
+    out_ptr[out_idx] = in_ptr[in_idx];
+  }
+}
+
+template <typename T>
+struct TransposeNormal<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& in, framework::Tensor* out,
+                  const std::vector<int>& axis) {
+    const int rank = axis.size();
+    auto in_stride = framework::stride(in.dims());
+    auto out_stride = framework::stride(out->dims());
+    auto* in_ptr = in.data<T>();
+    auto* out_ptr = out->data<T>();
+
+    // copy in_stride, out_stride, axis to gpu device
+    const platform::CUDAPlace& cuda_place =
+        BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace());
+    platform::CPUPlace cpu_place = platform::CPUPlace();
+    size_t size = 3 * rank * sizeof(int64_t);
+    auto cpu_buf_holder = memory::AllocShared(cpu_place, size);
+    auto cuda_buf_holder = memory::AllocShared(cuda_place, size);
+    REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr());
+    REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr());
+    for (int i = 0; i < rank; ++i) {
+      cpu_buf[i] = in_stride[i];
+      cpu_buf[rank + i] = out_stride[i];
+      cpu_buf[2 * rank + i] = axis[i];
+    }
+    memory::Copy(cuda_place, cuda_buf, cpu_place, cpu_buf, size,
+                 context.stream());
+    REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
+    REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
+    REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank);
+
+    const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock();
+    const int MAX_GRID_DIM =
+        context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
+    int64_t elements = in.numel();
+    int block_size = (elements >= MAX_BLOCK_DIM)
+                         ? MAX_BLOCK_DIM
+                         : (1 << static_cast<int>(std::log2(elements)));
+    int grid_size = elements / block_size;
+    grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size;
+    TransposeNormalKernel<T><<<grid_size, block_size, 0, context.stream()>>>(
+        in_ptr, out_ptr, elements, in_stride_ptr, out_stride_ptr, axis_ptr,
+        rank);
+  }
+};
+
+// define transpose normal
+#define DEFINE_GPU_TRANS_NORMAL(TYPE) \
+  template struct TransposeNormal<platform::CUDADeviceContext, TYPE>
+
+DEFINE_GPU_TRANS_NORMAL(float16);
+DEFINE_GPU_TRANS_NORMAL(bfloat16);
+DEFINE_GPU_TRANS_NORMAL(float);
+DEFINE_GPU_TRANS_NORMAL(double);
+DEFINE_GPU_TRANS_NORMAL(int);
+DEFINE_GPU_TRANS_NORMAL(int64_t);
+DEFINE_GPU_TRANS_NORMAL(bool);
+DEFINE_GPU_TRANS_NORMAL(int16_t);
+DEFINE_GPU_TRANS_NORMAL(uint8_t);
+DEFINE_GPU_TRANS_NORMAL(int8_t);
+
 struct TensorSetConstantGPU {
   TensorSetConstantGPU(const platform::DeviceContext& context,
                        framework::Tensor* tensor, float value)
diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h
index 333552a0c1a79ed1682ea1f6da36f30e2ad5b4bf..6af0278d82503a27a14de6aef96468d69d6c17ad 100644
--- a/paddle/fluid/operators/math/math_function.h
+++ b/paddle/fluid/operators/math/math_function.h
@@ -26,6 +26,14 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 namespace math {
+
+template <typename DeviceContext, typename T>
+struct TransposeNormal {
+  // for dims >= 7 situation
+  void operator()(const DeviceContext& context, const framework::Tensor& in,
+                  framework::Tensor* out, const std::vector<int>& axis);
+};
+
 template <typename DeviceContext, typename T, int Rank>
 struct Transpose {
   void operator()(const DeviceContext& context, const framework::Tensor& in,
diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cc b/paddle/fluid/operators/optimizers/rmsprop_op.cc
index 99d1156ee6d5fc88161e25bfa581a265707e6f92..eeee008cdc53c457146074060d526d8d0e8b43aa 100644
--- a/paddle/fluid/operators/optimizers/rmsprop_op.cc
+++ b/paddle/fluid/operators/optimizers/rmsprop_op.cc
@@ -143,4 +143,5 @@ http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    rmsprop, ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, float>);
+    rmsprop, ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cu b/paddle/fluid/operators/optimizers/rmsprop_op.cu
index 8b17d6a0204045a9b20adb79dbad72dff5ba267e..bf11ee686757c6c5e54e05f055eaa19f6553f915 100644
--- a/paddle/fluid/operators/optimizers/rmsprop_op.cu
+++ b/paddle/fluid/operators/optimizers/rmsprop_op.cu
@@ -15,4 +15,5 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    rmsprop, ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, float>);
+    rmsprop, ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
index 67a19cb83c36f9cb6ef0cdd65e9fc04a7bb4d169..25f9453571ac632e70b0755ca1e5566eb5bf6ee6 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -18,9 +18,10 @@ limitations under the License. */
 #include <set>
 #include <string>
 #include <vector>
-
 #include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/cast_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"
 
 namespace paddle {
@@ -34,6 +35,110 @@ namespace operators {
   }
 
 using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+
+inline void GetShuffledDim(const DDim& src_dims, DDim* dst_dims,
+                           const std::vector<int>& reduced_dims,
+                           std::vector<int>* perm_axis) {
+  // check if it's a reduced dim
+  std::vector<bool> src_dims_check(src_dims.size(), false);
+  size_t src_size = src_dims.size();
+  size_t reduce_size = reduced_dims.size();
+  for (size_t i = 0; i < reduce_size; ++i) {
+    dst_dims->at(src_size - reduce_size + i) = src_dims[reduced_dims[i]];
+    (*perm_axis)[src_size - reduce_size + i] = reduced_dims[i];
+    src_dims_check[reduced_dims[i]] = true;
+  }
+
+  size_t offset = 0;
+  for (size_t i = 0; i < src_dims_check.size(); ++i) {
+    bool is_reduced = src_dims_check[i];
+    if (!is_reduced) {
+      (*perm_axis)[offset] = i;
+      dst_dims->at(offset++) = src_dims[i];
+    }
+  }
+}
+
+template <typename DeviceContext, typename OutT>
+void GetShuffledInput(const framework::ExecutionContext& context,
+                      const Tensor* input, Tensor* shuffled_input,
+                      const std::vector<int>& dims) {
+  DDim shuffled_dims(input->dims());
+  std::vector<int> perm_axis(input->dims().size());
+  GetShuffledDim(input->dims(), &shuffled_dims, dims, &perm_axis);
+
+  shuffled_input->Resize(shuffled_dims);
+  shuffled_input->mutable_data<OutT>(context.GetPlace());
+
+  math::TransposeNormal<DeviceContext, OutT> trans;
+  trans(context.template device_context<DeviceContext>(), *input,
+        shuffled_input, perm_axis);
+}
+
+inline void GetOriginDimFromShuffled(const DDim& src_dim,
+                                     const std::vector<int>& dims,
+                                     std::vector<int>* origin_dim) {
+  DDim shuffled_dims(src_dim);
+  size_t n = src_dim.size();
+  std::vector<int> perm_axis(n);
+  GetShuffledDim(src_dim, &shuffled_dims, dims, &perm_axis);
+  for (size_t i = 0; i < n; ++i) {
+    (*origin_dim)[perm_axis[i]] = i;
+  }
+}
+
+template <typename DeviceContext, typename OutT, typename Functor>
+void HandleLargeDim(const framework::ExecutionContext& context,
+                    const Tensor* input, Tensor* output,
+                    const std::vector<int>& dims, bool keep_dim) {
+  //  shuffle the reduced dim to the end
+  Tensor shuffled_input;
+  GetShuffledInput<DeviceContext, OutT>(context, input, &shuffled_input, dims);
+
+  // transpose to 2D tensor whose shape is {unreduced, reduced}.
+  const int64_t unreduced = output->numel();
+  const int64_t reduced = shuffled_input.numel() / unreduced;
+  shuffled_input.Resize({unreduced, reduced});
+  DDim output_dim = output->dims();
+  output->Resize({unreduced});
+  ReduceFunctor<DeviceContext, OutT, 2, 1, Functor>(
+      context.template device_context<DeviceContext>(), shuffled_input, output,
+      {1}, keep_dim);
+  output->Resize(output_dim);
+}
+
+template <typename DeviceContext, typename T, typename Functor>
+void HandleLargeDimGrad(const framework::ExecutionContext& context,
+                        const framework::Tensor* x,
+                        const framework::Tensor* out,
+                        const framework::Tensor* dout, framework::Tensor* dx,
+                        const std::vector<int>& dims) {
+  const int64_t unreduced = out->numel();
+  const int64_t reduced = x->numel() / unreduced;
+  DDim out_dim(out->dims());
+  DDim x_dim(x->dims());
+  // transpose and reshape X
+  Tensor shuffled_x;
+  GetShuffledInput<DeviceContext, T>(context, x, &shuffled_x, dims);
+  DDim shuffled_dim = shuffled_x.dims();
+  shuffled_x.Resize({unreduced, reduced});
+  // reshape dX {unreduced, reduced}
+  dx->Resize({unreduced, reduced});
+  ReduceGradFunctor<DeviceContext, T, 2, Functor>(
+      context.template device_context<DeviceContext>(), shuffled_x, *out, *dout,
+      dx, {1});
+  // transpose dX
+  std::vector<int> origin_axis(x_dim.size());
+  GetOriginDimFromShuffled(x_dim, dims, &origin_axis);
+  Tensor dx_tmp;
+  framework::TensorCopy(*dx, context.GetPlace(), &dx_tmp);
+  dx_tmp.Resize(shuffled_dim);
+  dx->Resize(x_dim);
+  math::TransposeNormal<DeviceContext, T> trans;
+  trans(context.template device_context<DeviceContext>(), dx_tmp, dx,
+        origin_axis);
+}
 
 template <typename DeviceContext, typename T, typename Functor>
 struct ReduceKernelFunctor {
@@ -69,22 +174,27 @@ struct ReduceKernelFunctor {
     } else {
       int ndim = input->dims().size();
       int rdim = dims.size();
-      HANDLE_DIM(6, 5);
-      HANDLE_DIM(6, 4);
-      HANDLE_DIM(6, 3);
-      HANDLE_DIM(6, 2);
-      HANDLE_DIM(6, 1);
-      HANDLE_DIM(5, 4);
-      HANDLE_DIM(5, 3);
-      HANDLE_DIM(5, 2);
-      HANDLE_DIM(5, 1);
-      HANDLE_DIM(4, 3);
-      HANDLE_DIM(4, 2);
-      HANDLE_DIM(4, 1);
-      HANDLE_DIM(3, 2);
-      HANDLE_DIM(3, 1);
-      HANDLE_DIM(2, 1);
-      HANDLE_DIM(1, 1);
+      if (ndim > 6) {
+        HandleLargeDim<DeviceContext, OutT, Functor>(context, input, output,
+                                                     dims, keep_dim);
+      } else {
+        HANDLE_DIM(6, 5);
+        HANDLE_DIM(6, 4);
+        HANDLE_DIM(6, 3);
+        HANDLE_DIM(6, 2);
+        HANDLE_DIM(6, 1);
+        HANDLE_DIM(5, 4);
+        HANDLE_DIM(5, 3);
+        HANDLE_DIM(5, 2);
+        HANDLE_DIM(5, 1);
+        HANDLE_DIM(4, 3);
+        HANDLE_DIM(4, 2);
+        HANDLE_DIM(4, 1);
+        HANDLE_DIM(3, 2);
+        HANDLE_DIM(3, 1);
+        HANDLE_DIM(2, 1);
+        HANDLE_DIM(1, 1);
+      }
     }
   }
 };
@@ -137,7 +247,6 @@ class ReduceKernel : public framework::OpKernel<T> {
     }
   }
 };
-
 template <typename DeviceContext, typename OutT, typename Functor>
 class BoolReduceKernel : public framework::OpKernel<OutT> {
  public:
@@ -175,22 +284,27 @@ class BoolReduceKernel : public framework::OpKernel<OutT> {
       int ndim = input->dims().size();
       int rdim = dims.size();
       // comments for accelerating compiling temporarily.
-      //      HANDLE_DIM(6, 5);
-      //      HANDLE_DIM(6, 4);
-      //      HANDLE_DIM(6, 3);
-      //      HANDLE_DIM(6, 2);
-      //      HANDLE_DIM(6, 1);
-      //      HANDLE_DIM(5, 4);
-      //      HANDLE_DIM(5, 3);
-      //      HANDLE_DIM(5, 2);
-      //      HANDLE_DIM(5, 1);
-      HANDLE_DIM(4, 3);
-      HANDLE_DIM(4, 2);
-      HANDLE_DIM(4, 1);
-      HANDLE_DIM(3, 2);
-      HANDLE_DIM(3, 1);
-      HANDLE_DIM(2, 1);
-      HANDLE_DIM(1, 1);
+      if (ndim > 6) {
+        HandleLargeDim<DeviceContext, OutT, Functor>(context, input, output,
+                                                     dims, keep_dim);
+      } else {
+        HANDLE_DIM(6, 5);
+        HANDLE_DIM(6, 4);
+        HANDLE_DIM(6, 3);
+        HANDLE_DIM(6, 2);
+        HANDLE_DIM(6, 1);
+        HANDLE_DIM(5, 4);
+        HANDLE_DIM(5, 3);
+        HANDLE_DIM(5, 2);
+        HANDLE_DIM(5, 1);
+        HANDLE_DIM(4, 3);
+        HANDLE_DIM(4, 2);
+        HANDLE_DIM(4, 1);
+        HANDLE_DIM(3, 2);
+        HANDLE_DIM(3, 1);
+        HANDLE_DIM(2, 1);
+        HANDLE_DIM(1, 1);
+      }
     }
   }
 };
@@ -279,6 +393,10 @@ class ReduceGradKernel : public framework::OpKernel<T> {
               context.template device_context<DeviceContext>(), *input0,
               *input1, *input2, output, dims);
           break;
+        default:
+          HandleLargeDimGrad<DeviceContext, T, Functor>(context, input0, input1,
+                                                        input2, output, dims);
+          break;
       }
     }
   }
@@ -313,12 +431,6 @@ class ReduceOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ReduceOp");
     auto x_dims = ctx->GetInputDim("X");
     auto x_rank = x_dims.size();
-    PADDLE_ENFORCE_LE(x_rank, 6,
-                      platform::errors::InvalidArgument(
-                          "The input tensor X's dimensions of ReduceOp "
-                          "should be less equal than 6. But received X's "
-                          "dimensions = %d, X's shape = [%s].",
-                          x_rank, x_dims));
     auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
     PADDLE_ENFORCE_GT(dims.size(), 0,
                       platform::errors::InvalidArgument(
@@ -402,11 +514,6 @@ class ReduceGradOp : public framework::OperatorWithKernel {
                    "Out@GRAD", "ReduceOp");
     auto x_dims = ctx->GetInputDim("X");
     auto x_rank = x_dims.size();
-    PADDLE_ENFORCE_LE(x_rank, 6,
-                      platform::errors::InvalidArgument(
-                          "Tensors with rank at most 6 are supported by "
-                          "ReduceOp. Received tensor with rank %d.",
-                          x_rank));
     auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
     for (size_t i = 0; i < dims.size(); ++i) {
       PADDLE_ENFORCE_LT(dims[i], x_rank,
diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc
index 62bffe630484e3ab30bedcf2324f6516bca3b27e..0ecf9bfb5d8c0ccadbdd8b7a0b8f6193d4dc5310 100644
--- a/paddle/fluid/operators/shape_op.cc
+++ b/paddle/fluid/operators/shape_op.cc
@@ -68,6 +68,6 @@ REGISTER_OPERATOR(
     shape, ops::ShapeOp, ops::ShapeOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<int>, ops::ShapeKernel<int32_t>,
+REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<bool>, ops::ShapeKernel<int>,
                        ops::ShapeKernel<int64_t>, ops::ShapeKernel<float>,
                        ops::ShapeKernel<double>);
diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu
index 4b9dca0d4028be36ad8ba46ebe35db101e003ee9..5d50b17818cbb8068db2fded1f3f4e76bad44430 100644
--- a/paddle/fluid/operators/shape_op.cu
+++ b/paddle/fluid/operators/shape_op.cu
@@ -15,8 +15,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/shape_op.h"
 
 REGISTER_OP_CUDA_KERNEL(
-    shape, paddle::operators::ShapeKernel<int>,
-    paddle::operators::ShapeKernel<int32_t>,
+    shape, paddle::operators::ShapeKernel<bool>,
+    paddle::operators::ShapeKernel<int>,
     paddle::operators::ShapeKernel<int64_t>,
     paddle::operators::ShapeKernel<float>,
     paddle::operators::ShapeKernel<double>,
diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc
index cc72d83411f5a34561a75e7e75f98077ee5a4e5d..0e3fcced19ea8eb1580ca93fa9d6616685601f75 100644
--- a/paddle/fluid/operators/top_k_v2_op.cc
+++ b/paddle/fluid/operators/top_k_v2_op.cc
@@ -32,7 +32,6 @@ class TopkV2Op : public framework::OperatorWithKernel {
 
     auto input_dims = ctx->GetInputDim("X");
     const int& dim_size = input_dims.size();
-    const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
     int axis = static_cast<int>(ctx->Attrs().Get<int>("axis"));
     PADDLE_ENFORCE_EQ((axis < dim_size) && (axis >= (-1 * dim_size)), true,
                       "the axis of topk"
@@ -41,8 +40,18 @@ class TopkV2Op : public framework::OperatorWithKernel {
 
     if (axis < 0) axis += dim_size;
 
-    PADDLE_ENFORCE_GE(
-        k, 1, "the attribute of k in the topk must >= 1, but received %d .", k);
+    int k;
+    auto k_is_tensor = ctx->HasInput("K");
+    if (k_is_tensor) {
+      k = -1;
+    } else {
+      k = static_cast<int>(ctx->Attrs().Get<int>("k"));
+      PADDLE_ENFORCE_EQ(k >= 1, true,
+                        "the attribute of k in the topk must >= 1 or be a "
+                        "Tensor, but received %d .",
+                        k);
+    }
+
     PADDLE_ENFORCE_GE(input_dims.size(), 1,
                       "input of topk must have >= 1d shape");
 
diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h
index d7f5c3dd457c90eefc4181cdbc662196a046853e..e4e5dfdba9f6057161051e551d1f6711ba0cd4e9 100644
--- a/paddle/fluid/operators/transpose_op.h
+++ b/paddle/fluid/operators/transpose_op.h
@@ -53,10 +53,9 @@ inline void TransCompute(const int dim, const DeviceContext& dev_ctx,
       trans6(dev_ctx, in, out, axis);
       break;
     default:
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Tensors with rank at most 6 are supported"
-          ", but received input tensor's rank is %d,",
-          dim));
+      // for dim >= 7 situation
+      math::TransposeNormal<DeviceContext, T> trans_normal;
+      trans_normal(dev_ctx, in, out, axis);
   }
 }
 
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 29982c13c8ca88bc8b4a168f92e4116a283a97e8..34305c404b4df72ed28547e46817be00d6722a42 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -12,6 +12,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include <set>
 #include <string>
+#include <thread>  //NOLINT
 #include <unordered_set>
 #include <vector>
 
@@ -23,6 +24,7 @@ limitations under the License. */
 #endif
 
 #include "glog/logging.h"
+#include "unsupported/Eigen/CXX11/ThreadPool"
 
 namespace paddle {
 namespace memory {
@@ -131,16 +133,31 @@ DeviceContextPool::DeviceContextPool(
 
 CPUDeviceContext::CPUDeviceContext() {
   eigen_device_.reset(new Eigen::DefaultDevice());
+  InitPoolDevice();
 }
 
 CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) {
   eigen_device_.reset(new Eigen::DefaultDevice());
+  InitPoolDevice();
+}
+
+void CPUDeviceContext::InitPoolDevice() {
+  using EigenEnv = Eigen::StlThreadEnvironment;
+  using EigenThreadPool = Eigen::ThreadPoolTempl<EigenEnv>;
+  int num_threads = std::thread::hardware_concurrency();
+  eigen_threadpool_.reset(new EigenThreadPool(num_threads));
+  eigen_pool_device_.reset(
+      new Eigen::ThreadPoolDevice(eigen_threadpool_.get(), num_threads));
 }
 
 Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
   return eigen_device_.get();
 }
 
+Eigen::ThreadPoolDevice* CPUDeviceContext::eigen_pool_device() const {
+  return eigen_pool_device_.get();
+}
+
 Place CPUDeviceContext::GetPlace() const { return place_; }
 
 #ifdef PADDLE_WITH_XPU
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 8bfdfc8a1c6033a79c197e1cd425197f77079bda..28d94627f9575573075beaa328a682314b5c3b71 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -41,6 +41,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/stream/cuda_stream.h"
 #endif
+#define EIGEN_USE_THREADS
 #include "unsupported/Eigen/CXX11/Tensor"
 
 #ifdef PADDLE_WITH_XPU
@@ -65,11 +66,17 @@ class CPUDeviceContext : public DeviceContext {
 
   Eigen::DefaultDevice* eigen_device() const;
 
+  Eigen::ThreadPoolDevice* eigen_pool_device() const;
+
   Place GetPlace() const override;
 
+  inline void InitPoolDevice();
+
  private:
   CPUPlace place_;
   std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
+  std::unique_ptr<Eigen::ThreadPoolDevice> eigen_pool_device_;
+  std::unique_ptr<Eigen::ThreadPool> eigen_threadpool_;
 };
 
 template <typename Place>
diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc
index 44a03d6f14a3ba07d73cfbc944d8db9601394103..1166dc5e4ad93fa23ef00623de6777b78b56ea09 100644
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
@@ -46,6 +46,10 @@ CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
 #endif
 
+#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7
+CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP);
+#endif
+
 #ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7
 CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
 #endif
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index af8798a4b7cf5a8832ce9345cad45ce3096484e4..9116edd01b040e793d23c76a04b2c93ed4d2586b 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -521,3 +521,18 @@ DEFINE_int32(
 DEFINE_bool(sort_sum_gradient, false,
             "Sum gradients by the reverse order of "
             "the forward execution sequence.");
+
+/**
+ * Performance related FLAG
+ * Name: max_inplace_grad_add
+ * Since Version: 2.0.0
+ * Value Range: int32, default=0
+ * Example:
+ * Note: The maximum number of inplace grad_add.
+ */
+DEFINE_int32(
+    max_inplace_grad_add, 0,
+    "The maximum number of inplace grad_add. When doing "
+    "gradient accumulation, if the number of gradients need to that "
+    "less FLAGS_max_inplace_grad_add, than it will be use several grad_add"
+    "instead of sum. Default is 0.");
diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc
index 318178d5eb927e45fa6472a695ce57f4b2a058b8..894740e25c018b09f8604006ae06fa5b9dc14bf0 100644
--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -62,6 +62,7 @@ DECLARE_bool(use_system_allocator);
 // others
 DECLARE_bool(benchmark);
 DECLARE_int32(inner_op_parallelism);
+DECLARE_int32(max_inplace_grad_add);
 DECLARE_string(tracer_profile_fname);
 #ifdef PADDLE_WITH_CUDA
 // cudnn
@@ -348,7 +349,7 @@ static void RegisterGlobalVarGetterSetter() {
       FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
       FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
       FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
-      FLAGS_paddle_num_threads, FLAGS_use_mkldnn);
+      FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add);
 
 #ifdef PADDLE_WITH_CUDA
   REGISTER_PUBLIC_GLOBAL_VAR(
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index f751136640caad6acd3230bc22cd0e3f0fafe9fb..d3052ebd351ef4844d7563935172ed4b7eb1654c 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -111,6 +111,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
     {"fake_quantize_dequantize_moving_average_abs_max",
      {"Out", "OutScale", "OutAccum", "OutState"}},
     {"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}},
+    {"fake_channel_wise_quantize_dequantize_abs_max", {"Out", "OutScale"}},
     {"check_finite_and_unscale", {"Out", "FoundInfinite"}},
     {"update_loss_scaling",
      {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}},
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 330254ecaafd29c00e8942765956ea065d2bb7cf..04087cb241c9cd4975773e646bc0ef6e1287518f 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <Python.h>
+
 #include <algorithm>
 #include <cstdlib>
 #include <map>
@@ -22,6 +23,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
@@ -2528,6 +2530,10 @@ All parameter, weight, gradient are variables in Paddle.
           "enable_inplace",
           [](const BuildStrategy &self) { return self.enable_inplace_; },
           [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; })
+      .def_property(
+          "enable_addto",
+          [](const BuildStrategy &self) { return self.enable_addto_; },
+          [](BuildStrategy &self, bool b) { self.enable_addto_ = b; })
       .def_property(
           "fuse_all_reduce_ops",
           [](const BuildStrategy &self) {
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index ec07565c5af6c7ba79c15d9a335313775719c682..ac89116fc499d456e1fab8db030eda1c8fce9de2 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -121,6 +121,18 @@ function cmake_base() {
             else
                 exit 1
             fi
+        elif [ "$1" == "cp38-cp38" ]; then
+            if [ -d "/Library/Frameworks/Python.framework/Versions/3.8" ]; then
+                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/
+                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/
+                export PATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/:${PATH}
+                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/python3
+            -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.8/include/python3.8/
+            -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/libpython3.8.dylib"
+                pip3.8 install --user -r ${PADDLE_ROOT}/python/requirements.txt
+            else
+                exit 1
+            fi
         fi
         # delete `gym` to avoid modifying requirements.txt in *.whl
         sed -i .bak "/^gym$/d" ${PADDLE_ROOT}/python/requirements.txt
@@ -176,6 +188,13 @@ function cmake_base() {
             -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m
             -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so"
                 pip3.7 install -r ${PADDLE_ROOT}/python/requirements.txt
+            elif [ "$1" == "cp38-cp38" ]; then
+                export LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH}
+                export PATH=/opt/_internal/cpython-3.8.0/bin/:${PATH}
+                export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.8.0/bin/python3.8
+            -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.8.0/include/python3.8
+            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.8.0/lib/libpython3.so"
+                pip3.8 install -r ${PADDLE_ROOT}/python/requirements.txt
            fi
         else
             pip install -r ${PADDLE_ROOT}/python/requirements.txt
@@ -514,6 +533,8 @@ EOF
             pip3.6 uninstall -y paddlepaddle
         elif [ "$1" == "cp37-cp37m" ]; then
             pip3.7 uninstall -y paddlepaddle
+        elif [ "$1" == "cp38-cp38" ]; then
+            pip3.8 uninstall -y paddlepaddle
         fi
         set -ex
 
@@ -527,6 +548,8 @@ EOF
             pip3.6 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
         elif [ "$1" == "cp37-cp37m" ]; then
             pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+        elif [ "$1" == "cp38-cp38" ]; then
+            pip3.8 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
         fi
         tmpfile_rand=`date +%s%N`
         tmpfile=$tmp_dir/$tmpfile_rand
@@ -621,6 +644,7 @@ function generate_upstream_develop_api_spec() {
     git checkout -b develop_base_pr upstream/$BRANCH
     cmake_gen $1
     build $2
+    cp ${PADDLE_ROOT}/python/requirements.txt /tmp
 
     git checkout $cur_branch
     generate_api_spec "$1" "DEV"
@@ -641,7 +665,12 @@ function generate_api_spec() {
     cd ${PADDLE_ROOT}/build/.check_api_workspace
     virtualenv .${spec_kind}_env
     source .${spec_kind}_env/bin/activate
-    pip install -r ${PADDLE_ROOT}/python/requirements.txt
+
+    if [ "$spec_kind" == "DEV" ]; then
+        pip install -r /tmp/requirements.txt
+    else
+        pip install -r ${PADDLE_ROOT}/python/requirements.txt
+    fi
     pip --no-cache-dir install ${PADDLE_ROOT}/build/python/dist/*whl
     spec_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.spec
     python ${PADDLE_ROOT}/tools/print_signatures.py paddle > $spec_path
@@ -660,7 +689,7 @@ function generate_api_spec() {
 
     awk -F '(' '{print $NF}' $spec_path >${spec_path}.doc
     awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api
-    if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then 
+    if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ] || [ "$1" == "cp38-cp38" ]; then
         # Use sed to make python2 and python3 sepc keeps the same
         sed -i 's/arg0: str/arg0: unicode/g' $spec_path
         sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" $spec_path
@@ -930,6 +959,10 @@ function parallel_test_base_gpu() {
 EOF
 
 set +x
+        precison_cases=""
+        if [ ${PRECISION_TEST:-OFF} == "ON" ]; then
+            precision_cases=`python $PADDLE_ROOT/tools/get_pr_ut.py`
+        fi
         EXIT_CODE=0;
         test_cases=$(ctest -N -V) # get all test cases
         exclusive_tests=''        # cases list which would be run exclusively
@@ -959,10 +992,23 @@ set +x
                     echo $testcase" will only run at night."
                     continue
                 fi
+                if [ ${PRECISION_TEST:-OFF} == "ON" ] && [[ "$precision_cases" != "" ]]; then
+                    will_test="false"
+                    for case in $precision_cases; do
+                        if [[ $testcase == $case ]]; then
+                            will_test="true"
+                            break
+                        fi
+                    done
+                    if [[ $will_test == "false" ]]; then
+                        echo $testcase" won't run in PRECISION_TEST mode."
+                        continue
+                    fi
+                fi
 
                 if [[ "$is_multicard" == "" ]]; then
                   # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs
-                  read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist")
+                  read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist_")
                 fi
 
                 if [[ "$is_exclusive" != "" ]]; then
@@ -1077,8 +1123,6 @@ set +x
                 done
         fi
 
-
-       
         if [[ "$EXIT_CODE" != "0" ]]; then
             if [[ "$failed_test_lists" == "" ]]; then
                 echo "========================================"
@@ -1223,21 +1267,25 @@ EOF
     ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
     ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
     ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
+    ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl
 
     ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl
     ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl
     ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl
     ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl
+    ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl
 
     if [[ ${PADDLE_BRANCH} != "0.0.0" && ${WITH_MKL} == "ON" && ${WITH_GPU} == "ON" ]]; then
         ref_paddle2=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl
         ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl
         ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl
         ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl
+        ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl
         ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl
         ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl
         ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl
         ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl
+        ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl
     fi
 
     #ref_paddle2_mv1=""
@@ -1342,6 +1390,22 @@ EOF
         apt-get clean -y && \
         rm -f ${ref_paddle37} && \
         ldconfig
+EOF
+    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    # run paddle version to install python packages first
+    RUN apt-get update && ${NCCL_DEPS}
+    RUN apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
+        libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \
+        xz-utils tk-dev libffi-dev liblzma-dev
+    RUN wget -q https://www.python.org/ftp/python/3.8.0/Python-3.8.0.tgz && \
+        tar -xzf Python-3.8.0.tgz && cd Python-3.8.0 && \
+        CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \
+        make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.8.0.tgz
+    RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \
+        pip3.8 install opencv-python && wget ${ref_web}/${ref_paddle38} && pip3.8 install ${ref_paddle38_whl}; apt-get install -f -y && \
+        apt-get clean -y && \
+        rm -f ${ref_paddle38} && \
+        ldconfig
 EOF
     cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
     # run paddle version to install python packages first
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 016726633ea355ed20149e94833ca7e1657c3f7d..661471599cb080da7a65c11fecc339830f2c00ee 100755
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -77,6 +77,7 @@ from .tensor.creation import triu  #DEFINE_ALIAS
 from .tensor.creation import tril  #DEFINE_ALIAS
 from .tensor.creation import meshgrid  #DEFINE_ALIAS
 from .tensor.creation import empty  #DEFINE_ALIAS
+from .tensor.creation import empty_like  #DEFINE_ALIAS
 from .tensor.linalg import matmul  #DEFINE_ALIAS
 from .tensor.linalg import dot  #DEFINE_ALIAS
 # from .tensor.linalg import einsum        #DEFINE_ALIAS
diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py
index 2539fa57a34b1fe6fdea6b6b847d52f765df3fa3..f3ee09a6d9ec1b22171253a920b26bbf98afd36e 100644
--- a/python/paddle/distributed/fleet/__init__.py
+++ b/python/paddle/distributed/fleet/__init__.py
@@ -39,6 +39,7 @@ server_num = fleet.server_num
 server_index = fleet.server_index
 server_endpoints = fleet.server_endpoints
 is_server = fleet.is_server
+set_util = fleet.set_util
 util = fleet.util
 barrier_worker = fleet.barrier_worker
 init_worker = fleet.init_worker
diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index 805c2d1fc734b0e2bfdfb26f44a4e712bcf9e9e4..d00faac838504f5d68e9d44d9ffa9f25c7bf2ee5 100644
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -180,6 +180,8 @@ class Fleet(object):
                 raise ValueError(
                     "`role_maker` should be subclass of `RoleMakerBase`, but got {}".
                     format(type(role_maker)))
+        self._role_maker._generate_role()
+
         self.strategy_compiler = StrategyCompiler()
         if paddle.fluid.framework.in_dygraph_mode():
             if parallel_helper._is_parallel_ctx_initialized():
@@ -187,7 +189,6 @@ class Fleet(object):
                     "The dygraph parallel environment has been initialized.")
             else:
                 paddle.distributed.init_parallel_env()
-        return None
 
     def is_first_worker(self):
         """
@@ -206,7 +207,7 @@ class Fleet(object):
                 fleet.is_first_worker()
 
         """
-        return self._role_maker.is_first_worker()
+        return self._role_maker._is_first_worker()
 
     def worker_index(self):
         """
@@ -223,7 +224,7 @@ class Fleet(object):
                 fleet.worker_index()
 
         """
-        return self._role_maker.worker_index()
+        return self._role_maker._worker_index()
 
     def worker_num(self):
         """
@@ -240,7 +241,7 @@ class Fleet(object):
                 fleet.worker_num()
 
         """
-        return self._role_maker.worker_num()
+        return self._role_maker._worker_num()
 
     def is_worker(self):
         """
@@ -258,7 +259,7 @@ class Fleet(object):
                 fleet.is_worker()
 
         """
-        return self._role_maker.is_worker()
+        return self._role_maker._is_worker()
 
     def worker_endpoints(self, to_string=False):
         """
@@ -275,13 +276,10 @@ class Fleet(object):
                 fleet.worker_endpoints()
 
         """
-        '''
         if to_string:
-            return ",".join(self._role_maker.get_trainer_endpoints())
+            return ",".join(self._role_maker._get_trainer_endpoints())
         else:
-            return self._role_maker.get_trainer_endpoints()
-        '''
-        return ["127.0.0.1:1001", "127.0.0.1:1002"]
+            return self._role_maker._get_trainer_endpoints()
 
     def server_num(self):
         """
@@ -296,7 +294,7 @@ class Fleet(object):
             fleet.init()
             fleet.server_num()
         """
-        return len(self._role_maker.get_pserver_endpoints())
+        return len(self._role_maker._get_pserver_endpoints())
 
     def server_index(self):
         """
@@ -313,7 +311,7 @@ class Fleet(object):
                 fleet.server_index()
 
         """
-        return self._role_maker.server_index()
+        return self._role_maker._server_index()
 
     def server_endpoints(self, to_string=False):
         """
@@ -332,9 +330,9 @@ class Fleet(object):
         """
 
         if to_string:
-            return ",".join(self._role_maker.get_pserver_endpoints())
+            return ",".join(self._role_maker._get_pserver_endpoints())
         else:
-            return self._role_maker.get_pserver_endpoints()
+            return self._role_maker._get_pserver_endpoints()
 
     def is_server(self):
         """
@@ -352,10 +350,12 @@ class Fleet(object):
                 fleet.is_server()
 
         """
-        return self._role_maker.is_server(
+        return self._role_maker._is_server(
         ) or self._role_maker._is_heter_worker()
 
-    @property
+    def set_util(self, util):
+        self._util = util
+
     def util(self):
         """
         Utility functions that can be used under certain runtime
@@ -376,16 +376,6 @@ class Fleet(object):
         """
         return self._util
 
-    @util.setter
-    def util(self, util):
-        """
-        Set Utility functions for userd-defined runtime
-
-        Returns:
-            None
-        """
-        self._util = util
-
     def barrier_worker(self):
         """
         barrier all workers
@@ -393,7 +383,7 @@ class Fleet(object):
         Returns:
             None
         """
-        self._role_maker.barrier_worker()
+        self._role_maker._barrier("worker")
 
     @is_non_distributed_check
     @inited_runtime_handler
diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
index 8614b1861343b8e48b55a8e75d9e432ef6329184..81d5908ccd4f859e36ae98a157fac2d37214c271 100644
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -13,18 +13,332 @@
 # limitations under the License.
 """Defination of Role Makers."""
 import os
+import time
 import numpy as np
 import warnings
 from multiprocessing import Process, Manager
-import paddle.fluid as fluid
 
-#__all__ = ['UserDefinedRoleMaker', 'PaddleCloudRoleMaker']
+import paddle.fluid as fluid
 
 
 class Role:
     WORKER = 1
     SERVER = 2
     HETER_WORKER = 3
+    ALL = 4
+
+
+class Gloo(object):
+    """
+    Gloo is a universal class for barrier and collective communication
+    """
+
+    class RENDEZVOUS:
+        HDFS = 1
+        FILE = 2
+        HTTP = 3
+
+    def __init__(self):
+        self._worker_comm = None
+        self._server_comm = None
+        self._nodes_comm = None
+
+        self._comm_world = ["worker", "server", "all"]
+        self._err_init = "gloo is not initialized, will not communicator with other nodes"
+        self._err_type = "gloo initialized error, please check arguments"
+        self._err_world = "argument error, comm_world must in {}".format(
+            self._comm_world)
+
+        self._is_initialized = False
+        self._init_timeout_seconds = 3600
+        self._run_timeout_seconds = 9999999
+
+        self._rendezvous = None
+        self._role = None
+        self._iface = None
+
+        self._role_id = -1
+        self._worker_num = -1
+        self._server_num = -1
+        self._need_init_all = False
+
+    def init(self,
+             rendezvous,
+             role,
+             role_id,
+             worker_num,
+             server_num,
+             need_init_all=False,
+             kwargs=None):
+
+        self._rendezvous = rendezvous
+        self._role = role
+        self._role_id = role_id
+        self._worker_num = worker_num
+        self._server_num = server_num
+        self._need_init_all = need_init_all
+        self._iface = self.__get_default_iface()
+        self._prefix = kwargs.get("store.prefix", "")
+
+        if self._rendezvous == Gloo.RENDEZVOUS.HDFS:
+            dfs_name = kwargs.get("dfs.name", "")
+            dfs_ugi = kwargs.get("dfs.ugi", "")
+            dfs_path = kwargs.get("dfs.path", "")
+
+            if not dfs_name or not dfs_ugi or not dfs_path:
+                raise ValueError(self._err_type)
+            self._init_dfs(dfs_name, dfs_ugi, dfs_path, self._prefix)
+
+        elif self._rendezvous == Gloo.RENDEZVOUS.FILE:
+            fs_path = kwargs.get("dfs.path", "")
+
+            if not fs_path:
+                raise ValueError(self._err_type)
+            self._init_fs(fs_path, self._prefix)
+
+        elif self._rendezvous == Gloo.RENDEZVOUS.HTTP:
+            ip = kwargs.get("http.host", "")
+            port = kwargs.get("http.port", "")
+
+            if not ip or not port:
+                raise ValueError(self._err_type)
+            self._init_http(ip, port, self._prefix)
+
+        else:
+            raise ValueError(self._err_type)
+
+        self._is_initialized = True
+
+    def _init_fs(self, fs_path, prefix):
+        def init(rank, nodes, role):
+            gloo = fluid.core.Gloo()
+            gloo.set_rank(rank)
+            gloo.set_size(nodes)
+            gloo.set_prefix(prefix)
+            gloo.set_iface(self._iface)
+            gloo.set_timeout_seconds(self._init_timeout_seconds,
+                                     self._run_timeout_seconds)
+            gloo.set_hdfs_store(os.path.join(fs_path, role), "", "")
+            gloo.init()
+            return gloo
+
+        if self._role == Role.WORKER:
+            rank, nodes = self._get_rank_nodes(Role.WORKER)
+            gloo = init(rank, nodes, "WORKER")
+            self._worker_comm = gloo
+        else:
+            rank, nodes = self._get_rank_nodes(Role.SERVER)
+            gloo = init(rank, nodes, "SERVER")
+            self._server_comm = gloo
+
+        if self._need_init_all:
+            rank, nodes = self._get_rank_nodes(Role.ALL)
+            gloo = init(rank, nodes, "ALL")
+            self._nodes_comm = gloo
+
+    def _init_dfs(self, dfs_name, dfs_ugi, dfs_path, prefix):
+        def init(rank, nodes, role):
+            gloo = fluid.core.Gloo()
+            gloo.set_rank(rank)
+            gloo.set_size(nodes)
+            gloo.set_prefix(prefix)
+            gloo.set_iface(self._iface)
+            gloo.set_timeout_seconds(self._init_timeout_seconds,
+                                     self._run_timeout_seconds)
+            gloo.set_hdfs_store(os.path.join(dfs_path, role), dfs_name, dfs_ugi)
+            gloo.init()
+            return gloo
+
+        if self._role == Role.WORKER:
+            rank, nodes = self._get_rank_nodes(Role.WORKER)
+            gloo = init(rank, nodes, "WORKER")
+            self._worker_comm = gloo
+        else:
+            rank, nodes = self._get_rank_nodes(Role.SERVER)
+            gloo = init(rank, nodes, "SERVER")
+            self._server_comm = gloo
+
+        if self._need_init_all:
+            rank, nodes = self._get_rank_nodes(Role.ALL)
+            gloo = init(rank, nodes, "ALL")
+            self._nodes_comm = gloo
+
+    def _init_http(self, ip, port, prefix):
+        def __start_kv_server(http_server_d, size_d):
+            from paddle.distributed.fleet.utils.http_server import KVServer
+            http_server = KVServer(port, size_d)
+            http_server.start()
+            wait_seconds = 5
+            while http_server_d.get("running",
+                                    False) and not http_server.shoud_stop():
+                time.sleep(wait_seconds)
+            http_server.stop()
+
+        def init_kv_server():
+            size_d = {
+                "trainer": self._worker_num,
+                "pserver": self._server_num,
+                "all": self._worker_num + self._server_num
+            }
+
+            _http_server_d = {"running": True}
+            # child process for http server
+            _http_server = Process(
+                target=__start_kv_server, args=(_http_server_d, size_d))
+            _http_server.daemon = True
+            # set running status to True
+            # start child process
+            _http_server.start()
+
+        def init(rank, nodes, role):
+            gloo = fluid.core.Gloo()
+            gloo.set_rank(rank)
+            gloo.set_size(nodes)
+            gloo.set_prefix(prefix)
+            gloo.set_iface(self._iface)
+            gloo.set_timeout_seconds(self._init_timeout_seconds,
+                                     self._run_timeout_seconds)
+            gloo.set_http_store(ip, port, role)
+            return gloo
+
+        port = int(port)
+
+        if self._role == Role.SERVER and self._role_id == 0:
+            init_kv_server()
+
+        if self._role == Role.WORKER:
+            rank, nodes = self._get_rank_nodes(Role.WORKER)
+            gloo = init(rank, nodes, "WORKER")
+            self._worker_comm = gloo
+        else:
+            rank, nodes = self._get_rank_nodes(Role.SERVER)
+            gloo = init(rank, nodes, "SERVER")
+            self._server_comm = gloo
+
+        if self._need_init_all:
+            rank, nodes = self._get_rank_nodes(Role.ALL)
+            gloo = init(rank, nodes, "ALL")
+            self._nodes_comm = gloo
+
+    def _get_rank_nodes(self, role):
+        nodes = 0
+        rank = -1
+
+        if role == Role.WORKER:
+            nodes = self._worker_num
+            rank = self._role_id
+        elif role == Role.SERVER:
+            nodes = self._server_num
+            rank = self._role_id
+        elif role == Role.ALL:
+            nodes = self._worker_num + self._server_num
+
+            if self._role == Role.WORKER:
+                rank = self._role_id
+            else:
+                rank = self._worker_num + self._role_id
+        else:
+            ValueError(self._err_type)
+
+        return rank, nodes
+
+    def __get_default_iface(self):
+        """
+        get default physical interface
+        """
+        default1 = self.__get_default_iface_from_gateway()
+        default2 = self.__get_default_iface_from_interfaces()
+        return default2 if default1 == "lo" else default1
+
+    def __get_default_iface_from_gateway(self):
+        """
+        get default physical interface
+        """
+        import netifaces
+        gateways = netifaces.gateways()
+        if gateways.get(netifaces.AF_INET) != None:
+            gateway = gateways[netifaces.AF_INET]
+            if len(gateway) > 0 and len(gateway[0]) > 1:
+                return gateway[0][1]
+        return "lo"
+
+    def __get_default_iface_from_interfaces(self):
+        """
+        get default physical interface
+        """
+        import netifaces
+        for intf_name in netifaces.interfaces():
+            addresses = netifaces.ifaddresses(intf_name)
+            if netifaces.AF_INET in addresses:
+                ipv4_addresses = addresses[netifaces.AF_INET]
+                for ipv4_address in ipv4_addresses:
+                    if 'broadcast' in ipv4_address:
+                        return intf_name
+        return "lo"
+
+    def barrier(self, comm_world):
+        """
+        dummy barrier, do nothing
+        """
+        if not self._is_initialized:
+            warnings.warn(self._err_init)
+            return
+
+        if comm_world not in self._comm_world:
+            raise ValueError(self._err_world)
+
+        if comm_world == "worker":
+            self._worker_comm.barrier()
+        elif comm_world == "server":
+            self._server_comm.barrier()
+        else:
+            self._nodes_comm.barrier()
+
+    def all_reduce(self, input, mode="sum", comm_world="worker"):
+        if not self._is_initialized:
+            warnings.warn(self._err_init)
+            return input
+
+        if comm_world not in self._comm_world:
+            raise ValueError(self._err_world)
+
+        input = np.array(input)
+        input_shape = input.shape
+        input_list = input.reshape(-1).tolist()
+
+        self.barrier(comm_world)
+
+        if comm_world == "worker":
+            ans = self._worker_comm.all_reduce(input_list, mode)
+        elif comm_world == "server":
+            ans = self._server_comm.all_reduce(input_list, mode)
+        else:
+            ans = self._nodes_comm.all_reduce(input_list, mode)
+
+        output = np.array(ans).reshape(input_shape)
+        return output
+
+    def all_gather(self, input, comm_world="worker"):
+        """
+        dummy all gather, do nothing
+        Args:
+            obj(any): obj to do all gather
+        """
+        if not self._is_initialized:
+            warnings.warn(self._err_init)
+            return input
+
+        if comm_world not in self._comm_world:
+            raise ValueError(self._err_world)
+
+        if comm_world == "worker":
+            output = self._worker_comm.all_gather(input)
+        elif comm_world == "server":
+            output = self._server_comm.all_gather(input)
+        else:
+            output = self._nodes_comm.all_gather(input)
+
+        return output
 
 
 class RoleMakerBase(object):
@@ -47,23 +361,19 @@ class RoleMakerBase(object):
         self._heter_trainer_device = "CPU"
         self._is_heter_parameter_server_mode = False
 
-        self._node_type = None
-        self._node_type_comm = None
-        self._all_comm = None
-
-    def is_worker(self):
+    def _is_worker(self):
         """
         return is_worker() of current process
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def is_server(self):
+    def _is_server(self):
         """
         return is_server() of current process
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def is_first_worker(self):
+    def _is_first_worker(self):
         """
         Check whether the node is the first instance of worker.
         Returns:
@@ -72,7 +382,7 @@ class RoleMakerBase(object):
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def worker_num(self):
+    def _worker_num(self):
         """
         Get current total worker number.
 
@@ -81,7 +391,7 @@ class RoleMakerBase(object):
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def server_num(self):
+    def _server_num(self):
         """
         Get current total server number.
 
@@ -90,7 +400,7 @@ class RoleMakerBase(object):
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def worker_index(self):
+    def _worker_index(self):
         """
         Get current worker id.
 
@@ -99,7 +409,7 @@ class RoleMakerBase(object):
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def server_index(self):
+    def _server_index(self):
         """
         Get current server id.
 
@@ -108,7 +418,7 @@ class RoleMakerBase(object):
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def role_id(self):
+    def _role_id(self):
         """
         Get current id.
 
@@ -117,7 +427,7 @@ class RoleMakerBase(object):
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def node_num(self):
+    def _node_num(self):
         """
         Get the training node number
         Returns:
@@ -125,13 +435,13 @@ class RoleMakerBase(object):
         """
         raise NotImplementedError("Please implement this method in child class")
 
-    def get_trainer_endpoints(self):
+    def _get_trainer_endpoints(self):
         """
         return trainer endpoints
         """
         return self._worker_endpoints
 
-    def get_pserver_endpoints(self):
+    def _get_pserver_endpoints(self):
         """
         return pserver endpoints
         """
@@ -142,19 +452,11 @@ class RoleMakerBase(object):
             self._role, self._current_id, self._worker_endpoints,
             self._server_endpoints)
 
-    def _all_gather(self, comm_world, input):
-        """
-
-        Args:
-            input(int|float): input value
-
-        Returns:
-            return a list of values
-        """
-        print("warning: RoleMakerBase does not have all gather.")
+    def _all_gather(self, input, comm_world="worker"):
+        print("warning: RoleMakerBase does not have all gather worker.")
         return None
 
-    def _all_reduce(self, comm_world, input, mode="sum"):
+    def _all_reduce(self, input, mode="sum", comm_world="worker"):
         """
         Args:
             input(list/numpy.array): array of one dim
@@ -221,158 +523,112 @@ class PaddleCloudRoleMaker(RoleMakerBase):
     def __init__(self, is_collective=False, **kwargs):
         super(PaddleCloudRoleMaker, self).__init__()
         self._is_collective = is_collective
-        self._init_gloo = False  # default no init gloo
-        self._kwargs = kwargs
 
+        self._non_distributed = False
+
+        self._kwargs = kwargs
         self._role_is_generated = False
 
         self._server_endpoints = None
         self._worker_endpoints = None
 
-        self._node_type_comm = None
-        self._all_comm = None
-
-        self._non_distributed = False
-
-        if not self._is_collective:
-            self._hdfs_name = kwargs.get("hdfs_name", "")
-            self._hdfs_ugi = kwargs.get("hdfs_ugi", "")
-            self._hdfs_path = kwargs.get("path", "").rstrip("/")
-            self._init_timeout_seconds = kwargs.get("init_timeout_seconds",
-                                                    3600)
-            self._run_timeout_seconds = kwargs.get("run_timeout_seconds",
-                                                   9999999)
-            ip_port = kwargs.get("http_ip_port", "")
-            self._http_ip_port = []
-            self._http_server = None
-            # if ip_port is not empty, it will use http instead of hdfs
-            if ip_port != "":
-                self._http_ip_port = ip_port.split(":")
-                # it's for communication between processes
-                self._manager = Manager()
-                # global dict to store status
-                self._http_server_d = self._manager.dict()
-                # set running status of http server
-                self._http_server_d["running"] = False
-            self._iface = self.__get_default_iface()
-            # this environment variable can be empty
-            self._prefix = os.getenv("SYS_JOB_ID", "")
+        self._gloo = Gloo()  # gloo instance
 
     def _barrier(self, comm_world):
-        if isinstance(comm_world, fluid.core.Gloo):
-            comm_world.barrier()
-        else:
-            print("warning: must init Gloo before using _barrier() function")
+        self._gloo.barrier(comm_world)
 
-    def _all_gather(self, comm_world, input):
-        if isinstance(comm_world, fluid.core.Gloo):
-            self._barrier(comm_world)
-            output = comm_world.all_gather(input)
-            return output
-        else:
-            print("warning: must init Gloo before using _all_gather() function")
-            return None
+    def _all_gather(self, input, comm_world="worker"):
+        return self._gloo.all_gather(input, comm_world)
 
-    def _all_reduce(self, comm_world, input, mode="sum"):
-        if isinstance(comm_world, fluid.core.Gloo):
+    def _all_reduce(self, input, mode="sum", comm_world="worker"):
+        return self._gloo.all_reduce(input, mode, comm_world)
 
-            input = np.array(input)
-
-            input_shape = input.shape
-            input_list = input.reshape(-1).tolist()
-
-            self._barrier(comm_world)
-            ans = comm_world.all_reduce(input_list, mode)
-            output = np.array(ans).reshape(input_shape)
-            return output
-        else:
-            print("warning: must init Gloo before using _all_reduce() function")
-            return None
-
-    def is_worker(self):
+    def _is_worker(self):
         """
         whether current process is worker
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._role == Role.WORKER
 
-    def is_server(self):
+    def _is_server(self):
         """
         whether current process is server
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._role == Role.SERVER
 
-    def is_first_worker(self):
+    def _is_first_worker(self):
         """
         whether current process is worker of rank 0
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._role == Role.WORKER and self._current_id == 0
 
-    def worker_index(self):
+    def _worker_index(self):
         """
         get index of current worker
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._current_id
 
-    def server_index(self):
+    def _server_index(self):
         """
         get index of current server
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._current_id
 
-    def role_id(self):
+    def _role_id(self):
         """
         get index of current node
         """
+        if not self._role_is_generated:
+            self._generate_role()
         return self._current_id
 
-    def worker_num(self):
+    def _worker_num(self):
         """
         retrun the current number of worker
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._trainers_num
 
-    def server_num(self):
+    def _server_num(self):
         """
         return the current number of server
         """
         if not self._role_is_generated:
-            self.generate_role()
-        return self._trainers_num
+            self._generate_role()
+        return len(self._get_pserver_endpoints())
 
-    def node_num(self):
+    def _node_num(self):
         """
         return the training node number
         """
         if not self._role_is_generated:
-            self.generate_role()
-        return self._node_num
+            self._generate_role()
+        return self._nodes_num
 
-    def get_trainer_endpoints(self):
+    def _get_trainer_endpoints(self):
         """
         get endpoint of all trainers
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._worker_endpoints
 
-    def get_pserver_endpoints(self):
+    def _get_pserver_endpoints(self):
         """
         get endpoint of all pservers
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._server_endpoints
 
     def _is_non_distributed(self):
@@ -381,7 +637,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         (use python-run to launch fleet-code directly)
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._non_distributed
 
     def _heter_worker_num(self):
@@ -389,7 +645,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         get heter worker nums
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._heter_trainers_num
 
     def _is_heter_worker(self):
@@ -397,45 +653,35 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         whether current process is heter worker
         """
         if not self._role_is_generated:
-            self.generate_role()
+            self._generate_role()
         return self._role == Role.HETER_WORKER
 
-    def _get_rank(self):
-        """
-        get current rank in all workers and pservers
-        """
-        if not self._role_is_generated:
-            self.generate_role()
-        return self._rank
-
-    def _get_size(self):
-        """
-        get total num of all workers and pservers
-        """
-        if not self._role_is_generated:
-            self.generate_role()
-        return self._size
-
     def _ps_env(self):
         try:
             # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
             # format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002
             self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST")
-            self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
-                                               "").split(",")
+
             if self._server_endpoints is None:
                 # back to non_distributed execution.
                 self._server_endpoints = ""
                 self._trainers_num = 1
                 self._role = Role.WORKER
                 self._current_id = 0
-                self._node_num = 1
+                self._nodes_num = 1
                 self._heter_trainers_num = 0
                 self._heter_trainer_endpoints = None
                 self._non_distributed = True
                 return
 
             self._server_endpoints = self._server_endpoints.split(",")
+
+            self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
+            if self._worker_endpoints:
+                self._worker_endpoints = self._worker_endpoints.split(",")
+            else:
+                self._worker_endpoints = []
+
             trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"])
             training_role = os.environ["TRAINING_ROLE"]
 
@@ -497,7 +743,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         self._trainers_num = trainers_num
         self._role = role
         self._current_id = current_id
-        self._node_num = len(
+        self._nodes_num = len(
             set([x.split(':')[0] for x in self._worker_endpoints]))
         self._heter_trainers_num = heter_trainers_num
         self._heter_trainer_endpoints = heter_trainer_eplist
@@ -506,6 +752,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
         self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
         self._training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
         assert (self._training_role == "TRAINER")
+        self._role = Role.WORKER
         self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
         self._cur_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
         if self._worker_endpoints is None:
@@ -515,142 +762,86 @@ class PaddleCloudRoleMaker(RoleMakerBase):
             self._non_distributed = True
         self._worker_endpoints = self._worker_endpoints.split(",")
         self._trainers_num = len(self._worker_endpoints)
-        self._node_num = len(
+        self._nodes_num = len(
             set([x.split(':')[0] for x in self._worker_endpoints]))
 
-    def _init_gloo_env(self):
-        def init_gloo_instance(role="trainer"):
-            role = role.lower()
-            assert role in ["trainer", "pserver", "all"]
-            if role == "trainer":
-                all_list = self._worker_endpoints
-                rank = self._current_id
-            elif role == "pserver":
-                all_list = self._server_endpoints
-                rank = self._current_id
-            else:
-                all_list = self._worker_endpoints + self._server_endpoints
-                rank = all_list.index(self._cur_endpoint)
-            gloo = fluid.core.Gloo()
-            gloo.set_rank(rank)
-            gloo.set_size(len(all_list))
-            gloo.set_prefix(self._prefix)
-            gloo.set_iface(self._iface)
-            gloo.set_timeout_seconds(self._init_timeout_seconds,
-                                     self._run_timeout_seconds)
-            if len(self._http_ip_port) != 0:
-                gloo.set_http_store(self._http_ip_port[0],
-                                    int(self._http_ip_port[1]), role)
-            else:
-                gloo.set_hdfs_store(self._hdfs_path + "/" + role,
-                                    self._hdfs_name, self._hdfs_ugi)
-            gloo.init()
-            return gloo
-
-        # paddlecloud support gloo
-        if self._role == Role.WORKER:
-            if self._current_id == 0 and len(self._http_ip_port) != 0:
-                size_d = {
-                    "trainer": len(self._worker_endpoints),
-                    "pserver": len(self._server_endpoints),
-                    "all":
-                    len(self._worker_endpoints) + len(self._server_endpoints)
-                }
-                # child process for http server
-                self._http_server = Process(
-                    target=self.__start_kv_server,
-                    args=(self._http_server_d, size_d))
-                self._http_server.daemon = True
-                # set running status to True
-                self._http_server_d["running"] = True
-                # start child process
-                self._http_server.start()
-            self._node_type = 1
-            gloo = init_gloo_instance("trainer")
-            self._node_type_comm = gloo
+    def _gloo_init(self):
+        # PADDLE_WITH_GLOO 1: trainer barrier, 2: all barrier
+        use_gloo = int(os.getenv("PADDLE_WITH_GLOO", "0"))
+        if use_gloo not in [1, 2]:
+            return
+
+        # PADDLE_GLOO_RENDEZVOUS 1: HDFS 2: FILE 3: HTTP
+        rendezvous_type = int(os.getenv("PADDLE_GLOO_RENDEZVOUS", "0"))
+        prefix = os.getenv("SYS_JOB_ID", "")
+        if rendezvous_type not in [
+                Gloo.RENDEZVOUS.HDFS, Gloo.RENDEZVOUS.HTTP, Gloo.RENDEZVOUS.FILE
+        ]:
+            raise ValueError(self._gloo._err_type)
+
+        need_init_all = True if use_gloo == 2 else False
+
+        if rendezvous_type == Gloo.RENDEZVOUS.HDFS:
+            dfs_name = os.getenv("PADDLE_GLOO_FS_NAME", "")
+            dfs_ugi = os.getenv("PADDLE_GLOO_FS_UGI", "")
+            dfs_path = os.getenv("PADDLE_GLOO_FS_PATH", "")
+            kwargs = {
+                "dfs.name": dfs_name,
+                "dfs.ugi": dfs_ugi,
+                "dfs.path": dfs_path,
+                "store.prefix": prefix,
+            }
+        elif rendezvous_type == Gloo.RENDEZVOUS.HTTP:
+            ip = os.getenv("PADDLE_GLOO_HTTP_HOST", "")
+            port = os.getenv("PADDLE_GLOO_HTTP_PORT", "")
+            kwargs = {
+                "http.host": ip,
+                "http.port": port,
+                "store.prefix": prefix,
+            }
         else:
-            assert self._role == Role.SERVER
-            self._node_type = 0
-            gloo = init_gloo_instance("pserver")
-            self._node_type_comm = gloo
-
-        all_list = self._worker_endpoints + self._server_endpoints
-        self._rank = all_list.index(self._cur_endpoint)
-        self._size = len(all_list)
-
-        gloo = init_gloo_instance("all")
-        self._all_comm = gloo
+            dfs_path = os.getenv("PADDLE_GLOO_FS_PATH", "")
+            kwargs = {
+                "dfs.path": dfs_path,
+                "store.prefix": prefix,
+            }
+
+        if rendezvous_type == Gloo.RENDEZVOUS.HDFS:
+            type = "HDFS"
+        elif rendezvous_type == Gloo.RENDEZVOUS.HTTP:
+            type = "HTTP"
+        else:
+            type = "FILE"
+        print("Gloo init with {}: need_init_all: {}, args: {}".format(
+            type, need_init_all, kwargs))
 
-        if self._http_server is not None:
-            # set running status to False
-            self._http_server_d["running"] = False
-            # wait until child process exits
-            self._http_server.join()
+        self._gloo.init(
+            rendezvous=rendezvous_type,
+            role=self._role,
+            role_id=self._role_id(),
+            worker_num=self._worker_num(),
+            server_num=self._server_num(),
+            need_init_all=need_init_all,
+            kwargs=kwargs)
 
-    def generate_role(self):
+    def _generate_role(self):
         """
         generate role for role maker
         """
         if not self._role_is_generated:
             if not self._is_collective:
                 self._ps_env()
-                if "PADDLE_WITH_GLOO" in os.environ:
-                    self._init_gloo = bool(os.environ["PADDLE_WITH_GLOO"])
-                if self._init_gloo:
-                    self._init_gloo_env()
             else:
                 self._collective_env()
             self._role_is_generated = True
-
-    def __get_default_iface(self):
-        """
-        get default physical interface
-        """
-        default1 = self.__get_default_iface_from_gateway()
-        default2 = self.__get_default_iface_from_interfaces()
-        return default2 if default1 == "lo" else default1
-
-    def __get_default_iface_from_gateway(self):
-        """
-        get default physical interface
-        """
-        import netifaces
-        gateways = netifaces.gateways()
-        if gateways.get(netifaces.AF_INET) != None:
-            gateway = gateways[netifaces.AF_INET]
-            if len(gateway) > 0 and len(gateway[0]) > 1:
-                return gateway[0][1]
-        return "lo"
-
-    def __get_default_iface_from_interfaces(self):
-        """
-        get default physical interface
-        """
-        import netifaces
-        for intf_name in netifaces.interfaces():
-            addresses = netifaces.ifaddresses(intf_name)
-            if netifaces.AF_INET in addresses:
-                ipv4_addresses = addresses[netifaces.AF_INET]
-                for ipv4_address in ipv4_addresses:
-                    if 'broadcast' in ipv4_address:
-                        return intf_name
-        return "lo"
-
-    def __start_kv_server(self, http_server_d, size_d):
-        from paddle.distributed.fleet.utils import KVServer
-        http_server = KVServer(int(self._http_ip_port[1]), size_d)
-        http_server.start()
-        wait_seconds = 5
-        while http_server_d.get("running",
-                                False) and not http_server.shoud_stop():
-            time.sleep(wait_seconds)
-        http_server.stop()
+            self._gloo_init()
 
 
 class UserDefinedRoleMaker(PaddleCloudRoleMaker):
     def __init__(self, is_collective=False, init_gloo=False, **kwargs):
         super(UserDefinedRoleMaker, self).__init__(
             is_collective=is_collective, init_gloo=init_gloo, **kwargs)
+        self._init_gloo = init_gloo
 
     def _user_defined_ps_env(self):
         self._server_endpoints = self._kwargs.get("server_endpoints")
@@ -669,26 +860,24 @@ class UserDefinedRoleMaker(PaddleCloudRoleMaker):
             self._cur_endpoint = self._worker_endpoints[self._current_id]
         elif self._role == Role.SERVER:
             self._cur_endpoint = self._server_endpoints[self._current_id]
-        self._node_num = len(
+        self._nodes_num = len(
             set([x.split(':')[0] for x in self._worker_endpoints]))
 
     def _user_defined_collective_env(self):
         self._worker_endpoints = self._kwargs.get("worker_endpoints")
         self._current_id = self._kwargs.get("current_id")
         self._trainers_num = len(self._worker_endpoints)
-        self._training_role = Role.Worker
-        self._node_num = len(
+        self._training_role = Role.WORKER
+        self._nodes_num = len(
             set([x.split(':')[0] for x in self._worker_endpoints]))
 
-    def generate_role(self):
+    def _generate_role(self):
         """
         generate role for role maker
         """
         if not self._role_is_generated:
             if not self._is_collective:
                 self._user_defined_ps_env()
-                if self._init_gloo:
-                    self._init_gloo_env()
             else:
                 self._user_defined_collective_env()
             self._role_is_generated = True
diff --git a/python/paddle/distributed/fleet/base/util_factory.py b/python/paddle/distributed/fleet/base/util_factory.py
index f5a6c417c0c45bea819c5832f98b5b6c9fabbd4b..efaa854c0879ddb57c7746cede68047ff82931a0 100644
--- a/python/paddle/distributed/fleet/base/util_factory.py
+++ b/python/paddle/distributed/fleet/base/util_factory.py
@@ -16,20 +16,18 @@
 """basic collective operations in python"""
 """remote file system"""
 
-__all__ = ['UtilBase']
-
-import numpy as np
-import os
-
-import subprocess
-from paddle.fluid import core
-from collections import OrderedDict
-import paddle.fluid as fluid
-from google.protobuf import text_format
-from paddle.fluid import debugger
-from paddle.fluid.framework import Program
-from paddle.fluid.proto import framework_pb2
 from ..utils.fs import FS, LocalFS, HDFSClient
+from paddle.fluid.proto import framework_pb2
+from paddle.fluid.framework import Program
+from paddle.fluid import debugger
+from google.protobuf import text_format
+import paddle.fluid as fluid
+from collections import OrderedDict
+from paddle.fluid import core
+import subprocess
+import os
+import numpy as np
+__all__ = ['UtilBase']
 
 
 class UtilFactory(object):
@@ -53,76 +51,194 @@ class UtilBase(object):
     def _set_role_maker(self, role_maker):
         self.role_maker = role_maker
 
-    def set_file_system(self, fs_client):
+    def _set_file_system(self, fs_client):
         assert isinstance(
             fs_client, FS
         ), "fs_client must be the instance of paddle.distributed.fleet.utils.FS"
         self.fs_client = fs_client
 
-    def __check_comm_world(self, comm_world="worker"):
-        if not self.role_maker._role_is_generated:
-            self.role_maker.generate_role()
-
-        _comm_world = None
-        comm_world_upper = comm_world.upper()
-        if comm_world_upper == "WORKER":
-            if not self.role_maker.is_worker():
-                print(
-                    "warning: current role is not worker in collective_func(comm_world=\"worker\")"
-                )
-            _comm_world = self.role_maker._node_type_comm
-        elif comm_world_upper == "SERVER":
-            if not self.role_maker.is_server():
-                print(
-                    "warning: current role is not server in collective_func(comm_world=\"server\")"
-                )
-            _comm_world = self.role_maker._node_type_comm
-        elif comm_world_upper == "ALL":
-            _comm_world = self.role_maker._all_comm
-        else:
-            raise ValueError(
-                "not support comm_world, please choose one from [worker, server, all]"
-            )
+    def all_reduce(self, input, mode="sum", comm_world="worker"):
+        """
+        All reduce `input` between specified collection. This is a distributed API.
 
-        return _comm_world
+        Args:
+            input (list|numpy.array): The input variable to do all_reduce between specified collection.
+            mode (str): "sum" or "min" or "max".
+            comm_world (str, optional): Collection used to execute all_reduce operation. Supported collections incude `worker` , `server` and `all` . The default is `worker` .
 
-    def all_reduce(self, input, mode, comm_world="worker"):
-        _comm_world = self.__check_comm_world(comm_world)
-        return self.role_maker._all_reduce(_comm_world, input, mode)
+        Returns:
+            output(Numpy.array|None): A numpy array with the same shape as the `input` .
+
+        Examples:
+            .. code-block:: python
+
+                # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` .
+                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet as fleet
+                from paddle.distributed.fleet import PaddleCloudRoleMaker
+                import sys
+                import numpy as np
+
+                def train():
+                    role = PaddleCloudRoleMaker(
+                        is_collective=False,
+                        init_gloo=True,
+                        path="./tmp_gloo")
+                    fleet.init(role)
+                    fleet_util._set_role_maker(role)
+
+                    if fleet.is_server():
+                        input = [1, 2]
+                        output = fleet_util.all_reduce(input, "sum", "server")
+                        print(output)
+                        # [2, 4]
+                    elif fleet.is_worker():
+                        input = np.array([3, 4])
+                        output = fleet_util.all_reduce(input, "sum", "worker")
+                        print(output)
+                        # [6, 8]
+                    output = fleet_util.all_reduce(input, "sum", "all")
+                    print(output)
+                    # [8, 12]
+                if __name__ == "__main__":
+                    train()
+        """
+        return self.role_maker._all_reduce(input, mode, comm_world)
 
     def barrier(self, comm_world="worker"):
-        _comm_world = self.__check_comm_world(comm_world)
-        self.role_maker._barrier(_comm_world)
+        """
+        Barrier between specified collection.
+
+        Args:
+            comm_world (str, optional): Collection used to execute barrier operation. Supported collections incude `worker` , `server` and `all` . The default is `worker` .
+
+        Examples:
+
+            .. code-block:: python
+                # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` .
+
+                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet as fleet
+                from paddle.distributed.fleet import PaddleCloudRoleMaker
+                import sys
+
+                def train():
+                    role = PaddleCloudRoleMaker(
+                        is_collective=False,
+                        init_gloo=True,
+                        path="./tmp_gloo")
+                    fleet.init(role)
+                    fleet_util._set_role_maker(role)
+
+                    if fleet.is_server():
+                        fleet_util.barrier("server")
+                        print("all server arrive here")
+                    elif fleet.is_worker():
+                        fleet_util.barrier("worker")
+                        print("all server arrive here")
+                    fleet_util.barrier("all")
+                    print("all servers and workers arrive here")
+
+                if __name__ == "__main__":
+                    train()
+        """
+        self.role_maker._barrier(comm_world)
 
     def all_gather(self, input, comm_world="worker"):
-        _comm_world = self.__check_comm_world(comm_world)
-        return self.role_maker._all_gather(_comm_world, input)
+        """
+        All gather `input` between specified collection.
+
+        Args:
+            input (Int|Float): The input variable to do all_gather between specified collection.
+            comm_world (str, optional): Collection used to execute all_reduce operation. Supported collections incude `worker` , `server` and `all` . The default is `worker` .
+
+        Returns:
+            output (List): A list of gathered values.
+
+        Examples:
+
+            .. code-block:: python
+
+                # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` .
+                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet as fleet
+                from paddle.distributed.fleet import PaddleCloudRoleMaker
+                import sys
+
+                def train():
+                    role = PaddleCloudRoleMaker(
+                        is_collective=False,
+                        init_gloo=True,
+                        path="./tmp_gloo")
+                    fleet.init(role)
+                    fleet_util._set_role_maker(role)
+
+                    if fleet.is_server():
+                        input = fleet.server_index()
+                        output = fleet_util.all_gather(input, "server")
+                        print(output)
+                        # output = [0, 1]
+                    elif fleet.is_worker():
+                        input = fleet.worker_index()
+                        output = fleet_util.all_gather(input, "worker")
+                        # output = [0, 1]
+                        print(output)
+                    output = fleet_util.all_gather(input, "all")
+                    print(output)
+                    # output = [0, 1, 0, 1]
+
+                if __name__ == "__main__":
+                    train()
+        """
+
+        return self.role_maker._all_gather(input, comm_world)
 
-    def broadcast(self):
+    def _broadcast(self):
         pass
 
-    def scatter(self):
+    def _scatter(self):
         pass
 
     def get_file_shard(self, files):
         """
-        split files before distributed training,
-        example 1: files is [a, b, c ,d, e]  and trainer_num = 2, then trainer
-                   0 gets [a, b, c] and trainer 1 gets [d, e].
-        example 2: files is [a, b], and trainer_num = 3, then trainer 0 gets
-                   [a], trainer 1 gets [b],  trainer 2 gets []
+        Split files before distributed training, and return filelist assigned to the current trainer.
+
+        .. code-block:: text
+
+            example 1: files is [a, b, c ,d, e]  and trainer_num = 2, then trainer
+                    0 gets [a, b, c] and trainer 1 gets [d, e].
+            example 2: files is [a, b], and trainer_num = 3, then trainer 0 gets
+                    [a], trainer 1 gets [b],  trainer 2 gets []
 
         Args:
-            files(list): file list need to be read.
+            files(list): File list need to be read.
 
         Returns:
-            list: files belongs to this worker.
+            List: Files belong to this worker.
+
+        Examples:
+
+            .. code-block:: python
+
+                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet.base.role_maker as role_maker
+
+                role = role_maker.UserDefinedRoleMaker(
+                    is_collective=False,
+                    init_gloo=False,
+                    current_id=0,
+                    role=role_maker.Role.WORKER,
+                    worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
+                    server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
+                fleet_util._set_role_maker(role)
+                files = fleet_util.get_file_shard(["file1", "file2", "file3"])
+                # files = ["file1", "file2"]
         """
         if not isinstance(files, list):
             raise TypeError("files should be a list of file need to be read.")
 
-        trainer_id = self.role_maker.worker_index()
-        trainers = self.role_maker.worker_num()
+        trainer_id = self.role_maker._worker_index()
+        trainers = self.role_maker._worker_num()
 
         remainder = len(files) % trainers
         blocksize = int(len(files) / trainers)
@@ -140,7 +256,31 @@ class UtilBase(object):
         return trainer_files[trainer_id]
 
     def print_on_rank(self, message, rank_id):
-        if self.role_maker.worker_index() != rank_id:
+        """
+        Woker of rank `rank_id` print some message. 
+
+        Args:
+            message(str): Log to be printed.
+            rank_id(int): trainer id.
+
+        Examples:
+
+            .. code-block:: python
+
+                from paddle.distributed.fleet.base.util_factory import fleet_util
+                import paddle.distributed.fleet.base.role_maker as role_maker
+
+                role = role_maker.UserDefinedRoleMaker(
+                    is_collective=False,
+                    init_gloo=False,
+                    current_id=0,
+                    role=role_maker.Role.WORKER,
+                    worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
+                    server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
+                fleet_util._set_role_maker(role)
+                fleet_util.print_on_rank("I'm worker 0", 0)
+        """
+        if self.role_maker._worker_index() != rank_id:
             return
         print(message)
 
@@ -297,7 +437,7 @@ class UtilBase(object):
         with fluid.scope_guard(scope):
             inference_program, feed_target_names, fetch_targets = \
                 fluid.io.load_inference_model(config.dump_model_dir, exe, model_filename=model_filename,
-                                            params_filename=config.save_params_filename)
+                                              params_filename=config.save_params_filename)
 
             # check program vars and saved vars shape
             orig_para_shape = {
diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index 6dba385c569be75b5b83e0a63e560ffa8ab73696..4b629bc35ce59da9af0b72a2ab4ee44e587a86f1 100644
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -55,7 +55,10 @@ launch a process on each of the given gpu card or cpu machine.
 """
 
 from __future__ import print_function
+
+import shutil
 import sys
+import tempfile
 from sys import version
 import subprocess
 import os
@@ -87,7 +90,7 @@ def _parse_args():
 see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
 ''')
 
-    #Optional arguments for the launch helper
+    # Optional arguments for the launch helper
     parser.add_argument(
         "--ips",
         type=str,
@@ -115,7 +118,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
         default="log",
         help="The path for each process's log.If it's not set, the log will printed to default pipe."
     )
-    #positional
+    # positional
     parser.add_argument(
         "training_script",
         type=str,
@@ -124,7 +127,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
         "followed by all the arguments for the "
         "training script")
 
-    #rest from the training program
+    # rest from the training program
     parser.add_argument('training_script_args', nargs=REMAINDER)
     return parser.parse_args()
 
@@ -138,7 +141,7 @@ def get_cluster_from_args(args, gpus):
 
     # node_ip = args.node_ip
     assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \
-                % (node_ip, node_ips)
+        % (node_ip, node_ips)
     node_rank = node_ips.index(node_ip)
 
     logger.debug("parsed from args: node_ips:{} node_ip:{} node_rank:{}".format(
@@ -213,12 +216,20 @@ def launch_collective(args):
         cluster, pod = get_cluster_from_args(args, gpus)
         logger.debug("get cluster from args:{}".format(cluster))
 
+    global_envs = copy.copy(os.environ.copy())
+    gloo_rendezvous_dir = tempfile.mkdtemp()
+    # add gloo env
+    global_envs["PADDLE_WITH_GLOO"] = "1"
+    global_envs["PADDLE_GLOO_RENDEZVOUS"] = "2"
+    global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir
+
     procs = start_local_trainers(
         cluster,
         pod,
         training_script=args.training_script,
         training_script_args=args.training_script_args,
-        log_dir=args.log_dir)
+        log_dir=args.log_dir,
+        envs=global_envs)
 
     while True:
         alive = watch_local_trainers(procs, cluster.trainers_nranks())
@@ -230,6 +241,9 @@ def launch_collective(args):
 
         time.sleep(3)
 
+    if os.path.exists(gloo_rendezvous_dir):
+        shutil.rmtree(gloo_rendezvous_dir)
+
 
 def launch_ps(args):
     ports = None
@@ -280,7 +294,7 @@ def launch_ps(args):
         _, current_node_ip = get_host_name_ip()
 
     assert current_node_ip in node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \
-                % (current_node_ip, node_ips)
+        % (current_node_ip, node_ips)
     node_rank = node_ips.index(current_node_ip)
     logger.debug(
         "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}, server_ports:{}".
@@ -315,6 +329,13 @@ def launch_ps(args):
 
     default_env = os.environ.copy()
     current_env = copy.copy(default_env)
+
+    gloo_rendezvous_dir = tempfile.mkdtemp()
+    # add gloo env
+    current_env["PADDLE_WITH_GLOO"] = "1"
+    current_env["PADDLE_GLOO_RENDEZVOUS"] = "2"
+    current_env["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir
+
     current_env.pop("http_proxy", None)
     current_env.pop("https_proxy", None)
     procs = []
@@ -323,10 +344,12 @@ def launch_ps(args):
     for idx, cur_server in enumerate(pod.servers):
         proc_env = {
             "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints,
+            "PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
             "PADDLE_PORT": cur_server.endpoint.split(":")[1],
             "TRAINING_ROLE": "PSERVER",
             "PADDLE_TRAINERS_NUM": str(worker_num),
-            "POD_IP": cur_server.endpoint.split(":")[0]
+            "POD_IP": cur_server.endpoint.split(":")[0],
+            "PADDLE_WITH_GLOO": "1"
         }
         current_env.update(proc_env)
 
@@ -365,7 +388,8 @@ def launch_ps(args):
             "PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
             "PADDLE_TRAINERS_NUM": str(worker_num),
             "TRAINING_ROLE": "TRAINER",
-            "PADDLE_TRAINER_ID": str(cur_worker.rank)
+            "PADDLE_TRAINER_ID": str(cur_worker.rank),
+            "PADDLE_WITH_GLOO": "1"
         }
         current_env.update(proc_env)
 
@@ -416,6 +440,9 @@ def launch_ps(args):
         procs[i].proc.terminate()
     print("all parameter server are killed", file=sys.stderr)
 
+    if os.path.exists(gloo_rendezvous_dir):
+        shutil.rmtree(gloo_rendezvous_dir)
+
 
 def launch():
     args = _parse_args()
@@ -430,7 +457,11 @@ def launch():
         co_arg for co_arg in collective_args
         if co_arg in " ".join(sys.argv[1:-1])
     ]
-    cuda_device_num = fluid.core.get_cuda_device_count()
+    if fluid.core.is_compiled_with_cuda():
+        cuda_device_num = fluid.core.get_cuda_device_count()
+    else:
+        cuda_device_num = 0
+
     if len(has_ps_args) > 0 or cuda_device_num == 0:
         logger.info(
             "Run parameter-sever cpu mode. pserver arguments:{}, cuda count:{}".
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index b6f4c75a276920f966a6b324a9bea16148bf337c..17d3b96cf4466e560381c20fe265b39cac6697f0 100644
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -398,8 +398,14 @@ def start_local_trainers(cluster,
                          pod,
                          training_script,
                          training_script_args,
-                         log_dir=None):
-    current_env = copy.copy(os.environ.copy())
+                         log_dir=None,
+                         envs=None):
+
+    if envs is None:
+        current_env = copy.copy(os.environ.copy())
+    else:
+        current_env = copy.copy(envs)
+
     #paddle broadcast ncclUniqueId use socket, and
     #proxy maybe make trainers unreachable, so delete them.
     #if we set them to "", grpc will log error message "bad uri"
diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py
index 70b010978bb4d5be98310efa8ff04a3f853602ab..8ff4114bf8eda4080c252a736d7b6ee69990faa4 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/common.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/common.py
@@ -57,12 +57,12 @@ class CollectiveHelper(object):
         if startup_program is None:
             self.startup_program = fluid.default_startup_program()
 
-        endpoints = self.role_maker.get_trainer_endpoints()
-        current_endpoint = endpoints[self.role_maker.worker_index()]
+        endpoints = self.role_maker._get_trainer_endpoints()
+        current_endpoint = endpoints[self.role_maker._worker_index()]
         for ring_id in range(self.nrings):
             self._init_communicator(
                 self.startup_program, current_endpoint, endpoints,
-                self.role_maker.worker_index(), ring_id, self.wait_port)
+                self.role_maker._worker_index(), ring_id, self.wait_port)
         self._broadcast_params()
 
     def _init_communicator(self, program, current_endpoint, endpoints, rank,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
index 3f6ed1ed2f23d4595b3aadff6f259f9e27f129b2..6806a479d30f467bd8b6f6d5c6832dda63af4055 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
@@ -47,7 +47,7 @@ class DGCOptimizer(MetaOptimizerBase):
             sparsity=configs['sparsity'],
             parameter_list=opt._parameter_list,
             use_nesterov=opt._use_nesterov,
-            num_trainers=self.role_maker.worker_num(),
+            num_trainers=self.role_maker._worker_num(),
             regularization=opt.regularization,
             grad_clip=opt._grad_clip,
             name=opt._name)
@@ -60,7 +60,7 @@ class DGCOptimizer(MetaOptimizerBase):
             if not isinstance(self.inner_opt, Momentum):
                 logging.warn("dgc only works on Momentum optimizer")
                 return False
-            if self.role_maker.worker_num() <= 1:
+            if self.role_maker._worker_num() <= 1:
                 logging.warn("dgc only works on multi cards")
                 return False
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
index 6c1cc3d7a9769a5c61997ab761a5458b7e8df4a3..0ad9e5680eab4a1beb340359e1af44fce9217097 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
@@ -50,12 +50,12 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
 
     # should fix the variable
     def _setup_nccl_op(self, startup_program, main_program, build_strategy):
-        trainer_endpoints = self.role_maker.get_trainer_endpoints()
+        trainer_endpoints = self.role_maker._get_trainer_endpoints()
         trainers = trainer_endpoints
-        trainer_id = self.role_maker.worker_index()
-        current_endpoint = self.role_maker.get_trainer_endpoints()[trainer_id]
+        trainer_id = self.role_maker._worker_index()
+        current_endpoint = self.role_maker._get_trainer_endpoints()[trainer_id]
         trainer_endpoints_env = ",".join(trainer_endpoints)
-        trainers_num = self.role_maker.worker_num()
+        trainers_num = self.role_maker._worker_num()
         nccl_id_var = startup_program.global_block().create_var(
             name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
         for i in range(1, build_strategy.nccl_comm_num):
@@ -127,8 +127,8 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
             local_build_strategy.enable_sequential_execution = True
 
         exe_strategy = self.user_defined_strategy.execution_strategy
-        worker_num = self.role_maker.worker_num()
-        node_num = self.role_maker.node_num()
+        worker_num = self.role_maker._worker_num()
+        node_num = self.role_maker._node_num()
 
         if self.role_maker._is_collective:
             assert worker_num >= 1, "nccl2 worker_num must >= 1, now:{}" % worker_num
@@ -170,9 +170,9 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
         # TODO(guru4elephant): should be an independent optimizer
         self._setup_nccl_op(startup_program, main_program, local_build_strategy)
 
-        local_build_strategy.num_trainers = self.role_maker.worker_num()
-        local_build_strategy.trainer_id = self.role_maker.worker_index()
-        local_build_strategy.trainers_endpoints = self.role_maker.get_trainer_endpoints(
+        local_build_strategy.num_trainers = self.role_maker._worker_num()
+        local_build_strategy.trainer_id = self.role_maker._worker_index()
+        local_build_strategy.trainers_endpoints = self.role_maker._get_trainer_endpoints(
         )
         local_build_strategy.enable_backward_optimizer_op_deps = True
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
index 4ebac20888dd708bd90f91abdef4a472bac2847c..9f094978d842a8ba194742b527dc6f3cd19234cd 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
@@ -38,7 +38,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
         if not self.user_defined_strategy.localsgd:
             return False
 
-        if self.role_maker.worker_num() <= 1:
+        if self.role_maker._worker_num() <= 1:
             return False
 
         return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \
@@ -168,7 +168,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
                         inputs={'X': [param]},
                         outputs={'Out': [param]},
                         attrs={
-                            'scale': 1.0 / self.role_maker.worker_num(),
+                            'scale': 1.0 / self.role_maker._worker_num(),
                             OP_ROLE_KEY: OpRole.Optimize
                         })
                     sub_block.append_op(
@@ -208,7 +208,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
         if not self.user_defined_strategy.adaptive_localsgd:
             return False
 
-        if self.role_maker.worker_num() <= 1:
+        if self.role_maker._worker_num() <= 1:
             return False
 
         return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \
@@ -275,7 +275,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
             inputs={'X': [avg_loss]},
             outputs={'Out': [avg_loss]},
             attrs={
-                'scale': 1.0 / self.role_maker.worker_num(),
+                'scale': 1.0 / self.role_maker._worker_num(),
                 OP_ROLE_KEY: OpRole.Optimize
             })
 
@@ -398,7 +398,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
                         inputs={'X': [param]},
                         outputs={'Out': [param]},
                         attrs={
-                            'scale': 1.0 / self.role_maker.worker_num(),
+                            'scale': 1.0 / self.role_maker._worker_num(),
                             OP_ROLE_KEY: OpRole.Optimize
                         })
                     sub_block.append_op(
diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py
index 7dc532c86ea681d8479710732ec33e96c58c35d5..dfa765364f357b6e685c3983c73cfb4f1b2cce61 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py
@@ -31,7 +31,7 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer):
         if k_steps < 0:
             return False
 
-        if self.role_maker.is_server():
+        if self.role_maker._is_server():
             return False
 
         if self.role_maker._is_heter_parameter_server_mode:
diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
index 51d4d343165b9057c803a22aa428081109d7d35f..38ad41f8836b4e8c3b304dbf539b47d5293a8221 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
@@ -239,10 +239,10 @@ class ParameterServerOptimizer(MetaOptimizerBase):
                                                      strategy, self.role_maker)
         compiled_config.strategy = strategy
 
-        if self.role_maker.is_worker() or self.role_maker._is_heter_worker():
+        if self.role_maker._is_worker() or self.role_maker._is_heter_worker():
             main_program, startup_program = self._build_trainer_programs(
                 compiled_config)
-        elif self.role_maker.is_server():
+        elif self.role_maker._is_server():
             main_program, startup_program = self._build_pserver_programs(
                 compiled_config)
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
index 87fa70779111ea485319f50b58901c605fffa23c..889fec838ed3d6dc83d2c15e92138f49e62f01dd 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
@@ -126,11 +126,11 @@ class PipelineOptimizer(MetaOptimizerBase):
         optimize_ops, params_grads, prog_list = \
             self.wrapped_opt.minimize(loss, startup_program,
                                       parameter_list, no_grad_set)
-        if self.role_maker.worker_num() == 1:
+        if self.role_maker._worker_num() == 1:
             return optimize_ops, params_grads
 
-        endpoints = self.role_maker.get_trainer_endpoints()
-        current_endpoint = endpoints[self.role_maker.worker_index()]
+        endpoints = self.role_maker._get_trainer_endpoints()
+        current_endpoint = endpoints[self.role_maker._worker_index()]
         self.startup_program = startup_program
         if startup_program is None:
             self.startup_program = fluid.default_startup_program()
@@ -142,7 +142,7 @@ class PipelineOptimizer(MetaOptimizerBase):
         self.nranks = nranks
         self.nrings = len(self.main_program_list)
 
-        self.rank = self.role_maker.worker_index()
+        self.rank = self.role_maker._worker_index()
         self.endpoints = endpoints
         self.current_endpoint = current_endpoint
 
diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
index 227f8f60210ee8a44ab9e87ed7b88337c79ac7f1..ae5c53b8a37c4958e58ed5b09ce7cc8194f1ff52 100644
--- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
+++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
@@ -104,9 +104,9 @@ class ParameterServerRuntime(RuntimeBase):
     def _init_worker(self):
         def sync_strategy_envs():
             kwargs = {}
-            kwargs["pserver_endpoints"] = self.role_maker.get_pserver_endpoints(
-            )
-            kwargs["trainer_id"] = self.role_maker.worker_index()
+            kwargs[
+                "pserver_endpoints"] = self.role_maker._get_pserver_endpoints()
+            kwargs["trainer_id"] = self.role_maker._worker_index()
             return kwargs
 
         def geo_strategy_envs():
@@ -150,7 +150,7 @@ class ParameterServerRuntime(RuntimeBase):
                 return "#".join(init_attrs)
 
             kwargs = {}
-            kwargs["trainers"] = self.role_maker.worker_num()
+            kwargs["trainers"] = self.role_maker._worker_num()
             kwargs["sparse_attrs"] = get_sparse_attrs()
             return kwargs
 
@@ -338,7 +338,7 @@ class ParameterServerRuntime(RuntimeBase):
                 block.append_op(
                     type='recv_save',
                     attrs={
-                        "trainer_id": self.role_maker.worker_index(),
+                        "trainer_id": self.role_maker._worker_index(),
                         "shape": var.shape,
                         "slice_shapes":
                         [",".join([str(i) for i in var.shape])],
@@ -378,14 +378,15 @@ class ParameterServerRuntime(RuntimeBase):
             block.append_op(
                 type='recv_save',
                 attrs={
-                    "trainer_id": self.role_maker.worker_index(),
+                    "trainer_id": self.role_maker._worker_index(),
                     "shape": var.shape,
                     "slice_shapes": slice_shapes,
                     "slice_varnames": var_ctx.split_varnames(),
                     "remote_varnames": var_ctx.split_varnames(),
                     "is_sparse": True,
                     "endpoints": var_ctx.split_endpoints(),
-                    "pserver_num": len(self.role_maker.get_pserver_endpoints()),
+                    "pserver_num":
+                    len(self.role_maker._get_pserver_endpoints()),
                     "file_path": os.path.join(dirname, var.name)
                 })
 
@@ -403,7 +404,7 @@ class ParameterServerRuntime(RuntimeBase):
                 block.append_op(
                     type='recv_save',
                     attrs={
-                        "trainer_id": self.role_maker.worker_index(),
+                        "trainer_id": self.role_maker._worker_index(),
                         "shape": var.shape,
                         "slice_shapes": slice_shapes,
                         "slice_varnames": slice_varnames,
@@ -411,7 +412,7 @@ class ParameterServerRuntime(RuntimeBase):
                         "is_sparse": True,
                         "endpoints": var_ctx.split_endpoints(),
                         "pserver_num":
-                        len(self.role_maker.get_pserver_endpoints()),
+                        len(self.role_maker._get_pserver_endpoints()),
                         "file_path": os.path.join(dirname, var.name)
                     })
 
@@ -422,7 +423,7 @@ class ParameterServerRuntime(RuntimeBase):
                 block.append_op(
                     type='recv_save',
                     attrs={
-                        "trainer_id": self.role_maker.worker_index(),
+                        "trainer_id": self.role_maker._worker_index(),
                         "shape": var.shape,
                         "slice_shapes":
                         [",".join([str(i) for i in var.shape])],
diff --git a/python/paddle/distributed/fleet/utils/__init__.py b/python/paddle/distributed/fleet/utils/__init__.py
index f1911408c84a9dde56a8674e88e0fb8ad575cae7..abf198b97e6e818e1fbe59006f98492640bcee54 100644
--- a/python/paddle/distributed/fleet/utils/__init__.py
+++ b/python/paddle/distributed/fleet/utils/__init__.py
@@ -11,8 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from .fs import *
-from .http_server import KVHandler, KVHTTPServer, KVServer
-
-#__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__
diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py
index 966b7219d609d5da71a466d3bda86b13408be281..b7c50bda3eae0a004bd3d0169fc62260393b28b8 100644
--- a/python/paddle/distributed/fleet/utils/fs.py
+++ b/python/paddle/distributed/fleet/utils/fs.py
@@ -32,10 +32,7 @@ import functools
 from pathlib import PurePosixPath, Path
 import shutil
 
-__all__ = [
-    'FS', 'LocalFS', 'HDFSClient', 'ExecuteError', 'FSTimeOut',
-    'FSFileExistsError', 'FSFileNotExistsError', 'FSShellCmdAborted'
-]
+__all__ = ['LocalFS', 'HDFSClient']
 
 
 class ExecuteError(Exception):
@@ -117,7 +114,37 @@ class FS(object):
 
 
 class LocalFS(FS):
+    """
+    A tool of local file system.
+
+    Examples:
+        .. code-block:: python
+
+            from paddle.distributed.fleet.utils.fs import LocalFS
+
+            client = LocalFS()
+            subdirs, files = client.ls_dir("./")
+    """
+
     def ls_dir(self, fs_path):
+        """	
+        List directorys and files under `fs_path` .
+
+        Args:
+            fs_path(str): The local file path.
+
+        Returns:
+            Tuple: Return a 2-tuple, the first is a list of all its subdirectories, 
+            and the second is a list of all its subfiles, e.g. ([subdirname1, subdirname1, ...], [filename1, filename2, ...]).
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                subdirs, files = client.ls_dir("./")
+        """
         if not self.is_exist(fs_path):
             return [], []
 
@@ -132,11 +159,46 @@ class LocalFS(FS):
         return dirs, files
 
     def mkdirs(self, fs_path):
+        """
+        Create a remote HDFS directory.
+
+        Args:
+            fs_path(str): The local directory path.
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                client.mkdirs("test_mkdirs")
+                client.delete("test_mkdirs")
+        """
         assert not os.path.isfile(fs_path), "{} is already a file".format(
             fs_path)
         os.system("mkdir -p {}".format(fs_path))
 
     def rename(self, fs_src_path, fs_dst_path):
+        """
+        Rename the file.
+
+        Args:
+            fs_src_path(str): The actual name of the file or directory
+            fs_dst_path(str): The new name of the file or directory.
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                client.touch("test_rename_src")
+                print(client.is_exists("test_rename_src")) # True
+                client.rename("test_rename_src", "test_rename_dst")
+                print(client.is_exists("test_rename_src")) # False
+                print(client.is_exists("test_rename_dst")) # True
+                client.delete("test_rename_dst")
+        """
         os.rename(fs_src_path, fs_dst_path)
 
     def _rmr(self, fs_path):
@@ -146,6 +208,21 @@ class LocalFS(FS):
         os.remove(fs_path)
 
     def delete(self, fs_path):
+        """
+        Delete the local file path, whether it's a file or directory.
+
+        Args:
+            fs_path(str): The local file path.
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                client.mkdirs("test_localFS_mkdirs")
+                client.delete("test_localFS_mkdirs")
+        """
         if not self.is_exist(fs_path):
             return
 
@@ -158,15 +235,88 @@ class LocalFS(FS):
         return False
 
     def is_file(self, fs_path):
+        """
+        Whether the local file path is a file.
+
+        Args:
+            fs_path(str): The local file path.
+
+        Returns:
+            Bool: Return true if the path exists and it's a file, otherwise return false.
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                client.touch("test_is_file")
+                print(client.is_file("test_is_file")) # True
+                client.delete("test_is_file")
+        """
         return os.path.isfile(fs_path)
 
     def is_dir(self, fs_path):
+        """
+        Whether the local file path is a directory.
+
+        Args:
+            fs_path(str): The local file path.
+
+        Returns:
+            Bool: Return true if the path exists and it's a directory, otherwise return false.
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                client.mkdirs("test_is_dir")
+                print(client.is_dir("test_is_file")) # True
+                client.delete("test_is_dir")
+        """
         return os.path.isdir(fs_path)
 
     def is_exist(self, fs_path):
+        """
+        Whether the local file path exists.
+
+        Args:
+            fs_path(str): The local file path.
+
+        Returns:
+            Bool: Wheter it's a file or directory, return true if the path exists, 
+            otherwise return false.
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                ret = local_fs.is_exist("test_is_exist")
+        """
         return os.path.exists(fs_path)
 
     def touch(self, fs_path, exist_ok=True):
+        """
+        Create a local file.
+
+        Args:
+            fs_path(str): The local file path.
+            exist_ok(bool): When `fs_path` exists, if `exist_ok` is set false,
+            program will throw an Exception. Default is true.
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                client.touch("test_touch")
+                client.delete("test_touch")
+        """
         if self.is_exist(fs_path):
             if exist_ok:
                 return
@@ -175,6 +325,26 @@ class LocalFS(FS):
         return Path(fs_path).touch(exist_ok=True)
 
     def mv(self, src_path, dst_path, overwrite=False, test_exists=False):
+        """
+        Move a local file or directory from `src_path` to `dst_path` .
+
+        Args:
+            src_path(str):  Name of the file or directory, that's needed to be moved.
+            dst_path(str):  Name of the file or directory to which to move to.
+            overwrite(bool): Whether to re-write `dst_path` if that exists. Default is False.
+            test_exists(bool): Check the existence of `src_path` and `dst_path` . 
+            When `test_exists` is set true, if `src_path` doesn't exist or `dst_path` exists, program will throw an Excetption. 
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                client.touch("test_mv_src")
+                client.mv("test_mv_src", "test_mv_dst")
+                client.delete("test_mv_dst")
+        """
         if not self.is_exist(src_path):
             raise FSFileNotExistsError
 
@@ -188,7 +358,21 @@ class LocalFS(FS):
 
     def list_dirs(self, fs_path):
         """	
-        list directory under fs_path, and only give the pure name, not include the fs_path	
+        Only list directorys under `fs_path` .
+
+        Args:
+            fs_path(str): The local file path.
+
+        Returns:
+            List: A list of all its subdirectories, e.g. [subdirname1, subdirname1, ...].
+
+        Examples:
+            .. code-block:: python
+
+                from paddle.distributed.fleet.utils.fs import LocalFS
+
+                client = LocalFS()
+                subdirs = client.list_dirs("./")
         """
         if not self.is_exist(fs_path):
             return []
@@ -217,7 +401,7 @@ def _handle_errors(max_time_out=None):
             while True:
                 try:
                     return f(*args, **kwargs)
-                #important: only ExecuteError need to retry
+                # important: only ExecuteError need to retry
                 except ExecuteError as e:
                     if time.time() - start >= time_out:
                         raise FSTimeOut("args:{} timeout:{}".format(
@@ -236,12 +420,36 @@ def _handle_errors(max_time_out=None):
 
 
 class HDFSClient(FS):
+    """
+    A tool of HDFS.
+
+    Args:
+        hadoop_home(str): Hadoop home. 
+        configs(dict): Hadoop config. It is a dictionary and needs to contain the
+            keys: "fs.default.name" and "hadoop.job.ugi".
+
+    Examples:
+
+        .. code-block:: text
+
+            from paddle.distributed.fleet.utils.fs import HDFSClient
+            hadoop_home = "/home/client/hadoop-client/hadoop/"
+
+            configs = {
+                "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                "hadoop.job.ugi": "hello,hello123"
+            }
+
+            client = HDFSClient(hadoop_home, configs)
+            client.ls_dir("hdfs:/test_hdfs_client")
+    """
+
     def __init__(
             self,
             hadoop_home,
             configs,
-            time_out=5 * 60 * 1000,  #ms
-            sleep_inter=1000):  #ms
+            time_out=5 * 60 * 1000,  # ms
+            sleep_inter=1000):  # ms
         # Raise exception if JAVA_HOME not exists.
         java_home = os.environ["JAVA_HOME"]
 
@@ -272,6 +480,30 @@ class HDFSClient(FS):
 
     @_handle_errors()
     def list_dirs(self, fs_path):
+        """	
+        Only list directorys under `fs_path` .
+
+        Args:
+            fs_path(str): The HDFS file path.
+
+        Returns:
+            List: A list of all its subdirectories, e.g. [subdirname1, subdirname1, ...].
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                subdirs = client.list_dirs("hdfs:/test_hdfs_client")
+        """
         if not self.is_exist(fs_path):
             return []
 
@@ -281,7 +513,29 @@ class HDFSClient(FS):
     @_handle_errors()
     def ls_dir(self, fs_path):
         """	
-        list directory under fs_path, and only give the pure name, not include the fs_path	
+        List directorys and files under `fs_path` .
+
+        Args:
+            fs_path(str): The HDFS file path.
+
+        Returns:
+            Tuple: Return a 2-tuple, the first element is the list of all its subdirectories, 
+            and the second one is the list of all its subfiles, e.g. ([subdirname1, subdirname1, ...], [filename1, filename2, ...]).
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                subdirs, files = client.ls_dir("hdfs:/test_hdfs_client")
         """
         if not self.is_exist(fs_path):
             return [], []
@@ -320,6 +574,30 @@ class HDFSClient(FS):
 
     @_handle_errors()
     def is_dir(self, fs_path):
+        """
+        Whether the remote HDFS path is a directory.
+
+        Args:
+            fs_path(str): The HDFS file path.
+
+        Returns:
+            Bool: Return true if the path exists and it's a directory, otherwise return false.
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                ret = client.is_file("hdfs:/test_hdfs_client")
+        """
         if not self.is_exist(fs_path):
             return False
 
@@ -338,6 +616,30 @@ class HDFSClient(FS):
         return True
 
     def is_file(self, fs_path):
+        """
+        Whether the remote HDFS path is a file.
+
+        Args:
+            fs_path(str): The HDFS file path.
+
+        Returns:
+            Bool: Return true if the path exists and it's a file, otherwise return false.
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                ret = client.is_file("hdfs:/test_hdfs_client")
+        """
         if not self.is_exist(fs_path):
             return False
 
@@ -345,6 +647,31 @@ class HDFSClient(FS):
 
     @_handle_errors()
     def is_exist(self, fs_path):
+        """
+        Whether the remote HDFS path exists.
+
+        Args:
+            fs_path(str): The hdfs file path.
+
+        Returns:
+            Bool: Whether it's is file or directory, return true if the path exists,
+            otherwise return false.
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                ret = client.is_exist("hdfs:/test_hdfs_client")
+        """
         cmd = "ls {} ".format(fs_path)
         ret, out = self._run_cmd(cmd, redirect_stderr=True)
         if ret != 0:
@@ -357,6 +684,28 @@ class HDFSClient(FS):
 
     # can't retry
     def upload(self, local_path, fs_path):
+        """
+        Upload the local path to remote HDFS.
+
+        Args:
+            local_path(str): The local path.
+            fs_path(str): The HDFS path.
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                client.upload("test_hdfs_client", "hdfs:/test_hdfs_client")
+        """
         if self.is_exist(fs_path):
             raise FSFileExistsError("{} exists".format(fs_path))
 
@@ -380,6 +729,28 @@ class HDFSClient(FS):
 
     # can't retry
     def download(self, fs_path, local_path):
+        """
+        Download remote HDFS path to the local.
+
+        Args:
+            fs_path(str):  The HDFS path.
+            local_path(str): The local path.
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                client.download("hdfs:/test_hdfs_client", "./")
+        """
         if self.is_exist(local_path):
             raise FSFileExistsError("{} exists".format(local_path))
 
@@ -403,6 +774,27 @@ class HDFSClient(FS):
 
     @_handle_errors()
     def mkdirs(self, fs_path):
+        """
+        Create a remote HDFS directory.
+
+        Args:
+            fs_path(str): The HDFS directory path.
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                client.mkdirs("hdfs:/test_hdfs_client")
+        """
         if self.is_exist(fs_path):
             return
 
@@ -425,6 +817,30 @@ class HDFSClient(FS):
                 raise ExecuteError(cmd)
 
     def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True):
+        """
+        Move a remote HDFS file or directory from `fs_src_path` to `fs_dst_path` .
+
+        Args:
+            fs_src_path(str):  Name of the file or directory, that's needed to be moved.
+            fs_dst_path(str):  Name of the file or directory to which to move to.
+            overwrite(bool): Whether to re-write `fs_dst_path` if that exists. Default is False.
+            test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Excetption. 
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                client.mv("hdfs:/test_hdfs_client", "hdfs:/test_hdfs_client2")
+        """
         if overwrite and self.is_exist(fs_dst_path):
             self.delete(fs_dst_path)
 
@@ -467,6 +883,27 @@ class HDFSClient(FS):
 
     @_handle_errors()
     def delete(self, fs_path):
+        """
+        Delete a remote HDFS path, whether it's a file or directory.
+
+        Args:
+            fs_path(str): The HDFS file path.
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                client.delete("hdfs:/test_hdfs_client")
+        """
         if not self.is_exist(fs_path):
             return
 
@@ -477,6 +914,27 @@ class HDFSClient(FS):
         return self._rm(fs_path)
 
     def touch(self, fs_path, exist_ok=True):
+        """
+        Create a remote HDFS file.
+
+        Args:
+            fs_path(str): The HDFS file path.
+
+        Examples:
+
+            .. code-block:: text
+
+                from paddle.distributed.fleet.utils.fs import HDFSClient
+
+                hadoop_home = "/home/client/hadoop-client/hadoop/"
+                configs = {
+                    "fs.default.name": "hdfs://xxx.hadoop.com:54310",
+                    "hadoop.job.ugi": "hello,hello123"
+                }
+
+                client = HDFSClient(hadoop_home, configs)
+                client.touch("hdfs:/test_hdfs_client")
+        """
         if self.is_exist(fs_path):
             if exist_ok:
                 return
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 9f748b7956f9faa6b1c948d87f0ef4659057a421..e8cc6ce99016075a950f13d9e23f2957c9686471 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -197,6 +197,7 @@ def __bootstrap__():
         'free_when_no_cache_hit',
         'call_stack_level',
         'sort_sum_gradient',
+        'max_inplace_grad_add',
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index d51cacd1a5cad53ef77b325e5380100c537e057e..478fecf74e4013e0d695c68af86a0e39a4a4e845 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -251,12 +251,19 @@ def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
         begin_idx = 0
     if end_idx is None:
         end_idx = len(op_descs)
-    for i in range(begin_idx, end_idx):
-        op_desc = op_descs[i]
-        if isinstance(op_desc, tuple):
-            op_desc = op_desc[0]
-        op_desc._rename_input(old_name, new_name)
-        op_desc._rename_output(old_name, new_name)
+    if isinstance(op_descs, (list, tuple)):
+        for i in range(begin_idx, end_idx):
+            op_desc = op_descs[i]
+            if isinstance(op_desc, tuple):
+                op_desc = op_desc[0]
+            op_desc._rename_input(old_name, new_name)
+            op_desc._rename_output(old_name, new_name)
+    if isinstance(op_descs, collections.OrderedDict):
+        for key, value in op_descs.items():
+            if isinstance(value, (list, tuple)):
+                for op_desc in value:
+                    op_desc._rename_input(old_name, new_name)
+                    op_desc._rename_output(old_name, new_name)
 
 
 def _create_op_desc_(op_type, inputs, outputs, attrs):
@@ -369,6 +376,41 @@ def _append_grad_suffix_(name):
     return cpt.to_text(name) + core.grad_var_suffix()
 
 
+def _accumulate_gradients_by_sum_op_(var_name, renamed_vars, pending_sum_ops,
+                                     op_idx):
+    """
+    Use sum op to accumulate_gradients, the gradients are stored in renamed_vars.
+    """
+    if op_idx not in pending_sum_ops.keys():
+        pending_sum_ops[op_idx] = []
+    pending_sum_ops[op_idx].append(
+        _create_op_desc_("sum", {"X": renamed_vars[var_name]},
+                         {"Out": [var_name]}, {"use_mkldnn": False}))
+    renamed_vars[var_name] = [var_name]
+
+
+def _accumulate_gradients_by_add_ops_(var_name, renamed_vars, pending_sum_ops,
+                                      op_idx):
+    """
+    Use several inplace add op to accumulate_gradients, the gradients are stored in renamed_vars.
+    """
+    if op_idx not in pending_sum_ops.keys():
+        pending_sum_ops[op_idx] = []
+    out_name = renamed_vars[var_name][0]
+    for i in range(1, len(renamed_vars[var_name])):
+        x_name = out_name
+        y_name = renamed_vars[var_name][i]
+        if i != len(renamed_vars[var_name]) - 1:
+            out_name = var_name + '@ADD@' + str(i)
+        else:
+            out_name = var_name
+        pending_sum_ops[op_idx].append(
+            _create_op_desc_("grad_add", {"X": [x_name],
+                                          "Y": [y_name]}, {"Out": [out_name]},
+                             {"use_mkldnn": False}))
+    renamed_vars[var_name] = [var_name]
+
+
 def _addup_repetitive_outputs_(op_descs, block_idx):
     """
     In backward part, an variable may be the output of more than one ops.
@@ -376,7 +418,9 @@ def _addup_repetitive_outputs_(op_descs, block_idx):
     In these cases, the variable should be the accumulation of all the outputs.
     `sum_op`s are added to implement the accumulate.
     """
-    pending_sum_ops = []
+    _MAX_ADD_NUM_ = core.globals()['FLAGS_max_inplace_grad_add']
+    #pending_sum_ops = []
+    pending_sum_ops = collections.OrderedDict()
     var_rename_count = collections.defaultdict(int)
     renamed_vars = collections.defaultdict(list)
     renamed_var_start_idx = collections.defaultdict(list)
@@ -385,10 +429,13 @@ def _addup_repetitive_outputs_(op_descs, block_idx):
             if "@GRAD" not in var_name:
                 continue
             if len(renamed_vars[var_name]) > 1:
-                pending_sum_ops.append((_create_op_desc_(
-                    "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]},
-                    {"use_mkldnn": False}), idx))
-                renamed_vars[var_name] = [var_name]
+                if len(renamed_vars[var_name]) > _MAX_ADD_NUM_:
+                    _accumulate_gradients_by_sum_op_(var_name, renamed_vars,
+                                                     pending_sum_ops, idx)
+                else:
+                    _accumulate_gradients_by_add_ops_(var_name, renamed_vars,
+                                                      pending_sum_ops, idx)
+
         for param_idx, param_name in enumerate(op_desc.output_names()):
             arg_names = op_desc.output(param_name)
             for arg_idx, var_name in enumerate(arg_names):
@@ -440,13 +487,26 @@ def _addup_repetitive_outputs_(op_descs, block_idx):
                     renamed_vars[var_name].append(new_name)
 
     for var_name, inputs in six.iteritems(renamed_vars):
-        if len(inputs) > 1:
-            pending_sum_ops.append(
-                (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
-                                  {"use_mkldnn": False}), len(op_descs)))
+        if len(renamed_vars[var_name]) > 1:
+            if len(renamed_vars[var_name]) > _MAX_ADD_NUM_:
+                _accumulate_gradients_by_sum_op_(var_name, renamed_vars,
+                                                 pending_sum_ops, len(op_descs))
+            else:
+                _accumulate_gradients_by_add_ops_(var_name, renamed_vars,
+                                                  pending_sum_ops,
+                                                  len(op_descs))
+
     # sum_op descs are sorted according to their insert position
-    for p in reversed(pending_sum_ops):
-        op_descs.insert(p[1], p[0])
+    for key, value in collections.OrderedDict(
+            reversed(list(pending_sum_ops.items()))).items():
+
+        # NOTE(zhiqiu): Since reversed, the idx of op_descs to be inserted will remains correct.
+        # For example, [0, 1, 2], and we want to insert 'a' at idx 1, 'b' at idx 2, and the expected result is [0, 1, 'a', 2, 'b'].
+        # If reversed, we first insert 'b' at idx 2, it becomes [0, 1, 2, 'b'], and then insert 'a' at idx 1, it becomes [0, 1, 'a', 2, 'b'].
+        # If not reverse, we first insert 'a' at idx 1, it becomes [0, 1, 'a', 2], and then insert 'b' at idx 2, it becomes [0, 1, 'a', 'b', 2].
+        idx = key
+        for i, op in enumerate(value):
+            op_descs.insert(idx + i, op)
 
     return op_descs
 
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
index 8d399c929018f08eb3d02e50981566705536bbf5..8d7ebcf4caa53929c5dd97159e63cf3cd02f5636 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -67,6 +67,7 @@ class ImperativeQuantAware(object):
         Examples:
         .. code-block:: python
 
+            import paddle
             from paddle.fluid.contrib.slim.quantization \
                 import ImperativeQuantAware
             from paddle.vision.models \
@@ -86,20 +87,24 @@ class ImperativeQuantAware(object):
             # ...
             
             # Save quant model for the inference.
-            imperative_qat.save_quantized_model(
-                dirname="./resnet50_qat",
-                model=model,
-                input_shape=[(3, 224, 224)],
-                input_dtype=['float32'],
-                feed=[0],
-                fetch=[0])
+            paddle.jit.save(
+                layer=model,
+                model_path="./resnet50_qat",
+                input_spec=[
+                    paddle.static.InputSpec(
+                    shape=[None, 3, 224, 224], dtype='float32')])
         """
         super(ImperativeQuantAware, self).__init__()
         self._weight_bits = weight_bits
         self._activation_bits = activation_bits
         self._moving_rate = moving_rate
 
-        quant_type = {'abs_max', 'moving_average_abs_max'}
+        quant_type = {
+            'abs_max', 'moving_average_abs_max', 'channel_wise_abs_max'
+        }
+
+        assert activation_quantize_type != 'channel_wise_abs_max', \
+            "The activation quantization type does not support 'channel_wise_abs_max'."
         if activation_quantize_type not in quant_type:
             raise ValueError(
                 "Unknown activation_quantize_type : '%s'. It can only be "
@@ -108,8 +113,8 @@ class ImperativeQuantAware(object):
         if weight_quantize_type not in quant_type:
             raise ValueError(
                 "Unknown weight_quantize_type: '%s'. It can only be "
-                "'abs_max' or 'moving_average_abs_max' now." %
-                (str(weight_quantize_type)))
+                "'abs_max' or 'moving_average_abs_max' or 'channel_wise_abs_max' now."
+                % (str(weight_quantize_type)))
         self._activation_quantize_type = activation_quantize_type
         self._weight_quantize_type = weight_quantize_type
 
@@ -148,75 +153,6 @@ class ImperativeQuantAware(object):
             quant_layer = self._get_quantized_counterpart(layer)
             setattr(obj, target, quant_layer)
 
-    def save_quantized_model(self,
-                             dirname,
-                             model,
-                             input_shape,
-                             input_dtype,
-                             feed,
-                             fetch,
-                             append_batch_size=True):
-        """
-        Save the quantized model for the inference.
-
-        Args:
-            dirname (str): the directory to save the quantized model.
-            model(fluid.dygraph.Layer): the quantized model to be saved.
-            input_shape(list[tuple(int)]): The shape value for each input,
-                e.g. [(3, 224, 224)].
-            input_dtype(list[str]): The dtype value for each input,
-                e.g. ['float32'].
-            feed(list[int]): the indices of the input variables of the
-                imperative functions which will be saved as input variables in
-                inference model.
-            fetch(list[int]): the indices of the returned variable of the
-                imperative functions which will be saved as output variables in
-                inference model.
-            append_batch_size(bool, optional):
-                If true, it prepends an extra axis to the input_shape, meanwhile,
-                the input_shape shouldn't contain the batch size dimension.
-                Otherwise, it just uses the input_shape. Default True.
-        Returns:
-            None
-        """
-        assert isinstance(
-            input_shape, list), "The parameter `input_shape` shoubld be a list."
-        assert isinstance(
-            input_dtype, list), "The parameter `input_dtype` shoubld be a list."
-        assert isinstance(feed, list), "The parameter `feed` shoubld be a list."
-        assert isinstance(fetch,
-                          list), "The parameter `fetch` shoubld be a list."
-        assert len(input_shape) == len(
-            input_dtype
-        ), "The length of input_shape should be equal to  input_dtype's."
-        assert len(input_dtype) == len(
-            feed), "The length of input_shape should be equal to  feed's."
-
-        with dygraph.guard():
-            model.eval()
-            input_vars = []
-            for i, (shape, dtype) in enumerate(zip(input_shape, input_dtype)):
-                if append_batch_size:
-                    shape = [None] + list(shape)
-                # Note(Aurelius84): need a elegant way to name this.
-                in_spec = paddle.static.InputSpec(shape, dtype, 'feed_%d' % i)
-                input_vars.append(in_spec)
-            # use `declarative` to convert dygraph into static program
-            model.forward = dygraph.jit.declarative(
-                model.forward, input_spec=input_vars)
-            outputs = model.forward.concrete_program.outputs
-        input_spec = [input_vars[i] for i in feed]
-        configs = dygraph.jit.SaveLoadConfig()
-        configs.separate_params = True
-        if not isinstance(outputs, (tuple, list)):
-            outputs = [outputs]
-        configs.output_spec = [outputs[i] for i in fetch]
-        dygraph.jit.save(
-            layer=model,
-            model_path=dirname,
-            input_spec=input_spec,
-            configs=configs)
-
     def _get_quantized_counterpart(self, layer):
         quant_layers = tuple(self._quant_layers_map.values())
         quantized_counterpart = tuple('Quantized' + k
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py
index e22c980b0a7c6030c5d6a2fbc4fd58d2ec66958a..2e35ac288c7158a220e3b96babb146e28d50a5ee 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py
@@ -24,7 +24,7 @@ from paddle.fluid.data_feeder import check_variable_and_dtype
 
 __all__ = [
     'FakeQuantMovingAverage', 'FakeQuantAbsMax', 'QuantizedConv2D',
-    'QuantizedLinear'
+    'QuantizedLinear', 'FakeChannelWiseQuantDequantAbsMax'
 ]
 
 
@@ -209,6 +209,89 @@ class FakeQuantAbsMax(layers.Layer):
         return quant_out
 
 
+class FakeChannelWiseQuantDequantAbsMax(layers.Layer):
+    def __init__(self,
+                 name=None,
+                 channel_num=None,
+                 quant_bits=8,
+                 quant_axis=0,
+                 dtype='float32',
+                 quant_on_weight=False):
+        assert quant_on_weight == True, "Channel_wise only can be used on weight quantization."
+        super(FakeChannelWiseQuantDequantAbsMax, self).__init__()
+        self._quant_bits = quant_bits
+        self._quant_axis = quant_axis
+        self._dtype = dtype
+        self._name = name
+        self._channel_num = channel_num
+        scale_prefix = "{}.scale".format(
+            name) if name else 'quant_dequant.scale'
+        self._scale_name = unique_name.generate(scale_prefix)
+        if quant_on_weight:
+            scale_attr = ParamAttr(
+                name=self._scale_name,
+                initializer=Constant(0.0),
+                trainable=False)
+            self._scale = self.create_parameter(
+                shape=[self._channel_num], attr=scale_attr, dtype=self._dtype)
+            self._scale.stop_gradient = True
+        else:
+            self._scale = None
+
+    def forward(self, input):
+        if in_dygraph_mode():
+            attrs = ('bit_length', self._quant_bits, 'quant_axis',
+                     self._quant_axis)
+            quant_out = _varbase_creator(
+                type=input.type,
+                name="{}.quantized.dequantized".format(input.name),
+                shape=input.shape,
+                dtype=input.dtype,
+                persistable=False)
+
+            out_scale = self._scale
+            if out_scale is None:
+                out_scale = _varbase_creator(
+                    type=core.VarDesc.VarType.LOD_TENSOR,
+                    name=self._scale_name,
+                    shape=[self._channel_num],
+                    dtype=self._dtype,
+                    persistable=False)
+                out_scale.stop_gradient = True
+
+            out, _, = core.ops.fake_channel_wise_quantize_dequantize_abs_max(
+                input, quant_out, out_scale, *attrs)
+            return out
+
+        check_variable_and_dtype(input, 'input', ['float32'],
+                                 "FakeChannelWiseQuantDequantAbsMax")
+        attrs = {'bit_length': self._quant_bits, 'quant_axis': self._quant_axis}
+        inputs = {"X": [input]}
+        quant_out = self._helper.create_variable(
+            name="{}.quantized.dequantized".format(input.name),
+            dtype=input.dtype,
+            type=core.VarDesc.VarType.LOD_TENSOR,
+            persistable=False,
+            stop_gradient=False)
+        out_scale = self._scale
+        if not out_scale:
+            out_scale = self._helper.create_variable(
+                name=self._scale_name,
+                dtype=self._dtype,
+                type=core.VarDesc.VarType.LOD_TENSOR,
+                persistable=False,
+                stop_gradient=True)
+        outputs = {"Out": [quant_out], "OutScale": [out_scale]}
+
+        self._helper.append_op(
+            type="fake_channel_wise_quantize_dequantize_abs_max",
+            inputs=inputs,
+            outputs=outputs,
+            attrs=attrs)
+
+        return quant_out
+
+
 def _get_fake_quant_type(quant_type, **kwargs):
     call_args = {
         "name": kwargs.get("name", None),
@@ -220,10 +303,17 @@ def _get_fake_quant_type(quant_type, **kwargs):
         call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False)
     elif quant_type == 'moving_average_abs_max':
         call_args["moving_rate"] = kwargs.get("moving_rate", 0.9)
-
+    elif quant_type == 'channel_wise_abs_max':
+        call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False)
+        call_args["channel_num"] = kwargs.get("channel_num", None)
+        call_args["quant_axis"] = kwargs.get("quant_axis", 0)
+        assert call_args["channel_num"] is not None, (
+            "You need to input channel_num"
+            "when you use channel_wise_abs_max strategy.")
     fake_quant_map = {
         'abs_max': FakeQuantAbsMax,
-        'moving_average_abs_max': FakeQuantMovingAverage
+        'moving_average_abs_max': FakeQuantMovingAverage,
+        'channel_wise_abs_max': FakeChannelWiseQuantDequantAbsMax
     }
 
     return fake_quant_map[quant_type](**call_args)
@@ -255,19 +345,23 @@ class QuantizedConv2D(layers.Layer):
         self.weight = getattr(layer, 'weight')
         self.bias = getattr(layer, 'bias')
         # For FakeQuant
+        self._conv2d_quant_axis = 0
         self._fake_quant_weight = _get_fake_quant_type(
             weight_quantize_type,
             name=self.weight.name,
             moving_rate=moving_rate,
             quant_bits=weight_bits,
             dtype=self._dtype,
-            quant_on_weight=True)
+            quant_on_weight=True,
+            channel_num=self.weight.shape[self._conv2d_quant_axis],
+            quant_axis=self._conv2d_quant_axis)
         self._fake_quant_input = _get_fake_quant_type(
             activation_quantize_type,
             name=layer.full_name(),
             moving_rate=moving_rate,
             quant_bits=activation_bits,
-            dtype=self._dtype)
+            dtype=self._dtype,
+            quant_on_weight=False)
 
     def forward(self, input):
         quant_input = self._fake_quant_input(input)
@@ -341,19 +435,23 @@ class QuantizedLinear(layers.Layer):
         self.weight = getattr(layer, 'weight')
         self.bias = getattr(layer, 'bias')
         # For FakeQuant
+        self._linear_quant_axis = 1
         self._fake_quant_weight = _get_fake_quant_type(
             weight_quantize_type,
             name=self.weight.name,
             moving_rate=moving_rate,
             quant_bits=weight_bits,
             dtype=self._dtype,
-            quant_on_weight=True)
+            quant_on_weight=True,
+            channel_num=self.weight.shape[self._linear_quant_axis],
+            quant_axis=self._linear_quant_axis)
         self._fake_quant_input = _get_fake_quant_type(
             activation_quantize_type,
             name=layer.full_name(),
             moving_rate=moving_rate,
             quant_bits=activation_bits,
-            dtype=self._dtype)
+            dtype=self._dtype,
+            quant_on_weight=False)
 
     def forward(self, input):
         quant_input = self._fake_quant_input(input)
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
index 79b0bbd6a4dd3850f49aa0b5124e9be86d4e6ee3..0d047a0cd3be3c039e054512827c7addf549f63e 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
@@ -181,7 +181,6 @@ class TestImperativeQat(unittest.TestCase):
 
                     img = fluid.dygraph.to_variable(x_data)
                     label = fluid.dygraph.to_variable(y_data)
-
                     out = lenet(img)
                     acc = fluid.layers.accuracy(out, label)
                     loss = fluid.layers.cross_entropy(out, label)
@@ -221,7 +220,7 @@ class TestImperativeQat(unittest.TestCase):
             model_dict = lenet.state_dict()
             fluid.save_dygraph(model_dict, "save_temp")
 
-            # test the correctness of `save_quantized_model`
+            # test the correctness of `paddle.jit.save`
             data = next(test_reader())
             test_data = np.array([x[0].reshape(1, 28, 28)
                                   for x in data]).astype('float32')
@@ -231,13 +230,14 @@ class TestImperativeQat(unittest.TestCase):
 
         # save inference quantized model
         path = "./mnist_infer_model"
-        imperative_qat.save_quantized_model(
-            dirname=path,
-            model=lenet,
-            input_shape=[(1, 28, 28)],
-            input_dtype=['float32'],
-            feed=[0],
-            fetch=[0])
+        paddle.jit.save(
+            layer=lenet,
+            model_path=path,
+            input_spec=[
+                paddle.static.InputSpec(
+                    shape=[None, 1, 28, 28], dtype='float32')
+            ])
+
         if core.is_compiled_with_cuda():
             place = core.CUDAPlace(0)
         else:
@@ -245,7 +245,10 @@ class TestImperativeQat(unittest.TestCase):
         exe = fluid.Executor(place)
         [inference_program, feed_target_names, fetch_targets] = (
             fluid.io.load_inference_model(
-                dirname=path, executor=exe))
+                dirname=path,
+                executor=exe,
+                model_filename="__model__",
+                params_filename="__variables__"))
         after_save, = exe.run(inference_program,
                               feed={feed_target_names[0]: test_data},
                               fetch_list=fetch_targets)
@@ -332,13 +335,13 @@ class TestImperativeQat(unittest.TestCase):
                 if batch_id % 100 == 0:
                     _logger.info('{}: {}'.format('loss', avg_loss.numpy()))
 
-        imperative_qat.save_quantized_model(
-            dirname="./dynamic_mnist",
-            model=lenet,
-            input_shape=[(1, 28, 28)],
-            input_dtype=['float32'],
-            feed=[0],
-            fetch=[0])
+        paddle.jit.save(
+            layer=lenet,
+            model_path="./dynamic_mnist",
+            input_spec=[
+                paddle.static.InputSpec(
+                    shape=[None, 1, 28, 28], dtype='float32')
+            ])
 
         # static graph train
         _logger.info(
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py
new file mode 100644
index 0000000000000000000000000000000000000000..17c613281a8c48fd3244601ab2d87a91bb90fa97
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py
@@ -0,0 +1,428 @@
+#   copyright (c) 2018 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+
+from __future__ import print_function
+
+import os
+import numpy as np
+import random
+import unittest
+import logging
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid.optimizer import AdamOptimizer
+from paddle.fluid.framework import IrGraph
+from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware
+from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
+from paddle.fluid.dygraph.container import Sequential
+from paddle.fluid.dygraph.nn import Conv2D
+from paddle.fluid.dygraph.nn import Pool2D
+from paddle.fluid.dygraph.nn import Linear
+from paddle.fluid.log_helper import get_logger
+
+os.environ["CPU_NUM"] = "1"
+if core.is_compiled_with_cuda():
+    fluid.set_flags({"FLAGS_cudnn_deterministic": True})
+
+_logger = get_logger(
+    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+
+
+def StaticLenet(data, num_classes=10, classifier_activation='softmax'):
+    conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1")
+    conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2")
+    fc_w1_attr = fluid.ParamAttr(name="fc_w_1")
+    fc_w2_attr = fluid.ParamAttr(name="fc_w_2")
+    fc_w3_attr = fluid.ParamAttr(name="fc_w_3")
+    conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1")
+    conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2")
+    fc_b1_attr = fluid.ParamAttr(name="fc_b_1")
+    fc_b2_attr = fluid.ParamAttr(name="fc_b_2")
+    fc_b3_attr = fluid.ParamAttr(name="fc_b_3")
+    conv1 = fluid.layers.conv2d(
+        data,
+        num_filters=6,
+        filter_size=3,
+        stride=1,
+        padding=1,
+        param_attr=conv2d_w1_attr,
+        bias_attr=conv2d_b1_attr)
+    pool1 = fluid.layers.pool2d(
+        conv1, pool_size=2, pool_type='max', pool_stride=2)
+    conv2 = fluid.layers.conv2d(
+        pool1,
+        num_filters=16,
+        filter_size=5,
+        stride=1,
+        padding=0,
+        param_attr=conv2d_w2_attr,
+        bias_attr=conv2d_b2_attr)
+    pool2 = fluid.layers.pool2d(
+        conv2, pool_size=2, pool_type='max', pool_stride=2)
+
+    fc1 = fluid.layers.fc(input=pool2,
+                          size=120,
+                          param_attr=fc_w1_attr,
+                          bias_attr=fc_b1_attr)
+    fc2 = fluid.layers.fc(input=fc1,
+                          size=84,
+                          param_attr=fc_w2_attr,
+                          bias_attr=fc_b2_attr)
+    fc3 = fluid.layers.fc(input=fc2,
+                          size=num_classes,
+                          act=classifier_activation,
+                          param_attr=fc_w3_attr,
+                          bias_attr=fc_b3_attr)
+
+    return fc3
+
+
+class ImperativeLenet(fluid.dygraph.Layer):
+    def __init__(self, num_classes=10, classifier_activation='softmax'):
+        super(ImperativeLenet, self).__init__()
+        conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1")
+        conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2")
+        fc_w1_attr = fluid.ParamAttr(name="fc_w_1")
+        fc_w2_attr = fluid.ParamAttr(name="fc_w_2")
+        fc_w3_attr = fluid.ParamAttr(name="fc_w_3")
+        conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1")
+        conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2")
+        fc_b1_attr = fluid.ParamAttr(name="fc_b_1")
+        fc_b2_attr = fluid.ParamAttr(name="fc_b_2")
+        fc_b3_attr = fluid.ParamAttr(name="fc_b_3")
+        self.features = Sequential(
+            Conv2D(
+                num_channels=1,
+                num_filters=6,
+                filter_size=3,
+                stride=1,
+                padding=1,
+                param_attr=conv2d_w1_attr,
+                bias_attr=conv2d_b1_attr),
+            Pool2D(
+                pool_size=2, pool_type='max', pool_stride=2),
+            Conv2D(
+                num_channels=6,
+                num_filters=16,
+                filter_size=5,
+                stride=1,
+                padding=0,
+                param_attr=conv2d_w2_attr,
+                bias_attr=conv2d_b2_attr),
+            Pool2D(
+                pool_size=2, pool_type='max', pool_stride=2))
+
+        self.fc = Sequential(
+            Linear(
+                input_dim=400,
+                output_dim=120,
+                param_attr=fc_w1_attr,
+                bias_attr=fc_b1_attr),
+            Linear(
+                input_dim=120,
+                output_dim=84,
+                param_attr=fc_w2_attr,
+                bias_attr=fc_b2_attr),
+            Linear(
+                input_dim=84,
+                output_dim=num_classes,
+                act=classifier_activation,
+                param_attr=fc_w3_attr,
+                bias_attr=fc_b3_attr))
+
+    def forward(self, inputs):
+        x = self.features(inputs)
+
+        x = fluid.layers.flatten(x, 1)
+        x = self.fc(x)
+        return x
+
+
+class TestImperativeQat(unittest.TestCase):
+    """
+    QAT = quantization-aware training
+    """
+
+    def test_qat_save(self):
+        imperative_qat = ImperativeQuantAware(
+            weight_quantize_type='channel_wise_abs_max',
+            activation_quantize_type='moving_average_abs_max')
+
+        with fluid.dygraph.guard():
+            lenet = ImperativeLenet()
+            imperative_qat.quantize(lenet)
+            adam = AdamOptimizer(
+                learning_rate=0.001, parameter_list=lenet.parameters())
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.train(), batch_size=32, drop_last=True)
+            test_reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=32)
+
+            epoch_num = 1
+            for epoch in range(epoch_num):
+                lenet.train()
+                for batch_id, data in enumerate(train_reader()):
+                    x_data = np.array([x[0].reshape(1, 28, 28)
+                                       for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(-1, 1)
+
+                    img = fluid.dygraph.to_variable(x_data)
+                    label = fluid.dygraph.to_variable(y_data)
+                    out = lenet(img)
+                    acc = fluid.layers.accuracy(out, label)
+                    loss = fluid.layers.cross_entropy(out, label)
+                    avg_loss = fluid.layers.mean(loss)
+                    avg_loss.backward()
+                    adam.minimize(avg_loss)
+                    lenet.clear_gradients()
+                    if batch_id % 100 == 0:
+                        _logger.info(
+                            "Train | At epoch {} step {}: loss = {:}, acc= {:}".
+                            format(epoch, batch_id,
+                                   avg_loss.numpy(), acc.numpy()))
+
+                lenet.eval()
+                for batch_id, data in enumerate(test_reader()):
+                    x_data = np.array([x[0].reshape(1, 28, 28)
+                                       for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(-1, 1)
+
+                    img = fluid.dygraph.to_variable(x_data)
+                    label = fluid.dygraph.to_variable(y_data)
+
+                    out = lenet(img)
+                    acc_top1 = fluid.layers.accuracy(
+                        input=out, label=label, k=1)
+                    acc_top5 = fluid.layers.accuracy(
+                        input=out, label=label, k=5)
+
+                    if batch_id % 100 == 0:
+                        _logger.info(
+                            "Test | At epoch {} step {}: acc1 = {:}, acc5 = {:}".
+                            format(epoch, batch_id,
+                                   acc_top1.numpy(), acc_top5.numpy()))
+
+            # save weights
+            model_dict = lenet.state_dict()
+            fluid.save_dygraph(model_dict, "save_temp")
+
+            # test the correctness of `paddle.jit.save`
+            data = next(test_reader())
+            test_data = np.array([x[0].reshape(1, 28, 28)
+                                  for x in data]).astype('float32')
+            test_img = fluid.dygraph.to_variable(test_data)
+            lenet.eval()
+            before_save = lenet(test_img)
+
+        # save inference quantized model
+        path = "./mnist_infer_model"
+        paddle.jit.save(
+            layer=lenet,
+            model_path=path,
+            input_spec=[
+                paddle.static.InputSpec(
+                    shape=[None, 1, 28, 28], dtype='float32')
+            ])
+
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+        exe = fluid.Executor(place)
+        [inference_program, feed_target_names, fetch_targets] = (
+            fluid.io.load_inference_model(
+                dirname=path,
+                executor=exe,
+                model_filename="__model__",
+                params_filename="__variables__"))
+        after_save, = exe.run(inference_program,
+                              feed={feed_target_names[0]: test_data},
+                              fetch_list=fetch_targets)
+
+        self.assertTrue(
+            np.allclose(after_save, before_save.numpy()),
+            msg='Failed to save the inference quantized model.')
+
+    def test_qat_acc(self):
+        def _build_static_lenet(main, startup, is_test=False, seed=1000):
+            with fluid.unique_name.guard():
+                with fluid.program_guard(main, startup):
+                    main.random_seed = seed
+                    startup.random_seed = seed
+                    img = fluid.layers.data(
+                        name='image', shape=[1, 28, 28], dtype='float32')
+                    label = fluid.layers.data(
+                        name='label', shape=[1], dtype='int64')
+                    prediction = StaticLenet(img)
+                    if not is_test:
+                        loss = fluid.layers.cross_entropy(
+                            input=prediction, label=label)
+                        avg_loss = fluid.layers.mean(loss)
+                    else:
+                        avg_loss = prediction
+            return img, label, avg_loss
+
+        reader = paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=32, drop_last=True)
+        weight_quantize_type = 'channel_wise_abs_max'
+        activation_quant_type = 'moving_average_abs_max'
+        param_init_map = {}
+        seed = 1000
+        lr = 0.1
+
+        # imperative train
+        _logger.info(
+            "--------------------------dynamic graph qat--------------------------"
+        )
+        imperative_qat = ImperativeQuantAware(
+            weight_quantize_type=weight_quantize_type,
+            activation_quantize_type=activation_quant_type)
+
+        with fluid.dygraph.guard():
+            np.random.seed(seed)
+            fluid.default_main_program().random_seed = seed
+            fluid.default_startup_program().random_seed = seed
+            lenet = ImperativeLenet()
+            fixed_state = {}
+            for name, param in lenet.named_parameters():
+                p_shape = param.numpy().shape
+                p_value = param.numpy()
+                if name.endswith("bias"):
+                    value = np.zeros_like(p_value).astype('float32')
+                else:
+                    value = np.random.normal(
+                        loc=0.0, scale=0.01, size=np.product(p_shape)).reshape(
+                            p_shape).astype('float32')
+                fixed_state[name] = value
+                param_init_map[param.name] = value
+            lenet.set_dict(fixed_state)
+
+            imperative_qat.quantize(lenet)
+            adam = AdamOptimizer(
+                learning_rate=lr, parameter_list=lenet.parameters())
+            dynamic_loss_rec = []
+            lenet.train()
+            for batch_id, data in enumerate(reader()):
+                x_data = np.array([x[0].reshape(1, 28, 28)
+                                   for x in data]).astype('float32')
+                y_data = np.array(
+                    [x[1] for x in data]).astype('int64').reshape(-1, 1)
+
+                img = fluid.dygraph.to_variable(x_data)
+                label = fluid.dygraph.to_variable(y_data)
+
+                out = lenet(img)
+                loss = fluid.layers.cross_entropy(out, label)
+                avg_loss = fluid.layers.mean(loss)
+                avg_loss.backward()
+                adam.minimize(avg_loss)
+                lenet.clear_gradients()
+                dynamic_loss_rec.append(avg_loss.numpy()[0])
+                if batch_id % 100 == 0:
+                    _logger.info('{}: {}'.format('loss', avg_loss.numpy()))
+
+        paddle.jit.save(
+            layer=lenet,
+            model_path="./dynamic_mnist",
+            input_spec=[
+                paddle.static.InputSpec(
+                    shape=[None, 1, 28, 28], dtype='float32')
+            ])
+
+        # static graph train
+        _logger.info(
+            "--------------------------static graph qat--------------------------"
+        )
+        static_loss_rec = []
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+        exe = fluid.Executor(place)
+
+        main = fluid.Program()
+        infer = fluid.Program()
+        startup = fluid.Program()
+        static_img, static_label, static_loss = _build_static_lenet(
+            main, startup, False, seed)
+        infer_img, _, infer_pre = _build_static_lenet(infer, startup, True,
+                                                      seed)
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, startup):
+                opt = AdamOptimizer(learning_rate=lr)
+                opt.minimize(static_loss)
+
+        scope = core.Scope()
+        with fluid.scope_guard(scope):
+            exe.run(startup)
+        for param in main.all_parameters():
+            param_tensor = scope.var(param.name).get_tensor()
+            param_tensor.set(param_init_map[param.name], place)
+
+        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
+        infer_graph = IrGraph(core.Graph(infer.desc), for_test=True)
+        transform_pass = QuantizationTransformPass(
+            scope=scope,
+            place=place,
+            activation_quantize_type=activation_quant_type,
+            weight_quantize_type=weight_quantize_type,
+            quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul'])
+        transform_pass.apply(main_graph)
+        transform_pass.apply(infer_graph)
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.fuse_all_reduce_ops = False
+        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
+            loss_name=static_loss.name, build_strategy=build_strategy)
+
+        feeder = fluid.DataFeeder(
+            feed_list=[static_img, static_label], place=place)
+        with fluid.scope_guard(scope):
+            for batch_id, data in enumerate(reader()):
+                loss_v, = exe.run(binary,
+                                  feed=feeder.feed(data),
+                                  fetch_list=[static_loss])
+                static_loss_rec.append(loss_v[0])
+                if batch_id % 100 == 0:
+                    _logger.info('{}: {}'.format('loss', loss_v))
+
+        save_program = infer_graph.to_program()
+        with fluid.scope_guard(scope):
+            fluid.io.save_inference_model("./static_mnist", [infer_img.name],
+                                          [infer_pre], exe, save_program)
+        rtol = 1e-05
+        atol = 1e-08
+        for i, (loss_d,
+                loss_s) in enumerate(zip(dynamic_loss_rec, static_loss_rec)):
+            diff = np.abs(loss_d - loss_s)
+            if diff > (atol + rtol * np.abs(loss_s)):
+                _logger.info(
+                    "diff({}) at {}, dynamic loss = {}, static loss = {}".
+                    format(diff, i, loss_d, loss_s))
+                break
+
+        self.assertTrue(
+            np.allclose(
+                np.array(dynamic_loss_rec),
+                np.array(static_loss_rec),
+                rtol=rtol,
+                atol=atol,
+                equal_nan=True),
+            msg='Failed to do the imperative qat.')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py
index a05aa3b0a84b57bb1f9ce00b0ad007280c316c6e..2e3bb6b00218a47467c59a2a88ea45ec80c32419 100644
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -39,6 +39,11 @@ try:
         third_lib_path = current_path + os.sep + '..' + os.sep + 'libs'
         os.environ['path'] = third_lib_path + ';' + os.environ['path']
         sys.path.insert(0, third_lib_path)
+        # Note: from python3.8, PATH will not take effect
+        # https://github.com/python/cpython/pull/12302
+        # Use add_dll_directory to specify dll resolution path
+        if sys.version_info[:2] >= (3, 8):
+            os.add_dll_directory(third_lib_path)
 
 except ImportError as e:
     from .. import compat as cpt
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
index 5152799ca72f1461d6fbfc3a619a6aa9b9477934..5050067e48a1b147d43abd955c64c7fbb8cf6068 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
@@ -60,7 +60,7 @@ class DygraphToStaticAst(gast.NodeTransformer):
     def transfer_from_node_type(self, node_wrapper):
         translator_logger = logging_utils.TranslatorLogger()
         translator_logger.log(
-            1, "   Source code: \n{}".format(ast_to_source_code(self.root)))
+            1, "Source code: \n{}".format(ast_to_source_code(self.root)))
         # Generic transformation
         self.visit(node_wrapper.node)
 
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
index 37ce8b0a152ff8e258e8aee2a54ed7215f77c146..3d1ed836ff1aca38d796f3a247e9a5d6f6cf3add 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
@@ -12,17 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import logging
 import six
 import inspect
 import numpy as np
 import collections
+
 import paddle
 from paddle.fluid import core
 from paddle.fluid.dygraph import layers
 from paddle.fluid.layers.utils import flatten
 from paddle.fluid.layers.utils import pack_sequence_as
 from paddle.fluid.dygraph.base import switch_to_static_graph
+from paddle.fluid.dygraph.dygraph_to_static import logging_utils
 from paddle.fluid.dygraph.dygraph_to_static.utils import parse_arg_and_kwargs
 from paddle.fluid.dygraph.dygraph_to_static.utils import type_name
 from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code
@@ -291,7 +292,7 @@ def convert_to_input_spec(inputs, input_spec):
         if len(inputs) > len(input_spec):
             for rest_input in inputs[len(input_spec):]:
                 if isinstance(rest_input, (core.VarBase, np.ndarray)):
-                    logging.warning(
+                    logging_utils.warn(
                         "The inputs constain `{}` without specificing InputSpec, its shape and dtype will be treated immutable. "
                         "Please specific InputSpec information in `@declarative` if you expect them as mutable inputs.".
                         format(type_name(rest_input)))
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py
index c52872b15016169504359b54ad5a40360e244ce0..4d9ed5916adfd79013be1d8d1bb90f3c44428b49 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py
@@ -26,6 +26,8 @@ CODE_LEVEL_ENV_NAME = 'TRANSLATOR_CODE_LEVEL'
 DEFAULT_VERBOSITY = -1
 DEFAULT_CODE_LEVEL = -1
 
+LOG_AllTransformer = 100
+
 
 def synchronized(func):
     def wrapper(*args, **kwargs):
@@ -53,10 +55,15 @@ class TranslatorLogger(object):
             return
 
         self._initialized = True
+        self.logger_name = "Dynamic-to-Static"
         self._logger = log_helper.get_logger(
-            __name__, 1, fmt='%(asctime)s-%(levelname)s: %(message)s')
+            self.logger_name,
+            1,
+            fmt='%(asctime)s %(name)s %(levelname)s: %(message)s')
         self._verbosity_level = None
         self._transformed_code_level = None
+        self._need_to_echo_log_to_stdout = None
+        self._need_to_echo_code_to_stdout = None
 
     @property
     def logger(self):
@@ -86,6 +93,28 @@ class TranslatorLogger(object):
         self.check_level(level)
         self._transformed_code_level = level
 
+    @property
+    def need_to_echo_log_to_stdout(self):
+        if self._need_to_echo_log_to_stdout is not None:
+            return self._need_to_echo_log_to_stdout
+        return False
+
+    @need_to_echo_log_to_stdout.setter
+    def need_to_echo_log_to_stdout(self, log_to_stdout):
+        assert isinstance(log_to_stdout, (bool, type(None)))
+        self._need_to_echo_log_to_stdout = log_to_stdout
+
+    @property
+    def need_to_echo_code_to_stdout(self):
+        if self._need_to_echo_code_to_stdout is not None:
+            return self._need_to_echo_code_to_stdout
+        return False
+
+    @need_to_echo_code_to_stdout.setter
+    def need_to_echo_code_to_stdout(self, code_to_stdout):
+        assert isinstance(code_to_stdout, (bool, type(None)))
+        self._need_to_echo_code_to_stdout = code_to_stdout
+
     def check_level(self, level):
         if isinstance(level, (six.integer_types, type(None))):
             rv = level
@@ -110,34 +139,56 @@ class TranslatorLogger(object):
 
     def error(self, msg, *args, **kwargs):
         self.logger.error(msg, *args, **kwargs)
+        if self.need_to_echo_log_to_stdout:
+            self._output_to_stdout('ERROR: ' + msg, *args)
 
     def warn(self, msg, *args, **kwargs):
-        self.logger.warn(msg, *args, **kwargs)
+        self.logger.warning(msg, *args, **kwargs)
+        if self.need_to_echo_log_to_stdout:
+            self._output_to_stdout('WARNING: ' + msg, *args)
 
     def log(self, level, msg, *args, **kwargs):
         if self.has_verbosity(level):
-            self.logger.log(level, msg, *args, **kwargs)
+            msg_with_level = '(Level {}) {}'.format(level, msg)
+            self.logger.info(msg_with_level, *args, **kwargs)
+            if self.need_to_echo_log_to_stdout:
+                self._output_to_stdout('INFO: ' + msg_with_level, *args)
 
     def log_transformed_code(self, level, ast_node, transformer_name, *args,
                              **kwargs):
         if self.has_code_level(level):
             source_code = ast_to_source_code(ast_node)
-            header_msg = "After the level {} ast transformer: '{}', the transformed code:\n"\
-                .format(level, transformer_name)
+            if level == LOG_AllTransformer:
+                header_msg = "After the last level ast transformer: '{}', the transformed code:\n" \
+                    .format(transformer_name)
+            else:
+                header_msg = "After the level {} ast transformer: '{}', the transformed code:\n"\
+                    .format(level, transformer_name)
 
             msg = header_msg + source_code
             self.logger.info(msg, *args, **kwargs)
 
+            if self.need_to_echo_code_to_stdout:
+                self._output_to_stdout('INFO: ' + msg, *args)
+
+    def _output_to_stdout(self, msg, *args):
+        msg = self.logger_name + ' ' + msg
+        print(msg % args)
+
 
 _TRANSLATOR_LOGGER = TranslatorLogger()
 
 
-def set_verbosity(level=0):
+def set_verbosity(level=0, also_to_stdout=False):
     """
-    Sets the verbosity level of log for dygraph to static graph.
+    Sets the verbosity level of log for dygraph to static graph. Logs can be output to stdout by setting `also_to_stdout`.
+
     There are two means to set the logging verbosity:
-     1. Call function `set_verbosity`
-     2. Set environment variable `TRANSLATOR_VERBOSITY`
+
+    1. Call function `set_verbosity`
+
+    2. Set environment variable `TRANSLATOR_VERBOSITY`
+
 
     **Note**:
     `set_verbosity` has a higher priority than the environment variable.
@@ -145,6 +196,7 @@ def set_verbosity(level=0):
     Args:
         level(int): The verbosity level. The larger value idicates more verbosity.
             The default value is 0, which means no logging.
+        also_to_stdout(bool): Whether to also output log messages to `sys.stdout`.
 
     Examples:
         .. code-block:: python
@@ -159,27 +211,30 @@ def set_verbosity(level=0):
             # The verbosity level is now 3, but it has no effect because it has a lower priority than `set_verbosity`
     """
     _TRANSLATOR_LOGGER.verbosity_level = level
+    _TRANSLATOR_LOGGER.need_to_echo_log_to_stdout = also_to_stdout
 
 
 def get_verbosity():
     return _TRANSLATOR_LOGGER.verbosity_level
 
 
-LOG_AllTransformer = 100
-
-
-def set_code_level(level=LOG_AllTransformer):
+def set_code_level(level=LOG_AllTransformer, also_to_stdout=False):
     """
-    Sets the level to print code from specific level of Ast Transformer.
+    Sets the level to print code from specific level Ast Transformer. Code can be output to stdout by setting `also_to_stdout`.
+
     There are two means to set the code level:
-     1. Call function `set_code_level`
-     2. Set environment variable `TRANSLATOR_CODE_LEVEL`
+
+    1. Call function `set_code_level`
+
+    2. Set environment variable `TRANSLATOR_CODE_LEVEL`
+
 
     **Note**:
     `set_code_level` has a higher priority than the environment variable.
 
     Args:
         level(int): The level to print code. Default is 100, which means to print the code after all AST Transformers.
+        also_to_stdout(bool): Whether to also output code to `sys.stdout`.
 
     Examples:
         .. code-block:: python
@@ -195,6 +250,7 @@ def set_code_level(level=LOG_AllTransformer):
 
     """
     _TRANSLATOR_LOGGER.transformed_code_level = level
+    _TRANSLATOR_LOGGER.need_to_echo_code_to_stdout = also_to_stdout
 
 
 def get_code_level():
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
index 59cb5fb144eb50f4616c94ed78348d56a4029834..1004665ca15fbc2458c1626735f161c7f4904596 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
@@ -14,21 +14,17 @@
 
 from __future__ import print_function
 import numpy as np
-import logging
 import six
 
-from paddle.fluid import log_helper
 from paddle.fluid import framework, backward, core
 from paddle.fluid.dygraph import layers
 from paddle.fluid.dygraph.base import switch_to_static_graph
+from paddle.fluid.dygraph.dygraph_to_static import logging_utils
 from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_MAGIC_NUM
 from paddle.fluid.layers.utils import flatten
 from paddle.fluid.layers.utils import pack_sequence_as
 import paddle.compat as cpt
 
-_logger = log_helper.get_logger(
-    __name__, logging.WARNING, fmt='%(asctime)s-%(levelname)s: %(message)s')
-
 
 class NestSequence(object):
     """
@@ -72,7 +68,7 @@ class NestSequence(object):
                 if not isinstance(var, (framework.Variable, core.VarBase)):
                     warning_types.add(type(var))
             if warning_types:
-                _logger.warning(
+                logging_utils.warn(
                     "Output of traced function contains non-tensor type values: {}. "
                     "Currently, We don't support to update them while training and will return "
                     "what we first saw. Please try to return them as tensor.".
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py
index bfeeb3bd9bd1864d6f26f8a14a67188ec79a8157..c227c281bb15412beae07d1ec0ba24bfb83490ae 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py
@@ -15,14 +15,8 @@
 from __future__ import print_function
 
 import gast
-import logging
 
-from paddle.fluid import log_helper
-from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper, NodeVarType, StaticAnalysisVisitor
-from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code
-
-_logger = log_helper.get_logger(
-    __name__, logging.WARNING, fmt='%(asctime)s-%(levelname)s: %(message)s')
+from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper, StaticAnalysisVisitor
 
 
 class PrintTransformer(gast.NodeTransformer):
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
index dbf030ccda16fb102af4e00e2a2a4d1fd7983a06..5218c0aac957422a665513b5eb2a0391c5c7a01f 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
@@ -13,17 +13,15 @@
 # limitations under the License.
 
 from __future__ import print_function
-import gast
+
 import collections
-import logging
+import gast
 import inspect
 import six
 import textwrap
 import threading
-import warnings
 import weakref
 
-import gast
 from paddle.fluid import framework
 from paddle.fluid import in_dygraph_mode
 from paddle.fluid.dygraph import layers
@@ -451,7 +449,7 @@ class StaticLayer(object):
                     format(self._function_spec))
         # If more than one programs have been cached, return the recent converted program by default.
         elif cached_program_len > 1:
-            logging.warning(
+            logging_utils.warn(
                 "Current {} has more than one cached programs: {}, the last traced progam will be return by default.".
                 format(self._function_spec, cached_program_len))
 
@@ -632,7 +630,7 @@ class ProgramCache(object):
             # Note: raise warnings if number of traced program is more than `max_tracing_count`
             current_tracing_count = len(self._caches)
             if current_tracing_count > MAX_TRACED_PROGRAM_COUNT:
-                logging.warning(
+                logging_utils.warn(
                     "Current traced program number: {} > `max_tracing_count`:{}. Too much cached programs will bring expensive overhead. "
                     "The reason may be: (1) passing tensors with different shapes, (2) passing python objects instead of tensors.".
                     format(current_tracing_count, MAX_TRACED_PROGRAM_COUNT))
@@ -804,8 +802,9 @@ class ProgramTranslator(object):
         assert callable(
             dygraph_func
         ), "Input dygraph_func is not a callable in ProgramTranslator.get_output"
+
         if not self.enable_to_static:
-            warnings.warn(
+            logging_utils.warn(
                 "The ProgramTranslator.get_output doesn't work when setting ProgramTranslator.enable to False. "
                 "We will just return dygraph output. "
                 "Please call ProgramTranslator.enable(True) if you would like to get static output."
@@ -879,8 +878,9 @@ class ProgramTranslator(object):
         assert callable(
             dygraph_func
         ), "Input dygraph_func is not a callable in ProgramTranslator.get_func"
+
         if not self.enable_to_static:
-            warnings.warn(
+            logging_utils.warn(
                 "The ProgramTranslator.get_func doesn't work when setting ProgramTranslator.enable to False. We will "
                 "just return dygraph output. Please call ProgramTranslator.enable(True) if you would like to get static output."
             )
@@ -933,8 +933,9 @@ class ProgramTranslator(object):
         assert callable(
             dygraph_func
         ), "Input dygraph_func is not a callable in ProgramTranslator.get_program"
+
         if not self.enable_to_static:
-            warnings.warn(
+            logging_utils.warn(
                 "The ProgramTranslator.get_program doesn't work when setting ProgramTranslator.enable to False."
                 "We will just return dygraph output. "
                 "Please call ProgramTranslator.enable(True) if you would like to get static output."
diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py
index 834c1a737d73bdb8ec3dd89eb5ccd6c0780a211d..10819e4b320dd0630c7ac43fdf89b84252823a94 100644
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -26,6 +26,7 @@ from paddle.fluid import core
 from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy
 from paddle.fluid.data_feeder import check_type
 from paddle.fluid.dygraph.base import program_desc_tracing_guard, switch_to_static_graph
+from paddle.fluid.dygraph.dygraph_to_static import logging_utils
 from paddle.fluid.dygraph.dygraph_to_static.logging_utils import set_code_level, set_verbosity
 from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, StaticLayer, unwrap_decorators
 from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer
@@ -120,7 +121,7 @@ def _dygraph_to_static_func_(dygraph_func):
     def __impl__(*args, **kwargs):
         program_translator = ProgramTranslator()
         if in_dygraph_mode() or not program_translator.enable_to_static:
-            warnings.warn(
+            logging_utils.warn(
                 "The decorator 'dygraph_to_static_func' doesn't work in "
                 "dygraph mode or set ProgramTranslator.enable to False. "
                 "We will just return dygraph output.")
@@ -215,7 +216,7 @@ def declarative(function=None, input_spec=None):
         if isinstance(function, Layer):
             if isinstance(function.forward, StaticLayer):
                 class_name = function.__class__.__name__
-                warnings.warn(
+                logging_utils.warn(
                     "`{}.forward` has already been decorated somewhere. It will be redecorated to replace previous one.".
                     format(class_name))
             function.forward = decorated(function.forward)
diff --git a/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
index ad51a043a0a50f89f77811adb7f95759a4f220be..a8c1656b2b03758043aec9058db317e52b45a5af 100644
--- a/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
+++ b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
@@ -98,7 +98,7 @@ class AutoCheckpointChecker(object):
             self._fs_cache = os.getenv("PADDLE_EDL_FS_CACHE", ".cache")
 
             self._save_checkpoint_inter = int(
-                os.getenv("PADDLE_EDL_SAVE_CHECKPOINT_INTER", "900"))  #s
+                os.getenv("PADDLE_EDL_SAVE_CHECKPOINT_INTER", "900"))  # s
 
             if not self._ce_test:
                 assert len(self._hdfs_home) > 3 and \
@@ -132,7 +132,7 @@ class AutoCheckpointChecker(object):
         if in_dygraph_mode():
             return False
 
-        return  self._run_env is not None and \
+        return self._run_env is not None and \
             self._platform is not None and \
             self._job_id is not None and \
             self._hdfs_home is not None and \
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
index 216478479a7cfdcffac5f21855d0974309842c89..e348c67ae0461674358fa6d34ee8a73648862a6d 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
@@ -170,22 +170,40 @@ class CompileTimeStrategy(object):
         return trainer.mode == DistributedMode.ASYNC
 
     def get_role_id(self):
-        return self.role_maker.role_id()
+        try:
+            return self.role_maker._role_id()
+        except Exception:
+            return self.role_maker.role_id()
 
     def get_trainers(self):
-        return self.role_maker.worker_num()
+        try:
+            return self.role_maker._worker_num()
+        except Exception:
+            return self.role_maker.worker_num()
 
     def get_ps_endpoint(self):
-        return self.role_maker.get_pserver_endpoints()[self.get_role_id()]
+        try:
+            return self.role_maker._get_pserver_endpoints()[self.get_role_id()]
+        except Exception:
+            return self.role_maker.get_pserver_endpoints()[self.get_role_id()]
 
     def get_ps_endpoints(self):
-        return self.role_maker.get_pserver_endpoints()
+        try:
+            return self.role_maker._get_pserver_endpoints()
+        except Exception:
+            return self.role_maker.get_pserver_endpoints()
 
     def get_heter_worker_endpoints(self):
-        return self.role_maker._get_heter_worker_endpoints()
+        try:
+            return self.role_maker._get_heter_worker_endpoints()
+        except Exception:
+            return self.role_maker.get_heter_worker_endpoints()
 
     def get_heter_worker_endpoint(self):
-        return self.role_maker._get_heter_worker_endpoint()
+        try:
+            return self.role_maker._get_heter_worker_endpoint()
+        except Exception:
+            return self.role_maker.get_heter_worker_endpoint()
 
     def get_origin_programs(self):
         return self.origin_main_program, self.origin_startup_program
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index bc9f182d95e3b728fbc0866e1c79f5508d3a04aa..4a750f301a02c1a8f90e4103103c174baf32ead9 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -11229,7 +11229,7 @@ def shape(input):
                 input.shape = [3, 2]
 
     Args:
-        input (Variable): The input can be N-D Tensor or SelectedRows with data type float16, float32, float64, int32, int64.
+        input (Variable): The input can be N-D Tensor or SelectedRows with data type bool, float16, float32, float64, int32, int64.
                           If input variable is type of SelectedRows, returns the shape of it's inner tensor.
 
     Returns:
@@ -11253,8 +11253,8 @@ def shape(input):
             print(res) # [array([  3, 100, 100], dtype=int32)]
     """
     check_variable_and_dtype(
-        input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'],
-        'shape')
+        input, 'input',
+        ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], 'shape')
     helper = LayerHelper('shape', **locals())
     out = helper.create_variable_for_type_inference(dtype='int32')
     helper.append_op(
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 89acfc6075be0b625da04d187cd46dd47ac699c9..0ce7c098e2d53cbcea0f491a6a816c388d14ea4b 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -680,8 +680,10 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
     if not isinstance(value, Variable):
         if dtype in ['int64', 'int32']:
             attrs['str_value'] = str(int(value))
+            attrs['value'] = int(value)
         else:
             attrs['str_value'] = str(float(value))
+            attrs['value'] = float(value)
 
     if in_dygraph_mode():
         shape = utils.convert_shape_to_list(shape)
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index b7848066280b720bd19a2d2f03e3f01bd98a77b9..8d236dca22f2266771a029b7cdbf7db21aefb1fe 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -4,6 +4,7 @@ set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FL
 set(dist_ENVS http_proxy="" https_proxy="")
 
 file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py")
+list(REMOVE_ITEM DIST_TEST_OPS "test_dist_op")
 if(NOT WITH_NCCL)
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl")
 endif()
@@ -102,7 +103,6 @@ if(WIN32)
 endif()
 
 
-LIST(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_new)
 LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint)
 LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint1)
 LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint2)
@@ -463,8 +463,8 @@ if(WITH_DISTRIBUTE)
 	   #py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS})
         if(NOT WIN32)
             py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS})
-            #py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS})
-            #py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS})
+            py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS})
+            py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS})
         endif(NOT WIN32)
     endif(NOT APPLE)
     if(WITH_DGC)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py
index 510b615654751500c33dc3311353ba7e2f8baf40..b8a18179742df108d44dbf527adba819eefe91cd 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py
@@ -56,8 +56,30 @@ class TestLoggingUtils(unittest.TestCase):
         with self.assertRaises(TypeError):
             paddle.jit.set_verbosity(3.3)
 
-    def test_code_level(self):
+    def test_also_to_stdout(self):
+        logging_utils._TRANSLATOR_LOGGER.need_to_echo_log_to_stdout = None
+        self.assertEqual(
+            logging_utils._TRANSLATOR_LOGGER.need_to_echo_log_to_stdout, False)
 
+        paddle.jit.set_verbosity(also_to_stdout=False)
+        self.assertEqual(
+            logging_utils._TRANSLATOR_LOGGER.need_to_echo_log_to_stdout, False)
+
+        logging_utils._TRANSLATOR_LOGGER.need_to_echo_node_to_stdout = None
+        self.assertEqual(
+            logging_utils._TRANSLATOR_LOGGER.need_to_echo_code_to_stdout, False)
+
+        paddle.jit.set_code_level(also_to_stdout=True)
+        self.assertEqual(
+            logging_utils._TRANSLATOR_LOGGER.need_to_echo_code_to_stdout, True)
+
+        with self.assertRaises(AssertionError):
+            paddle.jit.set_verbosity(also_to_stdout=1)
+
+        with self.assertRaises(AssertionError):
+            paddle.jit.set_code_level(also_to_stdout=1)
+
+    def test_set_code_level(self):
         paddle.jit.set_code_level(None)
         os.environ[logging_utils.CODE_LEVEL_ENV_NAME] = '2'
         self.assertEqual(logging_utils.get_code_level(), 2)
@@ -71,7 +93,25 @@ class TestLoggingUtils(unittest.TestCase):
         with self.assertRaises(TypeError):
             paddle.jit.set_code_level(3.3)
 
-    def test_log(self):
+    def test_log_api(self):
+        # test api for CI Converage
+        logging_utils.set_verbosity(1, True)
+
+        logging_utils.warn("warn")
+        logging_utils.error("error")
+
+        logging_utils.log(1, "log level 1")
+        logging_utils.log(2, "log level 2")
+
+        source_code = "x = 3"
+        ast_code = gast.parse(source_code)
+        logging_utils.set_code_level(1, True)
+        logging_utils.log_transformed_code(1, ast_code, "TestTransformer")
+        logging_utils.set_code_level(logging_utils.LOG_AllTransformer, True)
+        logging_utils.log_transformed_code(logging_utils.LOG_AllTransformer,
+                                           ast_code, "TestTransformer")
+
+    def test_log_message(self):
         stream = io.BytesIO() if six.PY2 else io.StringIO()
         log = self.translator_logger.logger
         stdout_handler = logging.StreamHandler(stream)
@@ -84,13 +124,14 @@ class TestLoggingUtils(unittest.TestCase):
 
         if six.PY3:
             with mock.patch.object(sys, 'stdout', stream):
+                logging_utils.set_verbosity(1, False)
                 logging_utils.warn(warn_msg)
                 logging_utils.error(error_msg)
-                self.translator_logger.verbosity_level = 1
                 logging_utils.log(1, log_msg_1)
                 logging_utils.log(2, log_msg_2)
 
-            result_msg = '\n'.join([warn_msg, error_msg, log_msg_1, ""])
+            result_msg = '\n'.join(
+                [warn_msg, error_msg, "(Level 1) " + log_msg_1, ""])
             self.assertEqual(result_msg, stream.getvalue())
 
     def test_log_transformed_code(self):
diff --git a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
index 6a752bc3053d7d0672bd0002250252c3bbbfa1e1..766dcc39af1a55614b0c179a686dbaab5af1e349 100644
--- a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
+++ b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py
@@ -19,7 +19,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
 
-from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 
 java_home = os.environ["JAVA_HOME"]
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec0bd52e9261017335f0bf424d32f26d4c465029
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py
@@ -0,0 +1,228 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+
+
+class ConvAffineChannelFusePassExplicitPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding=[1, 1, 1, 1],
+                bias_attr=False,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+
+    def test_check_output(self):
+        self.check_output()
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_affine_channel_fuse_pass'))
+
+
+class ConvAffineChannelFusePassValidPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding='VALID',
+                bias_attr=False,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+
+    def test_check_output(self):
+        self.check_output()
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_affine_channel_fuse_pass'))
+
+
+class ConvAffineChannelFusePassSamePaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding='SAME',
+                bias_attr=False,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+
+    def test_check_output(self):
+        self.check_output()
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_affine_channel_fuse_pass'))
+
+
+class ConvEltwiseAddAffineChannelFusePassExplicitPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding=[1, 1, 1, 1],
+                bias_attr=param_attr,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+
+    def test_check_output(self):
+        self.check_output()
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible(
+                'conv_eltwiseadd_affine_channel_fuse_pass'))
+
+
+class ConvEltwiseAddAffineChannelFusePassValidPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding='VALID',
+                bias_attr=param_attr,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+
+    def test_check_output(self):
+        self.check_output()
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible(
+                'conv_eltwiseadd_affine_channel_fuse_pass'))
+
+
+class ConvEltwiseAddAffineChannelFusePassSamePaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                groups=3,
+                padding='Same',
+                bias_attr=param_attr,
+                act=None)
+            input_scale = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            input_bias = fluid.layers.create_parameter(
+                shape=[3], dtype="float32")
+            ac_out = fluid.layers.affine_channel(
+                x=conv_out, scale=input_scale, bias=input_bias)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [ac_out]
+
+    def test_check_output(self):
+        self.check_output()
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible(
+                'conv_eltwiseadd_affine_channel_fuse_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..5eb397b5a95b240dcaff9dee3758646b35ab5022
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py
@@ -0,0 +1,171 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import AnalysisConfig
+"""Test for fusion of conv and bias."""
+
+
+#padding SAME
+class ConvBiasMkldnnFusePassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                padding="SAME",
+                bias_attr=param_attr)
+
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+
+#padding VALID
+class ConvBiasMkldnnFusePassTest1(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                padding="VALID",
+                bias_attr=param_attr)
+
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+
+#padding number
+class ConvBiasMkldnnFusePassTest2(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                padding=[2, 4, 6, 8],
+                bias_attr=param_attr)
+
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+
+#dilation not supported yet, just print warning log and does not fuse
+class ConvBiasMkldnnFusePassTest3(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                padding="VALID",
+                dilation=2,
+                groups=3,
+                bias_attr=param_attr,
+                use_cudnn=False,
+                act="softmax",
+                data_format="NCHW")
+
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+
+#all conv params except for dilation
+class ConvBiasMkldnnFusePassTest4(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                padding="VALID",
+                groups=3,
+                bias_attr=param_attr,
+                use_cudnn=False,
+                act="softmax",
+                data_format="NCHW")
+
+        self.feeds = {
+            "data": np.random.random((1, 3, 100, 100)).astype("float32")
+        }
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffe177e59d86e4451b3da1e40fd62e1d398b03b2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py
@@ -0,0 +1,177 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+
+
+class ConvBnFusePassExplicitPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding=[1, 1, 1, 1],
+                bias_attr=False,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(PassVersionChecker.IsCompatible('conv_bn_fuse_pass'))
+
+
+class ConvBnFusePassValidPaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding='VALID',
+                bias_attr=False,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(PassVersionChecker.IsCompatible('conv_bn_fuse_pass'))
+
+
+class ConvBnFusePassSamePaddingTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding='SAME',
+                bias_attr=False,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(PassVersionChecker.IsCompatible('conv_bn_fuse_pass'))
+
+
+class ConvEltwiseAddBnFuseExplicitPaddingPass(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding=[1, 1, 1, 1],
+                bias_attr=None,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_eltwiseadd_bn_fuse_pass'))
+
+
+class ConvEltwiseAddBnFuseValidPaddingPass(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding='VALID',
+                bias_attr=None,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_eltwiseadd_bn_fuse_pass'))
+
+
+class ConvEltwiseAddBnFuseSamePaddingPass(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=6,
+                filter_size=6,
+                groups=3,
+                padding='SAME',
+                bias_attr=None,
+                act=None)
+            bn_out = fluid.layers.batch_norm(conv_out, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [bn_out]
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_eltwiseadd_bn_fuse_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py
index d6dbd397b90368d5cac27c3c5d92b7a7dce9dcf5..6907b6a7eb50eb90224af678c0ae6d053e05ac9b 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py
@@ -19,6 +19,7 @@ import numpy as np
 from inference_pass_test import InferencePassTest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
 from paddle.fluid.core import AnalysisConfig
 """Test for fusion of conv, elementwise_add and 2 act."""
 
@@ -46,6 +47,9 @@ class ConvElementwiseAdd2ActFusePassTest(InferencePassTest):
         if core.is_compiled_with_cuda():
             use_gpu = True
             self.check_output_with_option(use_gpu)
+        self.assertTrue(
+            PassVersionChecker.IsCompatible(
+                'conv_elementwise_add2_act_fuse_pass'))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py
index 2e9035420d7ee45dd69de8d3cd8acc9bb1590c72..6ff60aa6debf8968e1f674e3c7aff822dfde075c 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py
@@ -19,6 +19,7 @@ import numpy as np
 from inference_pass_test import InferencePassTest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
 from paddle.fluid.core import AnalysisConfig
 """Test for fusion of conv, elementwise_add and act."""
 
@@ -48,6 +49,9 @@ class ConvElementwiseAddActFusePassTest(InferencePassTest):
         if core.is_compiled_with_cuda():
             use_gpu = True
             self.check_output_with_option(use_gpu)
+        self.assertTrue(
+            PassVersionChecker.IsCompatible(
+                'conv_elementwise_add_act_fuse_pass'))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py
index 7c4e0d6e76ec45c4b75ba91522306a4dd0abc7c5..96b046edaec49038d6c8e137494c13bd7484cb7f 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py
@@ -19,6 +19,7 @@ import numpy as np
 from inference_pass_test import InferencePassTest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
 from paddle.fluid.core import AnalysisConfig
 """Test for fusion of conv and elementwise_add."""
 
@@ -44,6 +45,8 @@ class ConvElementwiseAddFusePassTest(InferencePassTest):
         if core.is_compiled_with_cuda():
             use_gpu = True
             self.check_output_with_option(use_gpu)
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('conv_elementwise_add_fuse_pass'))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..a62adcea3f94379aa81643e26a7df53ab92fe676
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import AnalysisConfig
+from paddle.fluid.core import PassVersionChecker
+
+
+class FcFusePassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 128, 768], dtype="float32")
+            data_y = fluid.data(name="y", shape=[-1, 128, 768], dtype="float32")
+            fc_out1 = fluid.layers.fc(input=data,
+                                      size=3072,
+                                      num_flatten_dims=2,
+                                      act="relu")
+            fc_out2 = fluid.layers.fc(input=fc_out1,
+                                      size=768,
+                                      num_flatten_dims=2)
+
+        self.feeds = {"data": np.random.random((4, 128, 768)).astype("float32")}
+        self.fetch_list = [fc_out2]
+
+    def test_check_output(self):
+        use_gpu = [False]
+        if core.is_compiled_with_cuda():
+            use_gpu.append(True)
+        for i in range(len(use_gpu)):
+            self.check_output_with_option(use_gpu[i])
+
+        self.assertTrue(PassVersionChecker.IsCompatible('fc_fuse_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_gru_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_gru_fuse_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7b43470d402f8671091365db04237797a012e78
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_gru_fuse_pass.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+
+
+class FcGruFusePassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            dict_dim, emb_dim = 128, 64
+            data = fluid.data(
+                name='step_data', shape=[None], dtype='int64', lod_level=1)
+            emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
+            hidden_dim = 512
+            x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
+            hidden = fluid.layers.dynamic_gru(
+                input=x,
+                size=hidden_dim,
+                bias_attr=True,
+                origin_mode=False,
+                is_reverse=True)
+
+        batch = 16
+        lod_tensor = fluid.LoDTensor()
+        lod_tensor.set(np.random.randint(
+            0, dict_dim, size=[batch]).astype("int64"),
+                       fluid.CPUPlace())
+        lod_tensor.set_lod([[0, batch]])
+        self.feeds = {"step_data": lod_tensor}
+        self.fetch_list = [hidden]
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+        self.assertTrue(PassVersionChecker.IsCompatible('fc_gru_fuse_pass'))
+
+
+class MulGruFusePassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            dict_dim, emb_dim = 128, 64
+            data = fluid.data(
+                name='step_data', shape=[None], dtype='int64', lod_level=1)
+            emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
+            hidden_dim = 512
+            x = fluid.layers.fc(input=emb, size=hidden_dim * 3, bias_attr=False)
+            hidden = fluid.layers.dynamic_gru(
+                input=x,
+                size=hidden_dim,
+                bias_attr=True,
+                origin_mode=False,
+                is_reverse=True)
+
+        batch = 16
+        lod_tensor = fluid.LoDTensor()
+        lod_tensor.set(np.random.randint(
+            0, dict_dim, size=[batch]).astype("int64"),
+                       fluid.CPUPlace())
+        lod_tensor.set_lod([[0, batch]])
+        self.feeds = {"step_data": lod_tensor}
+        self.fetch_list = [hidden]
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+        self.assertTrue(PassVersionChecker.IsCompatible('mul_gru_fuse_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_lstm_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_lstm_fuse_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbb4373dae2c44148a5ac6b65c11a3d47adfd1a1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_lstm_fuse_pass.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+
+
+class MulLstmFusePassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            dict_dim, emb_dim = 128, 64
+            hidden_dim = 512
+
+            data = fluid.data(
+                name='data', shape=[1], dtype='int64', lod_level=1)
+            emb = fluid.embedding(input=data, size=[dict_dim, emb_dim])
+            x = fluid.layers.fc(input=emb, size=hidden_dim * 4, bias_attr=False)
+            forward, cell = fluid.layers.dynamic_lstm(
+                input=x, size=hidden_dim * 4)
+
+        batch = 16
+        lod_tensor = fluid.LoDTensor()
+        lod_tensor.set(np.random.randint(
+            0, dict_dim, size=[batch]).astype("int64"),
+                       fluid.CPUPlace())
+        lod_tensor.set_lod([[0, batch]])
+        self.feeds = {"data": lod_tensor}
+        self.fetch_list = [forward, cell]
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+        self.assertTrue(PassVersionChecker.IsCompatible('mul_lstm_fuse_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..c78884480dab38446a20c456aed060eff847a9ed
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+
+
+class RepeatedFcReluFusePass3Test(InferencePassTest):
+    def setUp(self):
+        fc_num = 3
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                bias_attr=param_attr,
+                act=None)
+            fc_outs = []
+            fc_outs.append(
+                fluid.layers.fc(input=[conv_out], act="relu", size=1000))
+            for i in range(1, fc_num):
+                fc_outs.append(
+                    fluid.layers.fc(
+                        input=[fc_outs[i - 1]], act="relu", size=1000))
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [fc_outs[fc_num - 1]]
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('repeated_fc_relu_fuse_pass'))
+
+
+class RepeatedFcReluFusePass9Test(InferencePassTest):
+    def setUp(self):
+        fc_num = 9
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.conv2d(
+                input=data,
+                num_filters=3,
+                filter_size=3,
+                bias_attr=param_attr,
+                act=None)
+            fc_outs = []
+            fc_outs.append(
+                fluid.layers.fc(input=[conv_out], act="relu", size=1000))
+            for i in range(1, fc_num):
+                fc_outs.append(
+                    fluid.layers.fc(
+                        input=[fc_outs[i - 1]], act="relu", size=1000))
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.fetch_list = [fc_outs[fc_num - 1]]
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('repeated_fc_relu_fuse_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..eadda5ba06a79f061bcf87f9b0bf2c0770c763f5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import AnalysisConfig
+from paddle.fluid.core import PassVersionChecker
+
+
+class SeqconvEltaddReluFusePassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(name="data", shape=[100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.sequence_conv(
+                input=data,
+                num_filters=16,
+                filter_size=4,
+                padding_start=0,
+                act="relu",
+                bias_attr=param_attr)
+
+        np_data = np.random.random((80, 100)).astype('float32')
+        x_lod_tensor = fluid.create_lod_tensor(np_data, [[10, 20, 30, 20]],
+                                               fluid.CPUPlace())
+        self.feeds = {"data": x_lod_tensor}
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('seqconv_eltadd_relu_fuse_pass'))
+
+
+class SeqconvEltaddReluFusePassTestPaddingStartPositive(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(name="data", shape=[-1, 4], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.sequence_conv(
+                input=data,
+                num_filters=16,
+                filter_size=3,
+                padding_start=2,
+                act="relu",
+                bias_attr=param_attr)
+
+        np_data = np.array([[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3],
+                            [4, 4, 4, 4], [5, 5, 5, 5], [6, 6, 6, 6],
+                            [7, 7, 7, 7]]).astype('float32')
+        x_lod_tensor = fluid.create_lod_tensor(np_data, [[5, 2]],
+                                               fluid.CPUPlace())
+        self.feeds = {"data": x_lod_tensor}
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('seqconv_eltadd_relu_fuse_pass'))
+
+
+class SeqconvEltaddReluFusePassTestPaddingStartNegative(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(name="data", shape=[100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.sequence_conv(
+                input=data,
+                num_filters=16,
+                filter_size=4,
+                padding_start=-1,
+                act="relu",
+                bias_attr=param_attr)
+
+        np_data = np.random.random((80, 100)).astype('float32')
+        x_lod_tensor = fluid.create_lod_tensor(np_data, [[10, 20, 30, 20]],
+                                               fluid.CPUPlace())
+        self.feeds = {"data": x_lod_tensor}
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('seqconv_eltadd_relu_fuse_pass'))
+
+
+class SeqconvEltaddReluFusePassTestPaddingStartNone(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(name="data", shape=[100, 100], dtype="float32")
+            param_attr = fluid.ParamAttr(
+                initializer=fluid.initializer.Xavier(uniform=False),
+                learning_rate=0.001)
+            conv_out = fluid.layers.sequence_conv(
+                input=data,
+                num_filters=16,
+                filter_size=4,
+                act="relu",
+                bias_attr=param_attr)
+
+        np_data = np.random.random((80, 100)).astype('float32')
+        x_lod_tensor = fluid.create_lod_tensor(np_data, [[10, 20, 30, 20]],
+                                               fluid.CPUPlace())
+        self.feeds = {"data": x_lod_tensor}
+        self.fetch_list = [conv_out]
+        self.enable_mkldnn = True
+
+    def test_check_output(self):
+        self.check_output()
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('seqconv_eltadd_relu_fuse_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fa242df4e412fb9c2f3af08b3a186c3e086f2d6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import AnalysisConfig
+from paddle.fluid.core import PassVersionChecker
+
+
+class SquaredMatSubFusePassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data_a = fluid.data(name="data_a", shape=[128, 1], dtype="float32")
+            data_b = fluid.data(name="data_b", shape=[256, 1], dtype="float32")
+
+            fc_a = fluid.layers.fc(data_a, size=256)
+            fc_b = fluid.layers.fc(data_b, size=64)
+
+            data_a_square = paddle.square(fc_a)
+            data_b_square = paddle.square(fc_b)
+
+            matmul_ab = paddle.matmul(fc_a, fc_b)
+            matmul_ab_square = paddle.square(matmul_ab)
+            matmul_square_ab = paddle.matmul(data_a_square, data_b_square)
+
+            scale = paddle.fill_constant(shape=[1], value=0.5, dtype='float32')
+
+            sub_val = paddle.elementwise_sub(matmul_ab_square, matmul_square_ab)
+            squared_mat_sub_out = fluid.layers.elementwise_mul(sub_val, scale)
+
+        self.feeds = {
+            "data_a": np.random.random((128, 1)).astype("float32"),
+            "data_b": np.random.random((256, 1)).astype("float32")
+        }
+        self.fetch_list = [squared_mat_sub_out]
+
+    def test_check_output(self):
+        use_gpu = False
+        self.check_output_with_option(use_gpu)
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('squared_mat_sub_fuse_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py
index 34a52e7aed342ac8db471ad94b277efd0faf9d27..83d4b7091cb3276ba8e2c1ff9fd7dca9b1692c63 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py
@@ -75,7 +75,9 @@ class TransposeFlattenConcatFusePassWithAxisTest(InferencePassTest):
             use_gpu = True
             self.check_output_with_option(use_gpu)
 
-        PassVersionChecker.IsCompatible('transpose_flatten_concat_fuse_pass')
+        self.assertTrue(
+            PassVersionChecker.IsCompatible(
+                'transpose_flatten_concat_fuse_pass'))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9c304496afcc694e47924b9cf0fbd168d08c665
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+from paddle.fluid.core import AnalysisConfig
+
+
+class ShuffleChannelFuseTRTPassTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 6, 64, 64], dtype="float32")
+            reshape1 = fluid.layers.reshape(x=data, shape=[-1, 2, 3, 64, 64])
+            trans = fluid.layers.transpose(x=reshape1, perm=[0, 2, 1, 3, 4])
+            reshape2 = fluid.layers.reshape(x=trans, shape=[-1, 6, 64, 64])
+            out = fluid.layers.batch_norm(reshape2, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([1, 6, 64, 64]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = ShuffleChannelFuseTRTPassTest.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+
+    def test_check_output(self):
+
+        self.check_output()
+
+        self.assertTrue(
+            PassVersionChecker.IsCompatible('shuffle_channel_detect_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
index 90db9595d92ef602c03fa7dd104484a4f6101a87..3c78438bdf68538da598f19270d8812e1286474d 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py
@@ -67,13 +67,13 @@ class AutoCheckpointTestDist(AutoCheckPointACLBase):
         save_dir = "./run_save_0"
         fs.delete(save_dir)
 
-        #basic
+        # basic
         exe, main_prog, startup_prog = self._generate()
 
         compiled, data_loader, optimizer, loss, image, label = \
             self._init_env(exe, main_prog, startup_prog, minimize=False)
 
-        #fleet
+        # fleet
         os.environ["TRAINING_ROLE"] = "TRAINER"
         os.environ["PADDLE_TRAINER_ID"] = "0"
         os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"
diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
index 4982cd195820811b9a8ec3fe6d01955234032120..c6190590108876ba97feb4dad0c31884727ec978 100644
--- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
@@ -26,7 +26,7 @@ def stable_softmax(x):
     return exps / np.sum(exps)
 
 
-def log_softmax(x, axis=-1):
+def log_softmax(x, axis=1):
     softmax_out = np.apply_along_axis(stable_softmax, axis, x)
     return np.log(softmax_out)
 
diff --git a/python/paddle/fluid/tests/unittests/test_device_guard.py b/python/paddle/fluid/tests/unittests/test_device_guard.py
index eb8861f1bc462e61d4bd685fca39452bfd8a8c4e..330065ecd92f1537e10df7b993682da821a46548 100644
--- a/python/paddle/fluid/tests/unittests/test_device_guard.py
+++ b/python/paddle/fluid/tests/unittests/test_device_guard.py
@@ -33,6 +33,14 @@ def execute(main_program, startup_program):
     exe.run(main_program)
 
 
+def get_vaild_warning_num(warning, w):
+    num = 0
+    for i in range(len(w)):
+        if warning in str(w[i].message):
+            num += 1
+    return num
+
+
 class TestDeviceGuard(unittest.TestCase):
     def test_device_guard(self):
         main_program = fluid.Program()
@@ -133,7 +141,10 @@ class TestDeviceGuard(unittest.TestCase):
                         i = fluid.layers.increment(x=i, value=1, in_place=True)
                         fluid.layers.less_than(x=i, y=loop_len, cond=cond)
 
-        assert len(w) == 1
+        warning = "The Op(while) is not support to set device."
+        warning_num = get_vaild_warning_num(warning, w)
+        assert warning_num == 1
+
         all_ops = main_program.global_block().ops
         device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
         for op in all_ops:
@@ -169,7 +180,10 @@ class TestDeviceGuard(unittest.TestCase):
                         shape=[1], value=4.0, dtype='float32')
                     result = fluid.layers.less_than(x=x, y=y, force_cpu=False)
 
-        assert len(w) == 2
+        warning = "\'device_guard\' has higher priority when they are used at the same time."
+        warning_num = get_vaild_warning_num(warning, w)
+        assert warning_num == 2
+
         all_ops = main_program.global_block().ops
         device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
         for op in all_ops:
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
index f339081e31b87b8d5584fd4f866e0aaf6f391ea7..007affc14084956870e50e0e1b6b650d13cc3926 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
@@ -67,6 +67,13 @@ class TestElementwiseModOp_scalar(TestElementwiseModOp):
         self.out = np.floor_divide(self.x, self.y)
 
 
+class TestElementwiseModOpInverse(TestElementwiseModOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(0, 10000, [10]).astype(self.dtype)
+        self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
+        self.out = np.floor_divide(self.x, self.y)
+
+
 class TestFloorDivideOp(unittest.TestCase):
     def test_name(self):
         with fluid.program_guard(fluid.Program()):
diff --git a/python/paddle/fluid/tests/unittests/test_empty_like_op.py b/python/paddle/fluid/tests/unittests/test_empty_like_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..32d732d9a809950ade5484431b833056336acd54
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_empty_like_op.py
@@ -0,0 +1,192 @@
+#  Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.data_feeder import convert_dtype
+import paddle.fluid.core as core
+from paddle.static import program_guard, Program
+
+
+class TestEmptyLikeAPICommon(unittest.TestCase):
+    def __check_out__(self, out):
+        data_type = convert_dtype(out.dtype)
+        self.assertEqual(data_type, self.dst_dtype,
+                         'dtype should be %s, but get %s' %
+                         (self.dst_dtype, data_type))
+
+        shape = out.shape
+        self.assertTupleEqual(shape, self.dst_shape,
+                              'shape should be %s, but get %s' %
+                              (self.dst_shape, shape))
+
+        if data_type in ['float32', 'float64', 'int32', 'int64']:
+            max_value = np.nanmax(out)
+            min_value = np.nanmin(out)
+            always_non_full_zero = max_value > min_value
+            always_full_zero = max_value == 0.0 and min_value == 0.0
+            self.assertTrue(always_full_zero or always_non_full_zero,
+                            'always_full_zero or always_non_full_zero.')
+        elif data_type in ['bool']:
+            total_num = out.size
+            true_num = np.sum(out == True)
+            false_num = np.sum(out == False)
+            self.assertTrue(total_num == true_num + false_num,
+                            'The value should always be True or False.')
+        else:
+            self.assertTrue(False, 'invalid data type')
+
+
+class TestEmptyLikeAPI(TestEmptyLikeAPICommon):
+    def setUp(self):
+        self.init_config()
+
+    def test_dygraph_api_out(self):
+        paddle.disable_static()
+        out = paddle.empty_like(self.x, self.dtype)
+        self.__check_out__(out.numpy())
+        paddle.enable_static()
+
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("float32")
+        self.dtype = self.x.dtype
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI2(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("float64")
+        self.dtype = self.x.dtype
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI3(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("int")
+        self.dtype = self.x.dtype
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI4(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("int64")
+        self.dtype = self.x.dtype
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI5(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("bool")
+        self.dtype = self.x.dtype
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI6(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("float64")
+        self.dtype = "float32"
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI7(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("int")
+        self.dtype = "float32"
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI8(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("int64")
+        self.dtype = "float32"
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI9(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("bool")
+        self.dtype = "float32"
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI10(TestEmptyLikeAPI):
+    def init_config(self):
+        self.x = np.random.random((200, 3)).astype("float32")
+        self.dtype = "bool"
+        self.dst_shape = self.x.shape
+        self.dst_dtype = self.dtype
+
+
+class TestEmptyLikeAPI_Static(TestEmptyLikeAPICommon):
+    def setUp(self):
+        self.init_config()
+
+    def test_static_graph(self):
+        dtype = 'float32'
+
+        train_program = Program()
+        startup_program = Program()
+
+        with program_guard(train_program, startup_program):
+            x = np.random.random(self.x_shape).astype(dtype)
+            data_x = paddle.static.data(
+                'x', shape=self.data_x_shape, dtype=dtype)
+
+            out = paddle.empty_like(data_x)
+
+        place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+        exe = paddle.static.Executor(place)
+        res = exe.run(train_program, feed={'x': x}, fetch_list=[out])
+
+        self.dst_dtype = dtype
+        self.dst_shape = x.shape
+        self.__check_out__(res[0])
+
+    def init_config(self):
+        self.x_shape = (200, 3)
+        self.data_x_shape = [200, 3]
+
+
+class TestEmptyLikeAPI_Static2(TestEmptyLikeAPI_Static):
+    def init_config(self):
+        self.x_shape = (3, 200, 3)
+        self.data_x_shape = [-1, 200, 3]
+
+
+class TestEmptyError(unittest.TestCase):
+    def test_attr(self):
+        def test_dtype():
+            x = np.random.random((200, 3)).astype("float64")
+            dtype = 'uint8'
+            result = paddle.empty_like(x, dtype=dtype)
+
+        self.assertRaises(TypeError, test_dtype)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
index 7835fd3f53ddb7f9a95313c6cc5fc7b72ae6d664..01f0abe0f217c342c4ea14cb55b4c40b5d273284 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -306,5 +306,70 @@ class TestFakeQuantDequantAbsOp(OpTest):
         self.check_grad(["X"], "Out", user_defined_grads=gradient)
 
 
+class TestChannelWiseFakeQuantDequantOp(OpTest):
+    def setUp(self):
+        self.set_arg()
+        assert self.quant_axis in [0, 1], "quant_axis should be 0 or 1."
+
+        self.op_type = "fake_channel_wise_quantize_dequantize_abs_max"
+        self.attrs = {'bit_length': 8, 'quant_axis': self.quant_axis}
+
+        scales = []
+        outputs = self.inputs['X'].copy()
+        range_v = (1 << (self.attrs['bit_length'] - 1)) - 1
+        if self.quant_axis == 0:
+            for i in range(self.inputs['X'].shape[0]):
+                scale_v = np.max(np.abs(self.inputs['X'][i])).astype("float32")
+                scales.append(scale_v)
+                outputs[i] = np.round(outputs[i] * range_v /
+                                      scale_v) * scale_v / range_v
+        elif self.quant_axis == 1:
+            for i in range(self.inputs['X'].shape[1]):
+                scale_v = np.max(np.abs(self.inputs['X'][:, i])).astype(
+                    "float32")
+                scales.append(scale_v)
+                outputs[:, i] = np.round(outputs[:, i] * range_v /
+                                         scale_v) * scale_v / range_v
+
+        self.outputs = {
+            'Out': outputs,
+            'OutScale': np.array(scales).astype("float32"),
+        }
+
+    def set_arg(self):
+        self.quant_axis = 0
+        self.inputs = {
+            'X': np.random.random((3, 4, 64, 64)).astype("float32"),
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        x = self.inputs["X"]
+        gradient = [np.ones(x.shape) / np.product(x.shape)]
+        self.check_grad(["X"], "Out", user_defined_grads=gradient)
+
+
+class TestChannelWiseFakeQuantDequantOp1(TestChannelWiseFakeQuantDequantOp):
+    def set_arg(self):
+        self.quant_axis = 1
+        self.inputs = {
+            'X': np.random.random((15, 20, 5, 5)).astype("float32"),
+        }
+
+
+class TestChannelWiseFakeQuantDequantOp2(TestChannelWiseFakeQuantDequantOp):
+    def set_arg(self):
+        self.quant_axis = 0
+        self.inputs = {'X': np.random.random((30, 15)).astype("float32"), }
+
+
+class TestChannelWiseFakeQuantDequantOp3(TestChannelWiseFakeQuantDequantOp):
+    def set_arg(self):
+        self.quant_axis = 1
+        self.inputs = {'X': np.random.random((30, 15)).astype("float32"), }
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py
index 4ced9841ee43e02a3d1e3f292bf97200dec29f5c..45597e7253c4d5bab50aa58f5f58e13e89ce1c1e 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py
@@ -24,10 +24,10 @@ import numpy as np
 class TestFleetBase(unittest.TestCase):
     def setUp(self):
         os.environ["POD_IP"] = "127.0.0.1"
-        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36000"
         os.environ["PADDLE_TRAINERS_NUM"] = "2"
         os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
-                       "127.0.0.1:36001,127.0.0.2:36001"
+            "127.0.0.1:36001,127.0.0.2:36002"
 
     def test_init(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
@@ -58,37 +58,56 @@ class TestFleetBase(unittest.TestCase):
     def test_worker_endpoints(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        print(fleet.worker_endpoints(to_string=True))
+        self.assertEqual(
+            "127.0.0.1:36000", fleet.worker_endpoints(to_string=True))
+        self.assertEqual(["127.0.0.1:36000"], fleet.worker_endpoints())
 
     def test_server_num(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+
+        role = role_maker.PaddleCloudRoleMaker()
         fleet.init(role)
-        if fleet.is_server():
-            print("fleet server num: {}".format(fleet.server_num()))
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        self.assertEqual(2, fleet.server_num())
 
     def test_server_index(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+
+        role = role_maker.PaddleCloudRoleMaker()
         fleet.init(role)
-        if fleet.is_server():
-            print("fleet server index: {}".format(fleet.server_index()))
+        self.assertEqual(0, fleet.server_index())
 
     def test_server_endpoints(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+
+        role = role_maker.PaddleCloudRoleMaker()
         fleet.init(role)
         if fleet.is_server():
-            print("fleet server index: {}".format(
-                fleet.server_endpoints(to_string=True)))
+            self.assertEqual(
+                "127.0.0.1:36001,127.0.0.2:36002",
+                fleet.server_endpoints(to_string=True))
+            self.assertEqual(["127.0.0.1:36001", "127.0.0.2:36002"],
+                             fleet.server_endpoints())
 
     def test_is_server(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+
+        role = role_maker.PaddleCloudRoleMaker()
         fleet.init(role)
-        if fleet.is_server():
-            print("test fleet is server")
+        self.assertTrue(fleet.is_server())
 
     def test_util(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        self.assertEqual(fleet.util, None)
+        self.assertEqual(fleet.util(), None)
 
     def test_barrier_worker(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
@@ -99,20 +118,17 @@ class TestFleetBase(unittest.TestCase):
     def test_init_worker(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        if fleet.is_worker():
-            fleet.init_worker()
 
-    def test_run_server(self):
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-        if fleet.is_worker():
-            fleet.run_worker()
+        with self.assertRaises(ValueError):
+            if fleet.is_worker():
+                fleet.init_worker()
 
     def test_stop_worker(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        if fleet.is_worker():
-            fleet.stop_worker()
+        with self.assertRaises(ValueError):
+            if fleet.is_worker():
+                fleet.stop_worker()
 
     def test_distributed_optimizer(self):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
index a831f6e838e950f9955c762544c312ed2d8766a9..dae7907161697107a50eaf1b1501881f74509b76 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
@@ -87,7 +87,7 @@ class TestCloudRoleMaker2(unittest.TestCase):
         role2._all_gather(1)
         role2._all_gather(1)
         role2._barrier_server()
-        role2.all_gather(1)
+        role2._all_gather(1)
         role3 = GeneralRoleMaker(path="./test_gloo_3")
         role3._worker_gather(1)
         role3._worker_gather(1)
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py
index 6414ef18d635aea4b73c17a4931c37e596ed6029..6cb40eef27e4d93606167232dba3fc181af3c17a 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py
@@ -40,9 +40,9 @@ class TestCloudRoleMaker(unittest.TestCase):
             from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
             from paddle.fluid.incubate.fleet.base.role_maker import \
                 GeneralRoleMaker
-            from paddle.distributed.fleet.utils import KVHandler
-            from paddle.distributed.fleet.utils import KVServer
-            from paddle.distributed.fleet.utils import KVHTTPServer
+            from paddle.distributed.fleet.utils.http_server import KVHandler
+            from paddle.distributed.fleet.utils.http_server import KVServer
+            from paddle.distributed.fleet.utils.http_server import KVHTTPServer
         except:
             print("warning: no fleet, skip test_pslib_4")
             return
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
index cf9b3e1e9a1605a714b47d99183511b24c903722..4dd254af251ae955878f9846e0f0e06f65c3ec90 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
@@ -15,7 +15,11 @@
 
 from __future__ import print_function
 import os
+import platform
+import shutil
+import tempfile
 import unittest
+import paddle
 import paddle.distributed.fleet.base.role_maker as role_maker
 
 
@@ -26,25 +30,25 @@ class TestRoleMakerBase(unittest.TestCase):
 
     def test_rolemaker_base(self):
         role = role_maker.RoleMakerBase()
-        self.assertRaises(Exception, role.is_worker)
-        self.assertRaises(Exception, role.is_server)
-        self.assertRaises(Exception, role.is_first_worker)
-        self.assertRaises(Exception, role.worker_num)
-        self.assertRaises(Exception, role.server_num)
-        self.assertRaises(Exception, role.worker_index)
-        self.assertRaises(Exception, role.server_index)
-        self.assertRaises(Exception, role.role_id)
-        self.assertRaises(Exception, role.node_num)
-
-        trainer_endpoints = role.get_trainer_endpoints()
+        self.assertRaises(Exception, role._is_worker)
+        self.assertRaises(Exception, role._is_server)
+        self.assertRaises(Exception, role._is_first_worker)
+        self.assertRaises(Exception, role._worker_num)
+        self.assertRaises(Exception, role._server_num)
+        self.assertRaises(Exception, role._worker_index)
+        self.assertRaises(Exception, role._server_index)
+        self.assertRaises(Exception, role._role_id)
+        self.assertRaises(Exception, role._node_num)
+
+        trainer_endpoints = role._get_trainer_endpoints()
         self.assertTrue(len(trainer_endpoints) == 0)
-        pserver_endpoints = role.get_pserver_endpoints()
+        pserver_endpoints = role._get_pserver_endpoints()
         self.assertTrue(len(pserver_endpoints) == 0)
 
         print(role.to_string())
-        self.assertTrue(role._all_gather(role._node_type_comm, 1) is None)
-        self.assertTrue(role._all_reduce(role._node_type_comm, 1) is None)
-        role._barrier(role._node_type_comm)
+        self.assertTrue(role._all_gather(1, "worker") is None)
+        self.assertTrue(role._all_reduce(1, "sum", "worker") is None)
+        role._barrier("worker")
 
 
 class TestCloudRoleMaker(unittest.TestCase):
@@ -72,21 +76,33 @@ class TestCloudRoleMaker(unittest.TestCase):
             print("warning: no netifaces, skip test_tr_rolemaker")
             return
 
-        ro = role_maker.PaddleCloudRoleMaker(
-            is_collective=False, init_gloo=False)
-        self.assertTrue(ro.is_worker())
-        self.assertFalse(ro.is_server())
-        self.assertEqual(ro.worker_num(), 2)
-        self.assertTrue(ro.is_first_worker())
-        worker_endpoints = ro.get_trainer_endpoints()
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertTrue(ro._is_worker())
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertFalse(ro._is_server())
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertEqual(ro._worker_num(), 2)
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertTrue(ro._is_first_worker())
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        worker_endpoints = ro._get_trainer_endpoints()
         self.assertEqual(worker_endpoints[0], '127.0.0.1:36001')
-        self.assertEqual(ro.role_id(), 0)
-        self.assertEqual(ro.node_num(), 2)
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertEqual(ro._role_id(), 0)
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertEqual(ro._node_num(), 2)
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertFalse(ro._is_non_distributed())
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertEqual(ro._heter_worker_num(), 0)
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
+        self.assertFalse(ro._is_heter_worker())
 
     def test_tr_rolemaker_collective(self):
         ro = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        self.assertEqual(ro.worker_num(), 2)
-        self.assertEqual(ro.node_num(), 2)
+        self.assertEqual(ro._worker_num(), 2)
+        ro = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        self.assertEqual(ro._node_num(), 2)
 
     def test_ps_rolemaker(self):
         """Test ps rolemaker."""
@@ -102,14 +118,15 @@ class TestCloudRoleMaker(unittest.TestCase):
 
         ro = role_maker.PaddleCloudRoleMaker(
             is_collective=False, init_gloo=False)
-        self.assertEqual(ro.server_index(), 0)
-        self.assertFalse(ro.is_worker())
-        self.assertTrue(ro.is_server())
-        self.assertEqual(ro.server_num(), 2)
-        pserver_endpoints = ro.get_pserver_endpoints()
+        self.assertEqual(ro._server_index(), 0)
+        self.assertFalse(ro._is_worker())
+        self.assertTrue(ro._is_server())
+        self.assertEqual(ro._server_num(), 2)
+        pserver_endpoints = ro._get_pserver_endpoints()
         self.assertEqual(pserver_endpoints[0], '127.0.0.1:36001')
-        self.assertTrue(ro._all_gather(ro._all_comm, 1) is None)
-        self.assertTrue(ro._all_reduce(ro._all_comm, 1) is None)
+
+        self.assertEqual(ro._all_gather(1, "worker"), 1)
+        self.assertEqual(ro._all_reduce(1, "sum", "worker"), 1)
 
     def test_traing_role(self):
         """Test training role."""
@@ -121,7 +138,7 @@ class TestCloudRoleMaker(unittest.TestCase):
             return
 
         ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
-        self.assertRaises(ValueError, ro.generate_role)
+        self.assertRaises(ValueError, ro._generate_role)
 
 
 class TestUserDefinedRoleMaker(unittest.TestCase):
@@ -142,14 +159,14 @@ class TestUserDefinedRoleMaker(unittest.TestCase):
         ro = role_maker.UserDefinedRoleMaker(
             is_collective=False,
             init_gloo=False,
-            server_endpoints="127.0.0.1:36001,127.0.0.1:36001",
+            server_endpoints=["127.0.0.1:36001", "127.0.0.1:36001"],
             role=role_maker.Role.SERVER,
             current_id=0,
             worker_num=2)
-        self.assertEqual(ro.server_num(), 2)
-        ro.generate_role()
-        self.assertTrue(ro.is_server())
-        self.assertEqual(ro.role_id(), 0)
+        self.assertEqual(ro._server_num(), 2)
+        ro._generate_role()
+        self.assertTrue(ro._is_server())
+        self.assertEqual(ro._role_id(), 0)
 
     def test_tr_rolemaker(self):
         try:
@@ -161,13 +178,589 @@ class TestUserDefinedRoleMaker(unittest.TestCase):
         ro = role_maker.UserDefinedRoleMaker(
             is_collective=False,
             init_gloo=False,
-            server_endpoints="127.0.0.1:36001,127.0.0.1:36001",
+            server_endpoints=["127.0.0.1:36001", "127.0.0.1:36001"],
             role=role_maker.Role.WORKER,
             current_id=0,
             worker_num=2)
-        self.assertIn("127.0.0.1:36001", ro.get_pserver_endpoints())
-        self.assertTrue(ro.is_worker())
-        self.assertEqual(ro.role_id(), 0)
+
+        self.assertIn("127.0.0.1:36001", ro._get_pserver_endpoints())
+        self.assertTrue(ro._is_worker())
+        self.assertEqual(ro._role_id(), 0)
+
+
+class TestGlooWithCloudRoleMaker(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_TRAINERS_NUM"] = "1"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+
+    def case(self, role, comm_world):
+        role._barrier(comm_world)
+
+        gather = role._all_gather(1, comm_world)
+        self.assertEqual(gather[0], 1)
+
+        all_reduce = role._all_reduce(1, "sum", comm_world)
+        self.assertEqual(1, all_reduce)
+
+    def mkdir(self):
+        tmp = tempfile.mkdtemp()
+        return tmp
+
+    def clean(self, tmp):
+        shutil.rmtree(tmp)
+
+    def test_hdfs_gloo(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "worker")
+        self.clean(tmp)
+
+    def test_fs_gloo(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "worker")
+        self.clean(tmp)
+
+    def test_fs_gloo2(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "server")
+        self.clean(tmp)
+
+    def test_fs_gloo3(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "server")
+        self.clean(tmp)
+
+    def test_fs_gloo4(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3"
+        os.environ["PADDLE_GLOO_HTTP_HOST"] = "127.0.0.1"
+        os.environ["PADDLE_GLOO_HTTP_PORT"] = "30019"
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        import time
+        time.sleep(3)
+
+    def test_fs_gloo5(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "2"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "server")
+        self.case(role, "all")
+        self.clean(tmp)
+
+    def test_fs_gloo6(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+
+        os.environ["PADDLE_WITH_GLOO"] = "2"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "server")
+        self.case(role, "all")
+        self.clean(tmp)
+
+    def test_fs_gloo7(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "5"
+
+        role = role_maker.PaddleCloudRoleMaker()
+        self.assertRaises(ValueError, role._generate_role)
+
+    def test_fs_gloo8(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+
+        os.environ["PADDLE_WITH_GLOO"] = "2"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        def net():
+            x = paddle.fluid.layers.data(name='x', shape=[13], dtype='float32')
+            y_predict = paddle.fluid.layers.fc(input=x, size=1, act=None)
+            y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
+            cost = paddle.fluid.layers.square_error_cost(
+                input=y_predict, label=y)
+            avg_cost = paddle.fluid.layers.mean(cost)
+            return avg_cost
+
+        from paddle.distributed import fleet
+
+        role = role_maker.PaddleCloudRoleMaker()
+        fleet.init(role)
+        avg_cost = net()
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = False
+
+        optimizer = paddle.optimizer.SGD(0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy)
+        optimizer.minimize(avg_cost)
+
+        comm_world = "server"
+        fleet.util().barrier(comm_world)
+
+        gather = fleet.util().all_gather(1, comm_world)
+        self.assertEqual(gather[0], 1)
+
+        all_reduce = fleet.util().all_reduce(1, "sum", comm_world)
+        self.assertEqual(1, all_reduce)
+
+        self.clean(tmp)
+
+
+class TestGlooWithCloudRoleMaker(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_TRAINERS_NUM"] = "1"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+
+    def case(self, role, comm_world):
+        role._barrier(comm_world)
+
+        gather = role._all_gather(1, comm_world)
+        self.assertEqual(gather[0], 1)
+
+        all_reduce = role._all_reduce(1, "sum", comm_world)
+        self.assertEqual(1, all_reduce)
+
+    def mkdir(self):
+        tmp = tempfile.mkdtemp()
+        return tmp
+
+    def clean(self, tmp):
+        shutil.rmtree(tmp)
+
+    def test_hdfs_gloo(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "worker")
+        self.clean(tmp)
+
+    def test_fs_gloo(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "worker")
+        self.clean(tmp)
+
+    def test_fs_gloo2(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "server")
+        self.clean(tmp)
+
+    def test_fs_gloo3(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "server")
+        self.clean(tmp)
+
+    def test_fs_gloo4(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3"
+        os.environ["PADDLE_GLOO_HTTP_HOST"] = "127.0.0.1"
+        os.environ["PADDLE_GLOO_HTTP_PORT"] = "30019"
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        import time
+        time.sleep(3)
+
+    def test_fs_gloo5(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "2"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "server")
+        self.case(role, "all")
+        self.clean(tmp)
+
+    def test_fs_gloo6(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+
+        os.environ["PADDLE_WITH_GLOO"] = "2"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        role = role_maker.PaddleCloudRoleMaker()
+        role._generate_role()
+        self.case(role, "server")
+        self.case(role, "all")
+        self.clean(tmp)
+
+    def test_fs_gloo7(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "5"
+
+        role = role_maker.PaddleCloudRoleMaker()
+        self.assertRaises(ValueError, role._generate_role)
+
+    def test_hdfs_gloo_v2(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = ""
+        os.environ["PADDLE_GLOO_FS_UGI"] = ""
+        os.environ["PADDLE_GLOO_FS_PATH"] = ""
+
+        role = role_maker.PaddleCloudRoleMaker()
+        self.assertRaises(ValueError, role._generate_role)
+
+    def test_fs_gloo_v2(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2"
+        os.environ["PADDLE_GLOO_FS_PATH"] = ""
+
+        role = role_maker.PaddleCloudRoleMaker()
+        self.assertRaises(ValueError, role._generate_role)
+
+    def test_http_gloo_v2(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+        os.environ["PADDLE_WITH_GLOO"] = "1"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3"
+        os.environ["PADDLE_GLOO_HTTP_HOST"] = ""
+        os.environ["PADDLE_GLOO_HTTP_PORT"] = ""
+
+        role = role_maker.PaddleCloudRoleMaker()
+        self.assertRaises(ValueError, role._generate_role)
+
+    def test_fs_gloo8(self):
+        plats = platform.platform()
+        if 'Linux' not in plats:
+            print("skip gloo UT on MacOS/Win")
+            return
+
+        tmp = self.mkdir()
+
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINERS_NUM"] = "0"
+
+        os.environ["SYS_JOB_ID"] = "gloo_for_cluster"
+
+        os.environ["PADDLE_WITH_GLOO"] = "2"
+        os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1"
+        os.environ["PADDLE_GLOO_FS_NAME"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_UGI"] = "NULL"
+        os.environ["PADDLE_GLOO_FS_PATH"] = tmp
+
+        def net():
+            x = paddle.fluid.layers.data(name='x', shape=[13], dtype='float32')
+            y_predict = paddle.fluid.layers.fc(input=x, size=1, act=None)
+            y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
+            cost = paddle.fluid.layers.square_error_cost(
+                input=y_predict, label=y)
+            avg_cost = paddle.fluid.layers.mean(cost)
+            return avg_cost
+
+        from paddle.distributed import fleet
+
+        role = role_maker.PaddleCloudRoleMaker()
+        fleet.init(role)
+        avg_cost = net()
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = False
+
+        optimizer = paddle.optimizer.SGD(0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy)
+        optimizer.minimize(avg_cost)
+
+        comm_world = "server"
+        fleet.util().barrier(comm_world)
+
+        gather = fleet.util().all_gather(1, comm_world)
+        self.assertEqual(gather[0], 1)
+
+        all_reduce = fleet.util().all_reduce(1, "sum", comm_world)
+        self.assertEqual(1, all_reduce)
+
+        self.clean(tmp)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_util.py b/python/paddle/fluid/tests/unittests/test_fleet_util.py
index dde36e073fb20eed3b17c79a886739f59ecb185d..1570912e7406f930212eead64305e1e35e1b8ac0 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_util.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_util.py
@@ -59,7 +59,7 @@ class TestFleetUtil(unittest.TestCase):
         import paddle.distributed.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        default_util = fleet.util
+        default_util = fleet.util()
         self.assertEqual(default_util, None)
 
     def test_set_user_defined_util(self):
@@ -76,108 +76,17 @@ class TestFleetUtil(unittest.TestCase):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         my_util = UserDefinedUtil()
-        fleet.util = my_util
-        user_id = fleet.util.get_user_id()
+        fleet.set_util(my_util)
+        user_id = fleet.util().get_user_id()
         self.assertEqual(user_id, 10)
 
     def test_fs(self):
-        from paddle.distributed.fleet.utils import LocalFS
+        from paddle.distributed.fleet.utils.fs import LocalFS
         fs = LocalFS()
         dirs, files = fs.ls_dir("test_tmp")
         dirs, files = fs.ls_dir("./")
         self.assertFalse(fs.need_upload_download())
-        fleet_util.set_file_system(fs)
-
-    def test_barrier(self):
-        try:
-            import netifaces
-        except:
-            print("warning: no netifaces, skip test_barrier")
-            return
-
-        gloo = fluid.core.Gloo()
-        gloo.set_rank(0)
-        gloo.set_size(1)
-        gloo.set_prefix("123")
-        gloo.set_iface("lo")
-        gloo.set_hdfs_store("./tmp_test_fleet_barrier", "", "")
-        gloo.init()
-
-        role = role_maker.UserDefinedRoleMaker(
-            is_collective=False,
-            init_gloo=False,
-            current_id=0,
-            role=role_maker.Role.SERVER,
-            worker_endpoints=["127.0.0.1:6003"],
-            server_endpoints=["127.0.0.1:6001"])
-        role._node_type_comm = gloo
-        role._role_is_generated = True
-        fleet_util._set_role_maker(role)
-
-        fleet_util.barrier("worker")
-
-    def test_all_reduce(self):
-        try:
-            import netifaces
-        except:
-            print("warning: no netifaces, skip test_all_reduce")
-            return
-
-        gloo = fluid.core.Gloo()
-        gloo.set_rank(0)
-        gloo.set_size(1)
-        gloo.set_prefix("123")
-        gloo.set_iface("lo")
-        gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
-        gloo.init()
-
-        role = role_maker.UserDefinedRoleMaker(
-            is_collective=False,
-            init_gloo=False,
-            current_id=0,
-            role=role_maker.Role.WORKER,
-            worker_endpoints=["127.0.0.1:6003"],
-            server_endpoints=["127.0.0.1:6001"])
-        role._node_type_comm = gloo
-        role._role_is_generated = True
-        fleet_util._set_role_maker(role)
-
-        output = fleet_util.all_reduce(1, "sum", comm_world="server")
-        print(output)
-
-    # self.assertEqual(output, 1)
-
-    def test_all_gather(self):
-        try:
-            import netifaces
-        except:
-            print("warning: no netifaces, skip test_all_gather")
-            return
-
-        gloo = fluid.core.Gloo()
-        gloo.set_rank(0)
-        gloo.set_size(1)
-        gloo.set_prefix("123")
-        gloo.set_iface("lo")
-        gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
-        gloo.init()
-
-        role = role_maker.UserDefinedRoleMaker(
-            is_collective=False,
-            init_gloo=False,
-            current_id=0,
-            role=role_maker.Role.SERVER,
-            worker_endpoints=["127.0.0.1:6003"],
-            server_endpoints=["127.0.0.1:6001"])
-        role._node_type_comm = gloo
-        role._all_comm = gloo
-        role._role_is_generated = True
-        fleet_util._set_role_maker(role)
-
-        output = fleet_util.all_gather(1, comm_world="all")
-        print(output)
-        # self.assertTrue(len(output) == 1 and output[0] == 1)
-        self.assertRaises(Exception, fleet_util.all_gather, 1, "test")
+        fleet_util._set_file_system(fs)
 
     def download_files(self):
         path = download(self.proto_data_url, self.module_name,
diff --git a/python/paddle/fluid/tests/unittests/test_fs_interface.py b/python/paddle/fluid/tests/unittests/test_fs_interface.py
index c01876531c99c610706265ff646d93c4a197a26e..581fa9738116dc5c737fd3264528b991e210888b 100644
--- a/python/paddle/fluid/tests/unittests/test_fs_interface.py
+++ b/python/paddle/fluid/tests/unittests/test_fs_interface.py
@@ -20,7 +20,7 @@ import os
 import sys
 import inspect
 
-from paddle.distributed.fleet.utils import LocalFS, FS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils.fs import LocalFS, FS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 
 
 class FSTest(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py
index 1f6e522d2668b5dcd2075ff7af6b4b1ee674632d..5dcce88acf16b96fc26cd56c2e5e034d19405b6c 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_op.py
@@ -216,7 +216,7 @@ class API_TestGather(unittest.TestCase):
                       "index": index_np,
                       'axis': axis_np},
                 fetch_list=[out])
-            expected_output = gather_numpy(x_np, index_np, axis_np)
+            expected_output = gather_numpy(x_np, index_np, axis_np[0])
         self.assertTrue(np.allclose(result, expected_output))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs1.py b/python/paddle/fluid/tests/unittests/test_hdfs1.py
index 430ed1abe860869d791f0eac17accc8416db1eca..1aac1236156ca159e9b285611a55f38925be22c2 100644
--- a/python/paddle/fluid/tests/unittests/test_hdfs1.py
+++ b/python/paddle/fluid/tests/unittests/test_hdfs1.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
@@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
 
-from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 
 java_home = os.environ["JAVA_HOME"]
 
-from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
-
 
 class FSTest1(FSTestBase):
     def test_timeout(self):
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs2.py b/python/paddle/fluid/tests/unittests/test_hdfs2.py
index 7754f89e3c901ac14cb102881e8d338442038559..1fa019bb9cd02c5f9a7181bc321618d5c37a8755 100644
--- a/python/paddle/fluid/tests/unittests/test_hdfs2.py
+++ b/python/paddle/fluid/tests/unittests/test_hdfs2.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
@@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
 
-from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 
 java_home = os.environ["JAVA_HOME"]
 
-from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
-
 
 class FSTest2(FSTestBase):
     def test_hdfs(self):
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs3.py b/python/paddle/fluid/tests/unittests/test_hdfs3.py
index 1a045f4b17fc9b8b68ccf81a23cb953db58a9db7..218bf12ca608a1fd908b3c6e2533517a960e6a23 100644
--- a/python/paddle/fluid/tests/unittests/test_hdfs3.py
+++ b/python/paddle/fluid/tests/unittests/test_hdfs3.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
@@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
 
-from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 
 java_home = os.environ["JAVA_HOME"]
 
-from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase
-
 
 class FSTest3(FSTestBase):
     def test_hdfs(self):
diff --git a/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..c75acd7c15b1e96c49fba61b9f8348b62ab73894
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py
@@ -0,0 +1,114 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+from paddle.fluid.backward import calc_gradient
+import numpy as np
+
+
+class ConvBNLayer(fluid.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 groups=1,
+                 act=None,
+                 use_cudnn=False):
+        super(ConvBNLayer, self).__init__()
+
+        self._conv = fluid.dygraph.Conv2D(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            act=None,
+            bias_attr=False,
+            use_cudnn=use_cudnn)
+
+        self._batch_norm = fluid.dygraph.BatchNorm(num_filters, act=act)
+
+    def forward(self, inputs):
+        y = self._conv(inputs)
+        y = self._batch_norm(y)
+        return y
+
+
+def create_program():
+    main = fluid.Program()
+    startup = fluid.Program()
+    with fluid.program_guard(main, startup):
+        x = fluid.data(name='img', shape=[-1, 3, 224, 224])
+        x.stop_gradient = False
+        x = fluid.layers.prelu(x, mode="channel")
+        conv = ConvBNLayer(
+            num_channels=3,
+            num_filters=3,
+            filter_size=1,
+            act='relu',
+            use_cudnn=True)
+        y = conv(x) + x
+
+        loss = fluid.layers.reduce_sum(y)
+
+        sgd = fluid.optimizer.SGD(learning_rate=0.01)
+        sgd.minimize(loss)
+
+    return loss, main, startup, conv._conv.weight
+
+
+class TestInplaceAddto(unittest.TestCase):
+    def test_result(self):
+        def run_program(enable_addto):
+            np.random.seed(10)
+            paddle.manual_seed(10)
+            paddle.framework.random._manual_program_seed(10)
+            if fluid.core.is_compiled_with_cuda():
+                fluid.set_flags({"FLAGS_cudnn_deterministic": True})
+            fluid.set_flags({"FLAGS_max_inplace_grad_add": 2})
+            loss, main, startup, w = create_program()
+            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+            ) else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+
+            strategy = fluid.BuildStrategy()
+            strategy.enable_addto = enable_addto
+            compiled = fluid.CompiledProgram(main).with_data_parallel(
+                loss_name=loss.name, build_strategy=strategy)
+
+            exe.run(startup)
+            img = np.random.uniform(-128, 128,
+                                    [8, 3, 224, 224]).astype(np.float32)
+            for i in range(2):
+                res = exe.run(compiled,
+                              feed={'img': img},
+                              fetch_list=[loss.name, w.name])
+            return res
+
+        res1, w1 = run_program(True)
+        res2, w2 = run_program(False)
+        print(res1, res2)
+        self.assertTrue(np.array_equal(res1, res2))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
index b0b85f633a2bf613cdbdcc2ba7b31b5d970da8ca..80b201d0842183750361d5e08bab5f78f40a858b 100644
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -67,6 +67,22 @@ class TestSumOp6D(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestSumOp8D(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {
+            'X': np.random.random((1, 3, 1, 2, 1, 4, 3, 10)).astype("float64")
+        }
+        self.attrs = {'dim': (0, 3)}
+        self.outputs = {'Out': self.inputs['X'].sum(axis=(0, 3))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 @skip_check_grad_ci(
     reason="reduce_max is discontinuous non-derivable function,"
     " its gradient check is not supported by unittest framework.")
@@ -103,6 +119,40 @@ class TestMinOp(OpTest):
         self.check_output()
 
 
+class TestMin6DOp(OpTest):
+    """Remove Min with subgradient from gradient check to confirm the success of CI."""
+
+    def setUp(self):
+        self.op_type = "reduce_min"
+        self.inputs = {
+            'X': np.random.random((2, 4, 3, 5, 6, 10)).astype("float64")
+        }
+        self.attrs = {'dim': [2, 4]}
+        self.outputs = {
+            'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim']))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestMin8DOp(OpTest):
+    """Remove Min with subgradient from gradient check to confirm the success of CI."""
+
+    def setUp(self):
+        self.op_type = "reduce_min"
+        self.inputs = {
+            'X': np.random.random((2, 4, 3, 5, 6, 3, 2, 4)).astype("float64")
+        }
+        self.attrs = {'dim': [2, 3, 4]}
+        self.outputs = {
+            'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim']))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestProdOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_prod"
@@ -116,6 +166,42 @@ class TestProdOp(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestProd6DOp(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_prod"
+        self.inputs = {
+            'X': np.random.random((5, 6, 2, 3, 4, 2)).astype("float64")
+        }
+        self.attrs = {'dim': [2, 3, 4]}
+        self.outputs = {
+            'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim']))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestProd8DOp(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_prod"
+        self.inputs = {
+            'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64")
+        }
+        self.attrs = {'dim': [2, 3, 4]}
+        self.outputs = {
+            'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim']))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 class TestAllOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_all"
@@ -127,12 +213,40 @@ class TestAllOp(OpTest):
         self.check_output()
 
 
+class TestAll8DOp(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_all"
+        self.inputs = {
+            'X': np.random.randint(0, 2,
+                                   (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
+        }
+        self.attrs = {'reduce_all': True, 'dim': (2, 3, 4)}
+        self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestAllOpWithDim(OpTest):
     def setUp(self):
         self.op_type = "reduce_all"
         self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
-        self.attrs = {'dim': [1]}
-        self.outputs = {'Out': self.inputs['X'].all(axis=1)}
+        self.attrs = {'dim': (1, )}
+        self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAll8DOpWithDim(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_all"
+        self.inputs = {
+            'X': np.random.randint(0, 2,
+                                   (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
+        }
+        self.attrs = {'dim': (1, 3, 4)}
+        self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])}
 
     def test_check_output(self):
         self.check_output()
@@ -152,6 +266,23 @@ class TestAllOpWithKeepDim(OpTest):
         self.check_output()
 
 
+class TestAll8DOpWithKeepDim(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_all"
+        self.inputs = {
+            'X': np.random.randint(0, 2,
+                                   (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
+        }
+        self.attrs = {'dim': (5, ), 'keep_dim': True}
+        self.outputs = {
+            'Out': np.expand_dims(
+                self.inputs['X'].all(axis=self.attrs['dim']), axis=5)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestAllOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
@@ -175,6 +306,20 @@ class TestAnyOp(OpTest):
         self.check_output()
 
 
+class TestAny8DOp(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_any"
+        self.inputs = {
+            'X': np.random.randint(0, 2,
+                                   (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
+        }
+        self.attrs = {'reduce_all': True, 'dim': (3, 5, 4)}
+        self.outputs = {'Out': self.inputs['X'].any(axis=self.attrs['dim'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestAnyOpWithDim(OpTest):
     def setUp(self):
         self.op_type = "reduce_any"
@@ -186,14 +331,45 @@ class TestAnyOpWithDim(OpTest):
         self.check_output()
 
 
+class TestAny8DOpWithDim(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_any"
+        self.inputs = {
+            'X': np.random.randint(0, 2,
+                                   (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
+        }
+        self.attrs = {'dim': (3, 6)}
+        self.outputs = {'Out': self.inputs['X'].any(axis=self.attrs['dim'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestAnyOpWithKeepDim(OpTest):
     def setUp(self):
         self.op_type = "reduce_any"
         self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
-        self.attrs = {'dim': [1], 'keep_dim': True}
+        self.attrs = {'dim': (1, ), 'keep_dim': True}
+        self.outputs = {
+            'Out': np.expand_dims(
+                self.inputs['X'].any(axis=self.attrs['dim']), axis=1)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAny8DOpWithKeepDim(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_any"
+        self.inputs = {
+            'X': np.random.randint(0, 2,
+                                   (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
+        }
+        self.attrs = {'dim': (1, ), 'keep_dim': True}
         self.outputs = {
             'Out': np.expand_dims(
-                self.inputs['X'].any(axis=1), axis=1)
+                self.inputs['X'].any(axis=self.attrs['dim']), axis=1)
         }
 
     def test_check_output(self):
@@ -283,6 +459,18 @@ class Test3DReduce3(Test1DReduce):
         }
 
 
+class Test8DReduce0(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': (4, 2, 3)}
+        self.inputs = {
+            'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64")
+        }
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
+        }
+
+
 class TestKeepDimReduce(Test1DReduce):
     def setUp(self):
         self.op_type = "reduce_sum"
@@ -294,6 +482,19 @@ class TestKeepDimReduce(Test1DReduce):
         }
 
 
+class TestKeepDim8DReduce(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {
+            'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64")
+        }
+        self.attrs = {'dim': (3, 4, 5), 'keep_dim': True}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
+                                        keepdims=self.attrs['keep_dim'])
+        }
+
+
 class TestReduceAll(Test1DReduce):
     def setUp(self):
         self.op_type = "reduce_sum"
@@ -302,6 +503,16 @@ class TestReduceAll(Test1DReduce):
         self.outputs = {'Out': self.inputs['X'].sum()}
 
 
+class TestReduceAll(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {
+            'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64")
+        }
+        self.attrs = {'reduce_all': True, 'dim': (3, 4, 5)}
+        self.outputs = {'Out': self.inputs['X'].sum(axis=self.attrs['dim'])}
+
+
 @skip_check_grad_ci(
     reason="reduce_max is discontinuous non-derivable function,"
     " its gradient check is not supported by unittest framework.")
diff --git a/python/paddle/fluid/tests/unittests/test_save_model_without_var.py b/python/paddle/fluid/tests/unittests/test_save_model_without_var.py
index b74a6e10917f71b669a2d2e6906d6c7310c59c7e..4c63dced83b199acaffc3e703785b3eb8ec8913b 100644
--- a/python/paddle/fluid/tests/unittests/test_save_model_without_var.py
+++ b/python/paddle/fluid/tests/unittests/test_save_model_without_var.py
@@ -50,7 +50,7 @@ class TestSaveModelWithoutVar(unittest.TestCase):
                 params_filename='params')
             expected_warn = "no variable in your model, please ensure there are any variables in your model to save"
             self.assertTrue(len(w) > 0)
-            self.assertTrue(expected_warn == str(w[0].message))
+            self.assertTrue(expected_warn == str(w[-1].message))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py
index 54e7765c0fb76844a6123fceea6c1ef79dc0c2bf..b9d96f329b5bb48f7167d005f11f64136fdf5d01 100644
--- a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py
+++ b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py
@@ -63,28 +63,28 @@ class TestTopkOp(OpTest):
         self.check_grad(set(['X']), 'Out')
 
 
-class TestTopOp1(TestTopkOp):
+class TestTopkOp1(TestTopkOp):
     def init_args(self):
         self.k = 3
         self.axis = 0
         self.largest = True
 
 
-class TestTopOp2(TestTopkOp):
+class TestTopkOp2(TestTopkOp):
     def init_args(self):
         self.k = 3
         self.axis = 0
         self.largest = False
 
 
-class TestTopOp3(TestTopkOp):
+class TestTopkOp3(TestTopkOp):
     def init_args(self):
         self.k = 4
         self.axis = 0
         self.largest = False
 
 
-class TestTopOp4(TestTopkOp):
+class TestTopkOp4(TestTopkOp):
     def init_args(self):
         self.k = 4
         self.axis = 0
@@ -189,6 +189,8 @@ class TestTopKAPI(unittest.TestCase):
             result1 = paddle.topk(input_tensor, k=2)
             result2 = paddle.topk(input_tensor, k=2, axis=-1)
             result3 = paddle.topk(input_tensor, k=k_tensor, axis=1)
+            self.assertEqual(result3[0].shape, (6, -1, 8))
+            self.assertEqual(result3[1].shape, (6, -1, 8))
             result4 = paddle.topk(input_tensor, k=2, axis=1, largest=False)
             result5 = paddle.topk(input_tensor, k=2, axis=-1, largest=False)
             result6 = paddle.topk(large_input_tensor, k=1, axis=-1)
@@ -239,6 +241,15 @@ class TestTopKAPI(unittest.TestCase):
             self.run_dygraph(place)
             self.run_static(place)
 
+    def test_errors(self):
+        paddle.disable_static()
+        x = paddle.to_tensor([1, 2, 3])
+        with self.assertRaises(BaseException):
+            paddle.topk(x, k=-1)
+
+        with self.assertRaises(BaseException):
+            paddle.topk(x, k=0)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py
index 5fea9f69a18c83be0f6af05784735ea53d0993d2..bd76edc9d8cadf14c6cf224b7708ff4acd6efef4 100644
--- a/python/paddle/fluid/tests/unittests/test_transformer_api.py
+++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py
@@ -474,6 +474,141 @@ class TestTransformer(unittest.TestCase):
             trans_output = transformer(src, tgt, src_mask, tgt_mask,
                                        memory_mask)
 
+    def test_transformer_attr_1(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+
+        # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout,
+                weight_attr=[None],
+                bias_attr=[False])
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
+
+    def test_transformer_attr_2(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+
+        # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout,
+                weight_attr=[None, None],
+                bias_attr=[False, False])
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
+
+    def test_transformer_attr_3(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+
+        # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout,
+                weight_attr=[None, None, None],
+                bias_attr=[False, False, True])
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
+
+    def test_transformer_attr_boolean(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+
+        # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout,
+                bias_attr=False)
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py
index d5d1fdc5b20b9d786f1861d63bb4a646117b80ed..56333211469db5705d04cc5ca253bf01679190a5 100644
--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py
@@ -99,6 +99,18 @@ class TestCase7(TestTransposeOp):
         self.axis = (0, 1, 3, 2)
 
 
+class TestCase8(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 2, 3, 2, 4, 3, 3)
+        self.axis = (0, 1, 3, 2, 4, 5, 6, 7)
+
+
+class TestCase9(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 2, 3, 2, 4, 3, 3)
+        self.axis = (6, 1, 3, 5, 0, 2, 4, 7)
+
+
 class TestTransposeOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py
index c445977df1405240fecc6eb9ff623dcf8ea147d5..d41852c9d7f4f5812a852d6a5c644e75d137f530 100644
--- a/python/paddle/hapi/model.py
+++ b/python/paddle/hapi/model.py
@@ -792,15 +792,14 @@ class Model(object):
     switched by `paddle.disable_static()`. The usage is as follows.
     But note, the switching between dynamic and static should be before
     instantiating a Model. The input description, i.e, paddle.static.InputSpec,
-    must be required for static graph.
+    must be required.
 
     Args:
         network (paddle.nn.Layer): The network is an instance of
             paddle.nn.Layer.
         inputs (InputSpec|list|dict|None): `inputs`, entry points of network,
             could be a InputSpec instance, or lits of InputSpec instances,
-            or dict ({name: InputSpec}), or None. For static graph,
-            inputs must be set. For dynamic graph, it could be None.
+            or dict ({name: InputSpec}), and it couldn't be None.
         labels (InputSpec|list|None): `labels`, entry points of network,
             could be a InputSpec instnace or lits of InputSpec instances,
             or None. For static graph, if labels is required in loss,
@@ -849,10 +848,9 @@ class Model(object):
         self._optimizer = None
         self._test_dataloader = None
 
-        if not in_dygraph_mode():
-            if not isinstance(inputs, (list, dict, Input)):
-                raise TypeError(
-                    "'inputs' must be list or dict in static graph mode")
+        if not isinstance(inputs, (list, dict, Input)):
+            raise TypeError(
+                "'inputs' must be list or dict, and couldn't be None.")
         self._inputs = self._verify_spec(inputs, True)
         self._labels = self._verify_spec(labels)
 
@@ -1004,11 +1002,7 @@ class Model(object):
         have no variable need to save (like SGD), the fill will not generated).
         This function will silently overwrite existing file at the target location.
 
-        If `training` is set to False, only inference model will be saved. It 
-        should be noted that before using `save`, you should run the model, and 
-        the shape of input you saved is as same as the input of its running.
-        `@paddle.jit.to_static` must be added on `forward` function of your layer 
-        in dynamic mode now and these will be optimized later.
+        If `training` is set to False, only inference model will be saved.
 
         Args:
             path (str): The file prefix to save model. The format is
@@ -1037,8 +1031,6 @@ class Model(object):
                             nn.Linear(200, 10),
                             nn.Softmax())
 
-                    # If save for inference in dygraph, need this
-                    @paddle.jit.to_static
                     def forward(self, x):
                         return self.net(x)
 
@@ -1046,7 +1038,7 @@ class Model(object):
                 device = paddle.set_device('cpu')
                 # if use static graph, do not set
                 paddle.disable_static(device) if dynamic else None
-                # inputs and labels are not required for dynamic graph.
+
                 input = InputSpec([None, 784], 'float32', 'x')
                 label = InputSpec([None, 1], 'int64', 'label')
                 model = paddle.Model(Mnist(), input, label)
@@ -1649,10 +1641,6 @@ class Model(object):
                               model_only=False):
         """
         Save inference model can be in static or dynamic mode.
-        It should be noted that before using `save_inference_model`, you should
-        run the model, and the shape you saved is as same as the input of its
-        running. `@paddle.jit.to_static` must be added on `forward` function of
-        your layer in dynamic mode now and these will be optimized later.
 
         Args:
             save_dir (str): The directory path to save the inference model.
@@ -1678,14 +1666,11 @@ class Model(object):
 
             return result_list
 
-        # TODO:
-        # 1. Make it Unnecessary to run model before calling `save_inference_model` for users in dygraph.
-        # 2. Save correct shape of input, now the interface stores the shape that the user sent to
-        #    the inputs of the model in running.
-        # 3. Make it Unnecessary to add `@paddle.jit.to_static` for users in dynamic mode.
         if fluid.in_dygraph_mode():
             with fluid.framework._dygraph_guard(None):
                 layer = self.network
+                layer.forward = paddle.jit.to_static(
+                    layer.forward, input_spec=self._inputs)
 
                 # 1. input check
                 prog_translator = ProgramTranslator()
@@ -1879,18 +1864,7 @@ class Model(object):
     def _verify_spec(self, specs, is_input=False):
         out_specs = []
 
-        if specs is None:
-            # Note(Aurelius84): If not specific specs of `Input`, using argument names of `forward` function
-            # to generate `Input`. But how can we know the actual shape of each input tensor?
-            if is_input:
-                out_specs = [
-                    Input(
-                        name=n, shape=[None])
-                    for n in extract_args(self.network.forward) if n != 'self'
-                ]
-            else:
-                out_specs = to_list(specs)
-        elif isinstance(specs, dict):
+        if isinstance(specs, dict):
             assert is_input == False
             out_specs = [specs[n] \
                 for n in extract_args(self.network.forward) if n != 'self']
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index da086c0955e849619ccbce17a297ca4615a3f3d0..4395520eec70e8483cb61097a166576f4040cb4d 100644
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -1093,7 +1093,7 @@ def cross_entropy(input,
             " 'none', but received %s, which is not allowed." % reduction)
 
     #step 1. log_softmax
-    log_softmax_out = paddle.nn.functional.log_softmax(input)
+    log_softmax_out = paddle.nn.functional.log_softmax(input, axis=1)
     if weight is not None and not isinstance(weight, Variable):
         raise ValueError(
             "The weight' is not a Variable, please convert to Variable.")
diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py
index 042625a3dbd6b07487d6f77442621959f7492af6..1eb9167d0352f36bfcb87db79ba23dce14bac507 100755
--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
@@ -713,7 +713,7 @@ def max_pool2d(x,
                 'data_format', data_format)
             return output
 
-    op_type = 'max_pool2d_with_index' if data_format == "NCHW" else "max_pool2d"
+    op_type = 'max_pool2d_with_index' if data_format == "NCHW" else "pool2d"
     helper = LayerHelper(op_type, **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_variable_for_type_inference(dtype)
@@ -839,7 +839,7 @@ def max_pool3d(x,
                 'data_format', data_format)
             return output
 
-    op_type = "max_pool3d_with_index" if data_format == "NCDHW" else "max_pool3d"
+    op_type = "max_pool3d_with_index" if data_format == "NCDHW" else "pool3d"
     helper = LayerHelper(op_type, **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_variable_for_type_inference(dtype)
diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py
index 63069e83952172df3136458ebfee4b446749934d..4b199d5816c808d4975c51bc154ad21d46f135eb 100644
--- a/python/paddle/nn/layer/transformer.py
+++ b/python/paddle/nn/layer/transformer.py
@@ -53,7 +53,22 @@ def _convert_param_attr_to_list(param_attr, n):
     if isinstance(param_attr, (list, tuple)):
         assert len(param_attr) == n, (
             "length of param_attr should be %d when it is a list/tuple" % n)
-        param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr]
+        param_attrs = []
+        for attr in param_attr:
+            if isinstance(attr, bool):
+                if attr:
+                    param_attrs.append(ParamAttr._to_attr(None))
+                else:
+                    param_attrs.append(False)
+            else:
+                param_attrs.append(ParamAttr._to_attr(attr))
+        # param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr]
+    elif isinstance(param_attr, bool):
+        param_attrs = []
+        if param_attr:
+            param_attrs = [ParamAttr._to_attr(None) for i in range(n)]
+        else:
+            param_attrs = [False] * n
     else:
         param_attrs = []
         attr = ParamAttr._to_attr(param_attr)
@@ -417,7 +432,7 @@ class TransformerEncoderLayer(Layer):
             Otherwise, MHA and FFN both use it as `weight_attr` to create parameters.
             Default: None, which means the default weight parameter property is used.
             See usage for details in :code:`ParamAttr` . 
-        bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
+        bias_attr (ParamAttr|tuple|bool, optional): To specify the bias parameter property.
             If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
             MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN.
             Otherwise, MHA and FFN both use it as `bias_attr` to create parameters.
@@ -986,22 +1001,31 @@ class Transformer(Layer):
             Otherwise, no pre-process and post-precess includes dropout, residual
             connection, layer normalization. Default False
         weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property.
-            If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for
-            self attention, `weight_attr[1]` would be used as `weight_attr` for
-            cross attention, and `weight_attr[2]` would be used as `weight_attr`
-            for linear in FFN. Otherwise, the three sub-layers all uses it as
-            `weight_attr` to create parameters. Default: None, which means the
-            default weight parameter property is used. See usage for details
+            If it is a tuple, the length of `weight_attr` could be 1, 2 or 3. If it is 3, 
+            `weight_attr[0]` would be used as `weight_attr` for self attention, `weight_attr[1]` 
+            would be used as `weight_attr` for cross attention of `TransformerDecoder`, 
+            and `weight_attr[2]` would be used as `weight_attr` for linear in FFN. 
+            If it is 2, `weight_attr[0]` would be used as `weight_attr` both for self attention 
+            and cross attntion and `weight_attr[1]` would be used as `weight_attr` for 
+            linear in FFN. If it is 1, `weight_attr[0]` would be used as `weight_attr` 
+            for self attention, cross attention and linear in FFN. Otherwise, 
+            the three sub-layers all uses it as `weight_attr` to create parameters. 
+            Default: None, which means the default weight parameter property is used. 
+            See usage for details
             in :code:`ParamAttr` . 
         bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
-            If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
-            self attention, `bias_attr[1]` would be used as `bias_attr` for
-            cross attention, and `bias_attr[2]` would be used as `bias_attr`
-            for linear in FFN. Otherwise, the three sub-layers all uses it as
-            `bias_attr` to create parameters. The `False` value means the
-            corresponding layer would not have trainable bias parameter. See
-            usage for details in :code:`ParamAttr` . Default: None,which means
-            the default bias parameter property is used.
+            If it is a tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3, 
+            `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]` 
+            would be used as `bias_attr` for cross attention of `TransformerDecoder`, 
+            and `bias_attr[2]` would be used as `bias_attr` for linear in FFN. 
+            If it is 2, `bias_attr[0]` would be used as `bias_attr` both for self attention 
+            and cross attntion and `bias_attr[1]` would be used as `bias_attr` for 
+            linear in FFN. If it is 1, `bias_attr[0]` would be used as `bias_attr` 
+            for self attention, cross attention and linear in FFN. Otherwise, 
+            the three sub-layers all uses it as `bias_attr` to create parameters. 
+            The `False` value means the corresponding layer would not have trainable 
+            bias parameter. See usage for details in :code:`ParamAttr` . 
+            Default: None,which means the default bias parameter property is used.
         custom_encoder (Layer): If custom encoder is provided, use it as the encoder.
             Default None
         custom_decoder (Layer): If custom decoder is provided, use it as the decoder.
@@ -1049,13 +1073,51 @@ class Transformer(Layer):
                  custom_decoder=None):
         super(Transformer, self).__init__()
 
+        if isinstance(bias_attr, (list, tuple)):
+            if len(bias_attr) == 1:
+                encoder_bias_attr = [bias_attr[0]] * 2
+                decoder_bias_attr = [bias_attr[0]] * 3
+            elif len(bias_attr) == 2:
+                encoder_bias_attr = bias_attr
+                decoder_bias_attr = [bias_attr[0], bias_attr[0], bias_attr[-1]]
+            elif len(bias_attr) == 3:
+                encoder_bias_attr = [bias_attr[0], bias_attr[-1]]
+                decoder_bias_attr = bias_attr
+            else:
+                assert False, (
+                    "length of bias_attr should be 1 or 2 or 3 when it is a list/tuple"
+                )
+        else:
+            encoder_bias_attr = bias_attr
+            decoder_bias_attr = bias_attr
+
+        if isinstance(weight_attr, (list, tuple)):
+            if len(weight_attr) == 1:
+                encoder_weight_attr = [weight_attr[0]] * 2
+                decoder_weight_attr = [weight_attr[0]] * 3
+            elif len(weight_attr) == 2:
+                encoder_weight_attr = weight_attr
+                decoder_weight_attr = [
+                    weight_attr[0], weight_attr[0], weight_attr[-1]
+                ]
+            elif len(weight_attr) == 3:
+                encoder_weight_attr = [weight_attr[0], weight_attr[-1]]
+                decoder_weight_attr = weight_attr
+            else:
+                assert False, (
+                    "length of weight_attr should be 1 or 2 or 3 when it is a list/tuple"
+                )
+        else:
+            encoder_weight_attr = weight_attr
+            decoder_weight_attr = weight_attr
+
         if custom_encoder is not None:
             self.encoder = custom_encoder
         else:
             encoder_layer = TransformerEncoderLayer(
                 d_model, nhead, dim_feedforward, dropout, activation,
-                attn_dropout, act_dropout, normalize_before, weight_attr,
-                bias_attr)
+                attn_dropout, act_dropout, normalize_before,
+                encoder_weight_attr, encoder_bias_attr)
             encoder_norm = LayerNorm(d_model)
             self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
                                               encoder_norm)
@@ -1065,8 +1127,8 @@ class Transformer(Layer):
         else:
             decoder_layer = TransformerDecoderLayer(
                 d_model, nhead, dim_feedforward, dropout, activation,
-                attn_dropout, act_dropout, normalize_before, weight_attr,
-                bias_attr)
+                attn_dropout, act_dropout, normalize_before,
+                decoder_weight_attr, decoder_bias_attr)
             decoder_norm = LayerNorm(d_model)
             self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers,
                                               decoder_norm)
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index 708aaa788f60d56a2adb41c8a571079354b3c192..24cebf8e6e6388a2d1e9711e3f862090918876a3 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -282,14 +282,13 @@ class Adam(Optimizer):
         for param in self._parameter_list:
             if not param.trainable:
                 continue
-            if hasattr(
-                    param, "_is_sparse"
-            ) and param._is_sparse and self.regularization is not None:
-                raise RuntimeError(
-                    "Adam don't support weight_decay with sparse parameters, please set it to None."
-                )
             if param._grad_ivar() is not None:
                 grad_var = param._grad_ivar()
+                if hasattr(grad_var, "_is_sparse") and grad_var._is_sparse(
+                ) and self.regularization is not None:
+                    raise RuntimeError(
+                        "Adam don't support weight_decay with sparse parameters, please set it to None."
+                    )
                 params_grads.append((param, grad_var))
 
         optimize_ops = self._apply_optimize(
diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py
index ff09f4c562aeb8216dea6e1d40b9492c257aeab4..91a2a78203cbc50fec27b4f3ae8d3541ac4ec5da 100644
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
@@ -32,6 +32,21 @@ import random
 import zlib
 import paddle.compat as cpt
 
+# On macOS, the 'spawn' start method is now the default in Python3.8 multiprocessing,
+# Paddle is currently unable to solve this, so forces the process to start using 
+# the 'fork' start method.
+#
+# TODO: This solution is not good, because the fork start method could lead to 
+# crashes of the subprocess. Figure out how to make 'spawn' work.
+#
+# For more details, please refer to
+# https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
+# https://bugs.python.org/issue33725
+if sys.version_info >= (3, 8) and sys.platform == 'darwin':
+    fork_context = multiprocessing.get_context('fork')
+else:
+    fork_context = multiprocessing
+
 
 def cache(reader):
     """
@@ -560,9 +575,9 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
             six.reraise(*sys.exc_info())
 
     def queue_reader():
-        queue = multiprocessing.Queue(queue_size)
+        queue = fork_context.Queue(queue_size)
         for reader in readers:
-            p = multiprocessing.Process(
+            p = fork_context.Process(
                 target=_read_into_queue, args=(reader, queue))
             p.start()
 
@@ -593,9 +608,9 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
     def pipe_reader():
         conns = []
         for reader in readers:
-            parent_conn, child_conn = multiprocessing.Pipe()
+            parent_conn, child_conn = fork_context.Pipe()
             conns.append(parent_conn)
-            p = multiprocessing.Process(
+            p = fork_context.Process(
                 target=_read_into_pipe, args=(reader, child_conn))
             p.start()
 
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 8bb584be2362e7b02bc5b7c5603b148d37499c2d..a713663e1822d4af2d09efb2986aeb513930bbc0 100755
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -41,6 +41,7 @@ from .creation import triu  #DEFINE_ALIAS
 from .creation import tril  #DEFINE_ALIAS
 from .creation import meshgrid  #DEFINE_ALIAS
 from .creation import empty  #DEFINE_ALIAS
+from .creation import empty_like  #DEFINE_ALIAS
 from .io import save  #DEFINE_ALIAS
 from .io import load  #DEFINE_ALIAS
 from .linalg import matmul  #DEFINE_ALIAS
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 8011b92964b7e21fd930f19cec954b27f470e0c6..9aee911e568d1b2cd7aac0cf45e44f2886612a5a 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -49,6 +49,7 @@ __all__ = [
     'full',
     'full_like',
     'empty',
+    'empty_like',
     'triu',
     'tril',
     'meshgrid'
@@ -1068,3 +1069,70 @@ def empty(shape, dtype=None, name=None):
         stop_gradient=True)
     out.stop_gradient = True
     return out
+
+
+def empty_like(x, dtype=None, name=None):
+    """
+    This Op returns a Tensor with uninitialized data which has identical shape of ``x`` and ``dtype``.
+    If the ``dtype`` is None, the data type of Tensor is same with ``x``.
+    
+    Args:
+        x(Tensor): The input tensor which specifies shape and data type. The data type can be bool, float16, float32, float64, int32, int64.
+        dtype(np.dtype|str, optional): The data type of output. The data type can be one
+            of bool, float16, float32, float64, int32, int64. The default value is None, which means the output 
+            data type is the same as input.
+        name(str, optional): The default value is None. Normally there is no need for user to set this
+            property. For more information, please refer to :ref:`api_guide_Name`.
+    
+    Returns:
+        Tensor: Tensor which is created according to ``x`` and ``dtype``, and is uninitialized.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+          import numpy as np
+
+          paddle.disable_static()   # Now we are in imperative mode
+          paddle.set_device("cpu")  # and use cpu device
+
+          x = paddle.randn([2, 3], 'float32')
+          output = paddle.empty_like(x)
+          #[[1.8491974e+20 1.8037303e+28 1.7443726e+28]     # uninitialized
+          # [4.9640171e+28 3.0186127e+32 5.6715899e-11]]    # uninitialized
+    """
+
+    if dtype is None:
+        dtype = x.dtype
+    dtype = convert_dtype(dtype)
+
+    if in_dygraph_mode():
+        out = core.ops.empty('shape', x.shape, 'dtype',
+                             convert_np_dtype_to_dtype_(dtype))
+        out.stop_gradient = True
+        return out
+
+    helper = LayerHelper("empty_like", **locals())
+    check_variable_and_dtype(
+        x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
+        'empty_like')
+    check_dtype(dtype, 'dtype',
+                ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
+                'empty_like')
+    out = helper.create_variable_for_type_inference(dtype=dtype)
+
+    inputs = {}
+    attrs = {}
+    attrs['dtype'] = convert_np_dtype_to_dtype_(dtype)
+    shape = paddle.shape(x)
+    utils.get_shape_tensor_inputs(
+        inputs=inputs, attrs=attrs, shape=shape, op_type='empty_like')
+
+    helper.append_op(
+        type='empty',
+        inputs=inputs,
+        outputs={'Out': [out]},
+        attrs=attrs,
+        stop_gradient=True)
+    out.stop_gradient = True
+    return out
diff --git a/python/paddle/tests/test_dataset_cifar.py b/python/paddle/tests/test_dataset_cifar.py
index 2ecc41c3f0a81a56cc34e826483ea4f5cc6681d9..672de7ae8e94eceded92dfa0e77621eedac0e3b0 100644
--- a/python/paddle/tests/test_dataset_cifar.py
+++ b/python/paddle/tests/test_dataset_cifar.py
@@ -27,8 +27,10 @@ class TestCifar10Train(unittest.TestCase):
         # long time, randomly check 1 sample
         idx = np.random.randint(0, 50000)
         data, label = cifar[idx]
-        self.assertTrue(len(data.shape) == 1)
-        self.assertTrue(data.shape[0] == 3072)
+        self.assertTrue(len(data.shape) == 3)
+        self.assertTrue(data.shape[0] == 3)
+        self.assertTrue(data.shape[1] == 32)
+        self.assertTrue(data.shape[2] == 32)
         self.assertTrue(0 <= int(label) <= 9)
 
 
@@ -41,8 +43,10 @@ class TestCifar10Test(unittest.TestCase):
         # long time, randomly check 1 sample
         idx = np.random.randint(0, 10000)
         data, label = cifar[idx]
-        self.assertTrue(len(data.shape) == 1)
-        self.assertTrue(data.shape[0] == 3072)
+        self.assertTrue(len(data.shape) == 3)
+        self.assertTrue(data.shape[0] == 3)
+        self.assertTrue(data.shape[1] == 32)
+        self.assertTrue(data.shape[2] == 32)
         self.assertTrue(0 <= int(label) <= 9)
 
 
@@ -55,8 +59,10 @@ class TestCifar100Train(unittest.TestCase):
         # long time, randomly check 1 sample
         idx = np.random.randint(0, 50000)
         data, label = cifar[idx]
-        self.assertTrue(len(data.shape) == 1)
-        self.assertTrue(data.shape[0] == 3072)
+        self.assertTrue(len(data.shape) == 3)
+        self.assertTrue(data.shape[0] == 3)
+        self.assertTrue(data.shape[1] == 32)
+        self.assertTrue(data.shape[2] == 32)
         self.assertTrue(0 <= int(label) <= 99)
 
 
@@ -69,8 +75,10 @@ class TestCifar100Test(unittest.TestCase):
         # long time, randomly check 1 sample
         idx = np.random.randint(0, 10000)
         data, label = cifar[idx]
-        self.assertTrue(len(data.shape) == 1)
-        self.assertTrue(data.shape[0] == 3072)
+        self.assertTrue(len(data.shape) == 3)
+        self.assertTrue(data.shape[0] == 3)
+        self.assertTrue(data.shape[1] == 32)
+        self.assertTrue(data.shape[2] == 32)
         self.assertTrue(0 <= int(label) <= 99)
 
 
diff --git a/python/paddle/tests/test_datasets.py b/python/paddle/tests/test_datasets.py
index 1e50ff60aa5c3039c21d6e1e3a714c32000462c7..1e0d6dbacf6c4c5a781aaa40440921fe1a281ca9 100644
--- a/python/paddle/tests/test_datasets.py
+++ b/python/paddle/tests/test_datasets.py
@@ -103,12 +103,14 @@ class TestMNISTTest(unittest.TestCase):
 
 class TestMNISTTrain(unittest.TestCase):
     def test_main(self):
-        mnist = MNIST(mode='train', chw_format=False)
+        mnist = MNIST(mode='train')
         self.assertTrue(len(mnist) == 60000)
 
         for i in range(len(mnist)):
             image, label = mnist[i]
-            self.assertTrue(image.shape[0] == 784)
+            self.assertTrue(image.shape[0] == 1)
+            self.assertTrue(image.shape[1] == 28)
+            self.assertTrue(image.shape[2] == 28)
             self.assertTrue(label.shape[0] == 1)
             self.assertTrue(0 <= int(label) <= 9)
 
diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py
index 5c4e98feaa686217bc78ad3915423593ad4fcdce..62cc39c1f7b5303d98bffd9eb5814d4579a6d3f1 100644
--- a/python/paddle/tests/test_model.py
+++ b/python/paddle/tests/test_model.py
@@ -67,35 +67,6 @@ class LeNetDygraph(paddle.nn.Layer):
         return x
 
 
-class LeNetDeclarative(fluid.dygraph.Layer):
-    def __init__(self, num_classes=10, classifier_activation=None):
-        super(LeNetDeclarative, self).__init__()
-        self.num_classes = num_classes
-        self.features = Sequential(
-            Conv2d(
-                1, 6, 3, stride=1, padding=1),
-            ReLU(),
-            Pool2D(2, 'max', 2),
-            Conv2d(
-                6, 16, 5, stride=1, padding=0),
-            ReLU(),
-            Pool2D(2, 'max', 2))
-
-        if num_classes > 0:
-            self.fc = Sequential(
-                Linear(400, 120), Linear(120, 84), Linear(84, 10),
-                Softmax())  #Todo: accept any activation
-
-    @declarative
-    def forward(self, inputs):
-        x = self.features(inputs)
-
-        if self.num_classes > 0:
-            x = fluid.layers.flatten(x, 1)
-            x = self.fc(x)
-        return x
-
-
 class MnistDataset(MNIST):
     def __init__(self, mode, return_label=True, sample_num=None):
         super(MnistDataset, self).__init__(mode=mode)
@@ -444,7 +415,9 @@ class TestModelFunction(unittest.TestCase):
         # dynamic saving
         device = paddle.set_device('cpu')
         fluid.enable_dygraph(device)
-        model = Model(MyModel(classifier_activation=None))
+        inputs = [InputSpec([None, 20], 'float32', 'x')]
+        labels = [InputSpec([None, 1], 'int64', 'label')]
+        model = Model(MyModel(classifier_activation=None), inputs, labels)
         optim = fluid.optimizer.SGD(learning_rate=0.001,
                                     parameter_list=model.parameters())
         model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum"))
@@ -543,11 +516,10 @@ class TestModelFunction(unittest.TestCase):
 
     def test_export_deploy_model(self):
         for dynamic in [True, False]:
-            fluid.enable_dygraph() if dynamic else None
-            # paddle.disable_static() if dynamic else None
+            paddle.disable_static() if dynamic else None
             prog_translator = ProgramTranslator()
             prog_translator.enable(False) if not dynamic else None
-            net = LeNetDeclarative()
+            net = LeNet()
             inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')]
             model = Model(net, inputs)
             model.prepare()
@@ -556,8 +528,9 @@ class TestModelFunction(unittest.TestCase):
                 os.makedirs(save_dir)
             tensor_img = np.array(
                 np.random.random((1, 1, 28, 28)), dtype=np.float32)
-            ori_results = model.test_batch(tensor_img)
+
             model.save(save_dir, training=False)
+            ori_results = model.test_batch(tensor_img)
             fluid.disable_dygraph() if dynamic else None
 
             place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda(
@@ -574,6 +547,7 @@ class TestModelFunction(unittest.TestCase):
                 np.testing.assert_allclose(
                     results, ori_results, rtol=1e-5, atol=1e-7)
                 shutil.rmtree(save_dir)
+            paddle.enable_static()
 
 
 class TestRaiseError(unittest.TestCase):
@@ -585,6 +559,14 @@ class TestRaiseError(unittest.TestCase):
         with self.assertRaises(ValueError):
             model = Model(net, inputs, labels)
 
+    def test_input_without_input_spec(self):
+        for dynamic in [True, False]:
+            paddle.disable_static() if dynamic else None
+            net = MyModel(classifier_activation=None)
+            with self.assertRaises(TypeError):
+                model = Model(net)
+            paddle.enable_static()
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/text/datasets/uci_housing.py b/python/paddle/text/datasets/uci_housing.py
index a0d465eb1775431ffa0527dfae8031bebd6fc340..a8dfbc44a97127dd074ef5cbfc727aa535d56872 100644
--- a/python/paddle/text/datasets/uci_housing.py
+++ b/python/paddle/text/datasets/uci_housing.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 import six
 import numpy as np
 
+import paddle
 from paddle.io import Dataset
 from paddle.dataset.common import _check_exists_and_download
 
@@ -88,6 +89,8 @@ class UCIHousing(Dataset):
         # read dataset into memory
         self._load_data()
 
+        self.dtype = paddle.get_default_dtype()
+
     def _load_data(self, feature_num=14, ratio=0.8):
         data = np.fromfile(self.data_file, sep=' ')
         data = data.reshape(data.shape[0] // feature_num, feature_num)
@@ -103,7 +106,8 @@ class UCIHousing(Dataset):
 
     def __getitem__(self, idx):
         data = self.data[idx]
-        return np.array(data[:-1]), np.array(data[-1:])
+        return np.array(data[:-1]).astype(self.dtype), \
+                np.array(data[-1:]).astype(self.dtype)
 
     def __len__(self):
         return len(self.data)
diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py
index 4a786679727fb1b42c216146685e0e6524e858c9..77f5ef7e9661e0ff8f4f200ff7e789ba475459c2 100644
--- a/python/paddle/utils/__init__.py
+++ b/python/paddle/utils/__init__.py
@@ -16,6 +16,7 @@ from .profiler import ProfilerOptions
 from .profiler import Profiler
 from .profiler import get_profiler
 from .deprecated import deprecated
+from .lazy_import import try_import
 from ..fluid.framework import unique_name
 from ..fluid.framework import load_op_library
 from ..fluid.framework import require_version
diff --git a/python/paddle/utils/lazy_import.py b/python/paddle/utils/lazy_import.py
new file mode 100644
index 0000000000000000000000000000000000000000..69a32b77a8f3da7a5a3432b4c0cbf746bd0b0833
--- /dev/null
+++ b/python/paddle/utils/lazy_import.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Lazy imports for heavy dependencies."""
+
+import importlib
+
+
+def try_import(module_name):
+    """Try importing a module, with an informative error message on failure."""
+    install_name = module_name
+    if module_name == 'cv2':
+        install_name = 'opencv-python'
+
+    try:
+        mod = importlib.import_module(module_name)
+        return mod
+    except ImportError:
+        err_msg = (
+            "Failed importing {}. This likely means that some paddle modules "
+            "requires additional dependencies that have to be "
+            "manually installed (usually with `pip install {}`). ").format(
+                module_name, install_name)
+        raise ImportError(err_msg)
diff --git a/python/paddle/vision/datasets/cifar.py b/python/paddle/vision/datasets/cifar.py
index 1193be26da56780058beadfe15640bc76533114a..631892ee4dcbf0382bc79ae4279d895872c68ef0 100644
--- a/python/paddle/vision/datasets/cifar.py
+++ b/python/paddle/vision/datasets/cifar.py
@@ -139,6 +139,7 @@ class Cifar10(Dataset):
 
     def __getitem__(self, idx):
         image, label = self.data[idx]
+        image = np.reshape(image, [3, 32, 32])
         if self.transform is not None:
             image = self.transform(image)
         return image, label
diff --git a/python/paddle/vision/datasets/folder.py b/python/paddle/vision/datasets/folder.py
index 725fd9acafbab7b6adaf07139d02da8e2c9aaada..8a3053abefc1b28ba36150a1ff68a4dd4c3469c9 100644
--- a/python/paddle/vision/datasets/folder.py
+++ b/python/paddle/vision/datasets/folder.py
@@ -14,9 +14,9 @@
 
 import os
 import sys
-import cv2
 
 from paddle.io import Dataset
+from paddle.utils import try_import
 
 __all__ = ["DatasetFolder", "ImageFolder"]
 
@@ -191,6 +191,7 @@ IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif',
 
 
 def cv2_loader(path):
+    cv2 = try_import('cv2')
     return cv2.imread(path)
 
 
diff --git a/python/paddle/vision/datasets/mnist.py b/python/paddle/vision/datasets/mnist.py
index a98561333921d182c0b3a3f486c90a94e79b6a3d..597d4046441ddb5b04ad7ceafd83e28e409c674c 100644
--- a/python/paddle/vision/datasets/mnist.py
+++ b/python/paddle/vision/datasets/mnist.py
@@ -44,8 +44,6 @@ class MNIST(Dataset):
             :attr:`download` is True. Default None
         label_path(str): path to label file, can be set None if
             :attr:`download` is True. Default None
-        chw_format(bool): If set True, the output shape is [1, 28, 28],
-            otherwise, output shape is [1, 784]. Default True.
         mode(str): 'train' or 'test' mode. Default 'train'.
         download(bool): whether to download dataset automatically if
             :attr:`image_path` :attr:`label_path` is not set. Default True
@@ -70,14 +68,12 @@ class MNIST(Dataset):
     def __init__(self,
                  image_path=None,
                  label_path=None,
-                 chw_format=True,
                  mode='train',
                  transform=None,
                  download=True):
         assert mode.lower() in ['train', 'test'], \
                 "mode should be 'train' or 'test', but got {}".format(mode)
         self.mode = mode.lower()
-        self.chw_format = chw_format
         self.image_path = image_path
         if self.image_path is None:
             assert download, "image_path is not set and downloading automatically is disabled"
@@ -139,10 +135,6 @@ class MNIST(Dataset):
                                                       cols)).astype('float32')
                     offset_img += struct.calcsize(fmt_images)
 
-                    images = images / 255.0
-                    images = images * 2.0
-                    images = images - 1.0
-
                     for i in range(buffer_size):
                         self.images.append(images[i, :])
                         self.labels.append(
@@ -150,8 +142,7 @@ class MNIST(Dataset):
 
     def __getitem__(self, idx):
         image, label = self.images[idx], self.labels[idx]
-        if self.chw_format:
-            image = np.reshape(image, [1, 28, 28])
+        image = np.reshape(image, [1, 28, 28])
         if self.transform is not None:
             image = self.transform(image)
         return image, label
diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py
index b5668fa8c7d6812664512a58faf836b5d9f09300..acceb111e6f84bc444dcfc7427653f77e3eedd3a 100644
--- a/python/paddle/vision/transforms/functional.py
+++ b/python/paddle/vision/transforms/functional.py
@@ -18,10 +18,11 @@ import random
 import math
 import functools
 
-import cv2
 import numbers
 import numpy as np
 
+from paddle.utils import try_import
+
 if sys.version_info < (3, 3):
     Sequence = collections.Sequence
     Iterable = collections.Iterable
@@ -54,8 +55,8 @@ def flip(image, code):
     Accordding to the code (the type of flip), flip the input image
 
     Args:
-        image: Input image, with (H, W, C) shape
-        code: Code that indicates the type of flip.
+        image (np.ndarray): Input image, with (H, W, C) shape
+        code (int): Code that indicates the type of flip.
             -1 : Flip horizontally and vertically
             0 : Flip vertically
             1 : Flip horizontally
@@ -77,18 +78,28 @@ def flip(image, code):
             # flip horizontally
             F.flip(fake_img, 1)
     """
+    cv2 = try_import('cv2')
     return cv2.flip(image, flipCode=code)
 
 
 @keepdims
-def resize(img, size, interpolation=cv2.INTER_LINEAR):
+def resize(img, size, interpolation=1):
     """
     resize the input data to given size
 
     Args:
-        input: Input data, could be image or masks, with (H, W, C) shape
-        size: Target size of input data, with (height, width) shape.
-        interpolation: Interpolation method.
+        input (np.ndarray): Input data, could be image or masks, with (H, W, C) shape
+        size (int|list|tuple): Target size of input data, with (height, width) shape.
+        interpolation (int, optional): Interpolation method.
+            0 : cv2.INTER_NEAREST 
+            1 : cv2.INTER_LINEAR 
+            2 : cv2.INTER_CUBIC 
+            3 : cv2.INTER_AREA 
+            4 : cv2.INTER_LANCZOS4 
+            5 : cv2.INTER_LINEAR_EXACT
+            7 : cv2.INTER_MAX 
+            8 : cv2.WARP_FILL_OUTLIERS 
+            16: cv2.WARP_INVERSE_MAP 
 
     Examples:
         .. code-block:: python
@@ -102,7 +113,7 @@ def resize(img, size, interpolation=cv2.INTER_LINEAR):
 
             F.resize(fake_img, (200, 150))
     """
-
+    cv2 = try_import('cv2')
     if isinstance(interpolation, Sequence):
         interpolation = random.choice(interpolation)
 
@@ -179,6 +190,8 @@ def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'):
     assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'], \
         'Expected padding mode be either constant, edge, reflect or symmetric, but got {}'.format(padding_mode)
 
+    cv2 = try_import('cv2')
+
     PAD_MOD = {
         'constant': cv2.BORDER_CONSTANT,
         'edge': cv2.BORDER_REPLICATE,
@@ -214,18 +227,22 @@ def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'):
 
 
 @keepdims
-def rotate(img,
-           angle,
-           interpolation=cv2.INTER_LINEAR,
-           expand=False,
-           center=None):
+def rotate(img, angle, interpolation=1, expand=False, center=None):
     """Rotates the image by angle.
 
     Args:
         img (numpy.ndarray): Image to be rotated.
         angle (float|int): In degrees clockwise order.
-        interpolation (int, optional):
-            interpolation: Interpolation method.
+        interpolation (int, optional): Interpolation method. Default: 1.
+            0 : cv2.INTER_NEAREST 
+            1 : cv2.INTER_LINEAR 
+            2 : cv2.INTER_CUBIC 
+            3 : cv2.INTER_AREA 
+            4 : cv2.INTER_LANCZOS4 
+            5 : cv2.INTER_LINEAR_EXACT
+            7 : cv2.INTER_MAX 
+            8 : cv2.WARP_FILL_OUTLIERS 
+            16: cv2.WARP_INVERSE_MAP 
         expand (bool|optional): Optional expansion flag.
             If true, expands the output image to make it large enough to hold the entire rotated image.
             If false or omitted, make the output image the same size as the input image.
@@ -250,8 +267,9 @@ def rotate(img,
             fake_img = rotate(fake_img, 10)
             print(fake_img.shape)
     """
-    dtype = img.dtype
+    cv2 = try_import('cv2')
 
+    dtype = img.dtype
     h, w, _ = img.shape
     point = center or (w / 2, h / 2)
     M = cv2.getRotationMatrix2D(point, angle=-angle, scale=1)
@@ -312,6 +330,7 @@ def to_grayscale(img, num_output_channels=1):
             fake_img = to_grayscale(fake_img)
             print(fake_img.shape)
     """
+    cv2 = try_import('cv2')
 
     if num_output_channels == 1:
         img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py
index 14809e0c1acaa1b6d5a494e6e3df1801e1c8f61b..9ea828271765cb8976909724bbf6079aa02c2460 100644
--- a/python/paddle/vision/transforms/transforms.py
+++ b/python/paddle/vision/transforms/transforms.py
@@ -17,7 +17,6 @@ from __future__ import division
 import math
 import sys
 import random
-import cv2
 
 import numpy as np
 import numbers
@@ -26,6 +25,7 @@ import collections
 import warnings
 import traceback
 
+from paddle.utils import try_import
 from . import functional as F
 
 if sys.version_info < (3, 3):
@@ -214,7 +214,16 @@ class Resize(object):
             smaller edge of the image will be matched to this number.
             i.e, if height > width, then image will be rescaled to
             (size * height / width, size)
-        interpolation (int): Interpolation mode of resize. Default: cv2.INTER_LINEAR.
+        interpolation (int, optional): Interpolation mode of resize. Default: 1.
+            0 : cv2.INTER_NEAREST 
+            1 : cv2.INTER_LINEAR 
+            2 : cv2.INTER_CUBIC 
+            3 : cv2.INTER_AREA 
+            4 : cv2.INTER_LANCZOS4 
+            5 : cv2.INTER_LINEAR_EXACT
+            7 : cv2.INTER_MAX 
+            8 : cv2.WARP_FILL_OUTLIERS 
+            16: cv2.WARP_INVERSE_MAP 
 
     Examples:
     
@@ -232,7 +241,7 @@ class Resize(object):
             print(fake_img.shape)
     """
 
-    def __init__(self, size, interpolation=cv2.INTER_LINEAR):
+    def __init__(self, size, interpolation=1):
         assert isinstance(size, int) or (isinstance(size, Iterable) and
                                          len(size) == 2)
         self.size = size
@@ -252,6 +261,16 @@ class RandomResizedCrop(object):
         output_size (int|list|tuple): Target size of output image, with (height, width) shape.
         scale (list|tuple): Range of size of the origin size cropped. Default: (0.08, 1.0)
         ratio (list|tuple): Range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33)
+        interpolation (int, optional): Interpolation mode of resize. Default: 1.
+            0 : cv2.INTER_NEAREST 
+            1 : cv2.INTER_LINEAR 
+            2 : cv2.INTER_CUBIC 
+            3 : cv2.INTER_AREA 
+            4 : cv2.INTER_LANCZOS4 
+            5 : cv2.INTER_LINEAR_EXACT
+            7 : cv2.INTER_MAX 
+            8 : cv2.WARP_FILL_OUTLIERS 
+            16: cv2.WARP_INVERSE_MAP 
 
     Examples:
     
@@ -273,7 +292,7 @@ class RandomResizedCrop(object):
                  output_size,
                  scale=(0.08, 1.0),
                  ratio=(3. / 4, 4. / 3),
-                 interpolation=cv2.INTER_LINEAR):
+                 interpolation=1):
         if isinstance(output_size, int):
             self.output_size = (output_size, output_size)
         else:
@@ -328,7 +347,16 @@ class CenterCropResize(object):
     Args:
         size (int|list|tuple): Target size of output image, with (height, width) shape.
         crop_padding (int): Center crop with the padding. Default: 32.
-        interpolation (int): Interpolation mode of resize. Default: cv2.INTER_LINEAR.
+        interpolation (int, optional): Interpolation mode of resize. Default: 1.
+            0 : cv2.INTER_NEAREST 
+            1 : cv2.INTER_LINEAR 
+            2 : cv2.INTER_CUBIC 
+            3 : cv2.INTER_AREA 
+            4 : cv2.INTER_LANCZOS4 
+            5 : cv2.INTER_LINEAR_EXACT
+            7 : cv2.INTER_MAX 
+            8 : cv2.WARP_FILL_OUTLIERS 
+            16: cv2.WARP_INVERSE_MAP 
 
     Examples:
     
@@ -346,7 +374,7 @@ class CenterCropResize(object):
             print(fake_img.shape)
     """
 
-    def __init__(self, size, crop_padding=32, interpolation=cv2.INTER_LINEAR):
+    def __init__(self, size, crop_padding=32, interpolation=1):
         if isinstance(size, int):
             self.size = (size, size)
         else:
@@ -661,6 +689,7 @@ class ContrastTransform(object):
         if self.value == 0:
             return img
 
+        cv2 = try_import('cv2')
         dtype = img.dtype
         img = img.astype(np.float32)
         alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value)
@@ -701,6 +730,8 @@ class SaturationTransform(object):
         if self.value == 0:
             return img
 
+        cv2 = try_import('cv2')
+
         dtype = img.dtype
         img = img.astype(np.float32)
         alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value)
@@ -742,6 +773,7 @@ class HueTransform(object):
         if self.value == 0:
             return img
 
+        cv2 = try_import('cv2')
         dtype = img.dtype
         img = img.astype(np.uint8)
         hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV_FULL)
@@ -1036,7 +1068,16 @@ class RandomRotate(object):
         degrees (sequence or float or int): Range of degrees to select from.
             If degrees is a number instead of sequence like (min, max), the range of degrees
             will be (-degrees, +degrees) clockwise order.
-        interpolation (int|optional): Interpolation mode of resize. Default: cv2.INTER_LINEAR.
+        interpolation (int, optional): Interpolation mode of resize. Default: 1.
+            0 : cv2.INTER_NEAREST 
+            1 : cv2.INTER_LINEAR 
+            2 : cv2.INTER_CUBIC 
+            3 : cv2.INTER_AREA 
+            4 : cv2.INTER_LANCZOS4 
+            5 : cv2.INTER_LINEAR_EXACT
+            7 : cv2.INTER_MAX 
+            8 : cv2.WARP_FILL_OUTLIERS 
+            16: cv2.WARP_INVERSE_MAP 
         expand (bool|optional): Optional expansion flag. Default: False.
             If true, expands the output to make it large enough to hold the entire rotated image.
             If false or omitted, make the output image the same size as the input image.
@@ -1061,11 +1102,7 @@ class RandomRotate(object):
             print(fake_img.shape)
     """
 
-    def __init__(self,
-                 degrees,
-                 interpolation=cv2.INTER_LINEAR,
-                 expand=False,
-                 center=None):
+    def __init__(self, degrees, interpolation=1, expand=False, center=None):
         if isinstance(degrees, numbers.Number):
             if degrees < 0:
                 raise ValueError(
diff --git a/python/requirements.txt b/python/requirements.txt
index ddd1e943df78eed3dd36d36571954665b267019d..6a88d61a94c182db3320e4de30f45be4c852f70f 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -1,4 +1,3 @@
-opencv-python<=4.2.0.32
 requests>=2.20.0
 numpy>=1.13, <=1.16.4 ; python_version<"3.5"
 numpy>=1.13 ; python_version>="3.5"
@@ -18,4 +17,5 @@ decorator
 prettytable
 astor
 pathlib
-netifaces
+netifaces ; platform_system != "Windows"
+netifaces ; python_version>="3.5" and platform_system == "Windows"
diff --git a/python/setup.py.in b/python/setup.py.in
index 117c24bd2d7621e9830ded3d3f2428b94da30319..31c8448ebd9054952ae65fc259c389084afb627d 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -238,9 +238,6 @@ if sys.version_info >= (3,7):
         setup_requires_tmp+=[setup_requires_i]
     setup_requires = setup_requires_tmp
 
-if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
-    setup_requires+=['opencv-python']
-
 # the prefix is sys.prefix which should always be usr
 paddle_bins = ''
 
diff --git a/setup.py b/setup.py
deleted file mode 100644
index af558c2ef0b42b68e47fe98ebd626c9b9034bef9..0000000000000000000000000000000000000000
--- a/setup.py
+++ /dev/null
@@ -1,577 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import subprocess
-import os
-import os.path
-import errno
-import re
-import shutil
-import sys
-import fnmatch
-import errno
-import platform
-
-from contextlib import contextmanager
-from setuptools import Command
-from setuptools import setup, Distribution, Extension
-from setuptools.command.install import install as InstallCommandBase
-
-
-class BinaryDistribution(Distribution):
-    def has_ext_modules(foo):
-        return True
-
-
-RC = 0
-
-ext_name = '.dll' if os.name == 'nt' else ('.dylib' if sys.platform == 'darwin'
-                                           else '.so')
-
-
-def git_commit():
-    try:
-        cmd = ['git', 'rev-parse', 'HEAD']
-        git_commit = subprocess.Popen(
-            cmd, stdout=subprocess.PIPE,
-            cwd="@PADDLE_SOURCE_DIR@").communicate()[0].strip()
-    except:
-        git_commit = 'Unknown'
-    git_commit = git_commit.decode()
-    return str(git_commit)
-
-
-def _get_version_detail(idx):
-    assert idx < 3, "vesion info consists of %(major)d.%(minor)d.%(patch)d, \
-        so detail index must less than 3"
-
-    if re.match('@TAG_VERSION_REGEX@', '@PADDLE_VERSION@'):
-        version_details = '@PADDLE_VERSION@'.split('.')
-
-        if len(version_details) >= 3:
-            return version_details[idx]
-
-    return 0
-
-
-def get_major():
-    return int(_get_version_detail(0))
-
-
-def get_minor():
-    return int(_get_version_detail(1))
-
-
-def get_patch():
-    return str(_get_version_detail(2))
-
-
-def is_taged():
-    try:
-        cmd = [
-            'git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null'
-        ]
-        git_tag = subprocess.Popen(
-            cmd, stdout=subprocess.PIPE,
-            cwd="@PADDLE_SOURCE_DIR@").communicate()[0].strip()
-        git_tag = git_tag.decode()
-    except:
-        return False
-
-    if str(git_tag).replace('v', '') == '@PADDLE_VERSION@':
-        return True
-    else:
-        return False
-
-
-def write_version_py(filename='paddle/version.py'):
-    cnt = '''# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
-#
-full_version    = '%(major)d.%(minor)d.%(patch)s'
-major           = '%(major)d'
-minor           = '%(minor)d'
-patch           = '%(patch)s'
-rc              = '%(rc)d'
-istaged         = %(istaged)s
-commit          = '%(commit)s'
-with_mkl        = '%(with_mkl)s'
-
-def show():
-    if istaged:
-        print('full_version:', full_version)
-        print('major:', major)
-        print('minor:', minor)
-        print('patch:', patch)
-        print('rc:', rc)
-    else:
-        print('commit:', commit)
-
-def mkl():
-    return with_mkl
-'''
-    commit = git_commit()
-    with open(filename, 'w') as f:
-        f.write(cnt % {
-            'major': get_major(),
-            'minor': get_minor(),
-            'patch': get_patch(),
-            'rc': RC,
-            'version': '${PADDLE_VERSION}',
-            'commit': commit,
-            'istaged': is_taged(),
-            'with_mkl': '@WITH_MKL@'
-        })
-
-
-write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py')
-
-
-def write_distributed_training_mode_py(
-        filename='paddle/fluid/incubate/fleet/parameter_server/version.py'):
-    cnt = '''from __future__ import print_function
-
-# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
-
-from paddle.fluid.incubate.fleet.base.mode import Mode
-
-BUILD_MODE=Mode.%(mode)s
-
-def is_transpiler():
-    return Mode.TRANSPILER == BUILD_MODE
-
-'''
-
-    dirname = os.path.dirname(filename)
-
-    try:
-        os.makedirs(dirname)
-    except OSError as e:
-        if e.errno != errno.EEXIST:
-            raise
-
-    with open(filename, 'w') as f:
-        f.write(cnt %
-                {'mode': 'PSLIB' if '${WITH_PSLIB}' == 'ON' else 'TRANSPILER'})
-
-
-write_distributed_training_mode_py(
-    filename='@PADDLE_BINARY_DIR@/python/paddle/fluid/incubate/fleet/parameter_server/version.py'
-)
-
-packages = [
-    'paddle',
-    'paddle.libs',
-    'paddle.utils',
-    'paddle.dataset',
-    'paddle.reader',
-    'paddle.distributed',
-    'paddle.incubate',
-    'paddle.incubate.complex',
-    'paddle.incubate.complex.tensor',
-    'paddle.distributed.fleet',
-    'paddle.distributed.fleet.base',
-    'paddle.distributed.fleet.meta_optimizers',
-    'paddle.distributed.fleet.runtime',
-    'paddle.distributed.fleet.dataset',
-    'paddle.distributed.fleet.metrics',
-    'paddle.distributed.fleet.proto',
-    'paddle.distributed.fleet.utils',
-    'paddle.framework',
-    'paddle.jit',
-    'paddle.fluid',
-    'paddle.fluid.inference',
-    'paddle.fluid.dygraph',
-    'paddle.fluid.dygraph.dygraph_to_static',
-    'paddle.fluid.dygraph.amp',
-    'paddle.fluid.proto',
-    'paddle.fluid.proto.profiler',
-    'paddle.fluid.distributed',
-    'paddle.fluid.layers',
-    'paddle.fluid.dataloader',
-    'paddle.fluid.contrib',
-    'paddle.fluid.contrib.decoder',
-    'paddle.fluid.contrib.quantize',
-    'paddle.fluid.contrib.reader',
-    'paddle.fluid.contrib.slim',
-    'paddle.fluid.contrib.slim.quantization',
-    'paddle.fluid.contrib.slim.quantization.imperative',
-    'paddle.fluid.contrib.utils',
-    'paddle.fluid.contrib.extend_optimizer',
-    'paddle.fluid.contrib.mixed_precision',
-    'paddle.fluid.contrib.layers',
-    'paddle.fluid.transpiler',
-    'paddle.fluid.transpiler.details',
-    'paddle.fluid.incubate',
-    'paddle.fluid.incubate.data_generator',
-    'paddle.fluid.incubate.fleet',
-    'paddle.fluid.incubate.checkpoint',
-    'paddle.fluid.incubate.fleet.base',
-    'paddle.fluid.incubate.fleet.parameter_server',
-    'paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler',
-    'paddle.fluid.incubate.fleet.parameter_server.pslib',
-    'paddle.fluid.incubate.fleet.parameter_server.ir',
-    'paddle.fluid.incubate.fleet.collective',
-    'paddle.fluid.incubate.fleet.utils',
-    'paddle.hapi',
-    'paddle.vision',
-    'paddle.vision.models',
-    'paddle.vision.transforms',
-    'paddle.vision.datasets',
-    'paddle.text',
-    'paddle.text.datasets',
-    'paddle.incubate',
-    'paddle.io',
-    'paddle.optimizer',
-    'paddle.nn',
-    'paddle.nn.functional',
-    'paddle.nn.layer',
-    'paddle.nn.initializer',
-    'paddle.nn.utils',
-    'paddle.metric',
-    'paddle.static',
-    'paddle.static.nn',
-    'paddle.tensor',
-]
-
-with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
-    setup_requires = f.read().splitlines()
-
-# Note(wangzhongpu):
-# When compiling paddle under python36, the dependencies belonging to python2.7 will be imported, resulting in errors when installing paddle
-if sys.version_info >= (3, 6) and sys.version_info < (3, 7):
-    setup_requires_tmp = []
-    for setup_requires_i in setup_requires:
-        if "<\"3.6\"" in setup_requires_i or "<\"3.5\"" in setup_requires_i or "<=\"3.5\"" in setup_requires_i:
-            continue
-        setup_requires_tmp += [setup_requires_i]
-    setup_requires = setup_requires_tmp
-if sys.version_info >= (3, 5) and sys.version_info < (3, 6):
-    setup_requires_tmp = []
-    for setup_requires_i in setup_requires:
-        if "<\"3.5\"" in setup_requires_i:
-            continue
-        setup_requires_tmp += [setup_requires_i]
-    setup_requires = setup_requires_tmp
-if sys.version_info >= (3, 7):
-    setup_requires_tmp = []
-    for setup_requires_i in setup_requires:
-        if "<\"3.6\"" in setup_requires_i or "<=\"3.6\"" in setup_requires_i or "<\"3.5\"" in setup_requires_i or "<=\"3.5\"" in setup_requires_i or "<\"3.7\"" in setup_requires_i:
-            continue
-        setup_requires_tmp += [setup_requires_i]
-    setup_requires = setup_requires_tmp
-
-if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
-    setup_requires += ['opencv-python']
-
-# the prefix is sys.prefix which should always be usr
-paddle_bins = ''
-
-if not '${WIN32}':
-    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
-package_data = {
-    'paddle.fluid':
-    ['${FLUID_CORE_NAME}' + ('.so' if os.name != 'nt' else '.pyd')]
-}
-if '${HAS_NOAVX_CORE}' == 'ON':
-    package_data['paddle.fluid'] += [
-        'core_noavx' + ('.so' if os.name != 'nt' else '.pyd')
-    ]
-
-package_dir = {
-    '': '${PADDLE_BINARY_DIR}/python',
-    # The paddle.fluid.proto will be generated while compiling.
-    # So that package points to other directory.
-    'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
-    'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
-    'paddle.fluid': '${PADDLE_BINARY_DIR}/python/paddle/fluid',
-}
-
-# put all thirdparty libraries in paddle.libs
-libs_path = '${PADDLE_BINARY_DIR}/python/paddle/libs'
-
-package_data['paddle.libs'] = []
-package_data['paddle.libs'] = [('libwarpctc'
-                                if os.name != 'nt' else 'warpctc') + ext_name]
-shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
-
-if '${WITH_MKL}' == 'ON':
-    shutil.copy('${MKLML_SHARED_LIB}', libs_path)
-    shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path)
-    package_data['paddle.libs'] += [
-        ('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name,
-        ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name
-    ]
-else:
-    if os.name == 'nt':
-        # copy the openblas.dll
-        shutil.copy('${OPENBLAS_SHARED_LIB}', libs_path)
-        package_data['paddle.libs'] += ['openblas' + ext_name]
-
-if '${WITH_LITE}' == 'ON':
-    shutil.copy('${LITE_SHARED_LIB}', libs_path)
-    package_data['paddle.libs'] += ['libpaddle_full_api_shared' + ext_name]
-
-if '${WITH_PSLIB}' == 'ON':
-    shutil.copy('${PSLIB_LIB}', libs_path)
-    if os.path.exists('${PSLIB_VERSION_PY}'):
-        shutil.copy(
-            '${PSLIB_VERSION_PY}',
-            '${PADDLE_BINARY_DIR}/python/paddle/fluid/incubate/fleet/parameter_server/pslib/'
-        )
-    package_data['paddle.libs'] += ['libps' + ext_name]
-
-if '${WITH_MKLDNN}' == 'ON':
-    if '${CMAKE_BUILD_TYPE}' == 'Release' and os.name != 'nt':
-        # only change rpath in Release mode.
-        # TODO(typhoonzero): use install_name_tool to patch mkl libs once
-        # we can support mkl on mac.
-        #
-        # change rpath of libdnnl.so.1, add $ORIGIN/ to it.
-        # The reason is that all thirdparty libraries in the same directory,
-        # thus, libdnnl.so.1 will find libmklml_intel.so and libiomp5.so.
-        command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
-        if os.system(command) != 0:
-            raise Exception("patch libdnnl.so failed, command: %s" % command)
-    shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
-    if os.name != 'nt':
-        shutil.copy('${MKLDNN_SHARED_LIB_1}', libs_path)
-        package_data['paddle.libs'] += ['libmkldnn.so.0', 'libdnnl.so.1']
-    else:
-        package_data['paddle.libs'] += ['mkldnn.dll']
-
-if '${WITH_XPU}' == 'ON':
-    # only change rpath in Release mode,
-    if '${CMAKE_BUILD_TYPE}' == 'Release':
-        if os.name != 'nt':
-            if "@APPLE@" == "1":
-                command = "install_name_tool -id \"@loader_path/\" ${XPU_API_LIB}"
-            else:
-                command = "patchelf --set-rpath '$ORIGIN/' ${XPU_API_LIB}"
-            if os.system(command) != 0:
-                raise Exception("patch ${XPU_API_LIB} failed, command: %s" %
-                                command)
-    shutil.copy('${XPU_API_LIB}', libs_path)
-    shutil.copy('${XPU_RT_LIB}', libs_path)
-    shutil.copy('${XPU_SIM_LIB}', libs_path)
-    package_data['paddle.libs'] += [
-        '${XPU_API_LIB_NAME}', '${XPU_RT_LIB_NAME}', '${XPU_SIM_LIB_NAME}'
-    ]
-
-# copy libfuild_framework.so to libs
-if os.name != 'nt' and sys.platform != 'darwin':
-    paddle_framework_lib = '${FLUID_FRAMEWORK_SHARED_LIB}'
-    shutil.copy(paddle_framework_lib, libs_path)
-    package_data['paddle.libs'] += [
-        ('libpaddle_framework'
-         if os.name != 'nt' else 'paddle_framework') + ext_name
-    ]
-
-# remove unused paddle/libs/__init__.py
-if os.path.isfile(libs_path + '/__init__.py'):
-    os.remove(libs_path + '/__init__.py')
-package_dir['paddle.libs'] = libs_path
-
-# change rpath of ${FLUID_CORE_NAME}.ext, add $ORIGIN/../libs/ to it.
-# The reason is that libwarpctc.ext, libiomp5.ext etc are in paddle.libs, and
-# ${FLUID_CORE_NAME}.ext is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries.
-# This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213
-if '${CMAKE_BUILD_TYPE}' == 'Release':
-    if os.name != 'nt':
-        # only change rpath in Release mode, since in Debug mode, ${FLUID_CORE_NAME}.xx is too large to be changed.
-        if "@APPLE@" == "1":
-            command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
-        else:
-            command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
-        # The dynamic library compiled under aarch64 is greater than 64M,
-        # and an oversize error will be reported when using patchelf.
-        if platform.machine() != 'aarch64':
-            if os.system(command) != 0:
-                raise Exception(
-                    "patch ${FLUID_CORE_NAME}.%s failed, command: %s" %
-                    (ext_name, command))
-
-ext_modules = [Extension('_foo', ['stub.cc'])]
-if os.name == 'nt':
-    # fix the path separator under windows
-    fix_package_dir = {}
-    for k, v in package_dir.items():
-        fix_package_dir[k] = v.replace('/', '\\')
-    package_dir = fix_package_dir
-    ext_modules = []
-elif sys.platform == 'darwin':
-    ext_modules = []
-
-
-def find_files(pattern, root):
-    for dirpath, _, files in os.walk(root):
-        for filename in fnmatch.filter(files, pattern):
-            yield os.path.join(dirpath, filename)
-
-
-headers = (
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/framework')) +
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/imperative')) +
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/memory')) +
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/platform')) +
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/string')) +
-    list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/platform')) +
-    list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/framework')) +
-    list(find_files('*.pb', '${cudaerror_INCLUDE_DIR}'))
-    +  # errorMessage.pb for errormessage
-    ['${EIGEN_INCLUDE_DIR}/Eigen/Core'] +  # eigen
-    list(find_files('*', '${EIGEN_INCLUDE_DIR}/Eigen/src')) +  # eigen
-    list(find_files('*', '${EIGEN_INCLUDE_DIR}/unsupported/Eigen')) +  # eigen
-    list(find_files('*', '${GFLAGS_INSTALL_DIR}/include')) +  # gflags
-    list(find_files('*', '${GLOG_INSTALL_DIR}/include')) +  # glog
-    list(find_files('*', '${BOOST_INCLUDE_DIR}/boost')) +  # boost
-    list(find_files('*', '${XXHASH_INSTALL_DIR}/include')) +  # xxhash
-    list(find_files('*', '${PROTOBUF_INCLUDE_DIR}')) +  # protobuf
-    list(find_files('*', '${DLPACK_INCLUDE_DIR}')) +  # dlpack
-    list(find_files('*.h', '${THREADPOOL_INCLUDE_DIR}')))  # threadpool
-
-if '${WITH_MKLDNN}' == 'ON':
-    headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include'))  # mkldnn
-
-if '${WITH_GPU}' == 'ON':
-    headers += list(find_files(
-        '*.pb', '${cudaerror_INCLUDE_DIR}'))  # errorMessage.pb for errormessage
-
-
-class InstallCommand(InstallCommandBase):
-    def finalize_options(self):
-        ret = InstallCommandBase.finalize_options(self)
-        self.install_headers = os.path.join(self.install_purelib, 'paddle',
-                                            'include')
-        self.install_lib = self.install_platlib
-        return ret
-
-
-class InstallHeaders(Command):
-    """Override how headers are copied.
-    """
-    description = 'install C/C++ header files'
-
-    user_options = [
-        ('install-dir=', 'd', 'directory to install header files to'),
-        ('force', 'f', 'force installation (overwrite existing files)'),
-    ]
-
-    boolean_options = ['force']
-
-    def initialize_options(self):
-        self.install_dir = None
-        self.force = 0
-        self.outfiles = []
-
-    def finalize_options(self):
-        self.set_undefined_options(
-            'install', ('install_headers', 'install_dir'), ('force', 'force'))
-
-    def mkdir_and_copy_file(self, header):
-        if 'pb.h' in header:
-            install_dir = re.sub('${PADDLE_BINARY_DIR}/', '', header)
-        elif 'third_party' not in header:
-            # framework
-            install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header)
-        else:
-            # third_party
-            install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header)
-            patterns = [
-                'eigen3/src/extern_eigen3', 'boost/src/extern_boost',
-                'dlpack/src/extern_dlpack/include', 'install/protobuf/include',
-                'install/gflags/include', 'install/glog/include',
-                'install/xxhash/include', 'install/mkldnn/include',
-                'threadpool/src/extern_threadpool'
-            ]
-            for pattern in patterns:
-                install_dir = re.sub(pattern, '', install_dir)
-        install_dir = os.path.join(self.install_dir,
-                                   os.path.dirname(install_dir))
-        if not os.path.exists(install_dir):
-            self.mkpath(install_dir)
-        return self.copy_file(header, install_dir)
-
-    def run(self):
-        # only copy third_party/cudaErrorMessage.pb for cudaErrorMessage on mac or windows
-        if os.name == 'nt' or sys.platform == 'darwin':
-            if '${WITH_GPU}' == 'ON':
-                self.mkdir_and_copy_file(
-                    '${cudaerror_INCLUDE_DIR}/cudaErrorMessage.pb')
-            return
-        hdrs = self.distribution.headers
-        if not hdrs:
-            return
-        self.mkpath(self.install_dir)
-        for header in hdrs:
-            (out, _) = self.mkdir_and_copy_file(header)
-            self.outfiles.append(out)
-
-    def get_inputs(self):
-        return self.distribution.headers or []
-
-    def get_outputs(self):
-        return self.outfiles
-
-
-# we redirect setuptools log for non-windows
-if sys.platform != 'win32':
-
-    @contextmanager
-    def redirect_stdout():
-        f_log = open('${SETUP_LOG_FILE}', 'w')
-        origin_stdout = sys.stdout
-        sys.stdout = f_log
-        yield
-        f_log = sys.stdout
-        sys.stdout = origin_stdout
-        f_log.close()
-else:
-
-    @contextmanager
-    def redirect_stdout():
-        yield
-
-
-if '${WITH_GPU}' == 'ON':
-    os.environ['PACKAGE_NAME'] = "paddlepaddle-gpu"
-else:
-    os.environ['PACKAGE_NAME'] = "paddlepaddle"
-
-with redirect_stdout():
-    setup(
-        name='${PACKAGE_NAME}',
-        version='${PADDLE_VERSION}',
-        description='Parallel Distributed Deep Learning',
-        install_requires=setup_requires,
-        packages=packages,
-        ext_modules=ext_modules,
-        package_data=package_data,
-        package_dir=package_dir,
-        scripts=paddle_bins,
-        distclass=BinaryDistribution,
-        headers=headers,
-        cmdclass={
-            'install_headers': InstallHeaders,
-            'install': InstallCommand,
-        },
-        entry_points={
-            'console_scripts':
-            ['fleetrun = paddle.distributed.fleet.launch:launch']
-        })
-
-# As there are a lot of files in purelib which causes many logs,
-# we don't print them on the screen, and you can open `setup.py.log`
-# for the full logs.
-if os.path.exists('${SETUP_LOG_FILE}'):
-    os.system('grep -v "purelib" ${SETUP_LOG_FILE}')
diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
index b787ae625017d783a7221006ddd6867c21e238e8..943b8c01e8cc0c0e0a41e9b01951939f454c3181 100644
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@@ -39,9 +39,9 @@ fi
 
 api_spec_diff=`python ${PADDLE_ROOT}/tools/check_api_source_without_core_ops.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.source.md5  ${PADDLE_ROOT}/paddle/fluid/API_PR.source.md5` 
 if [ "$api_spec_diff" != "" ]; then
+    echo_line="${echo_line}Related APIs: ${api_spec_diff}\n"
     echo_line="You must have one RD (zhiqiu (Recommend) or phlrain) approval for the api change for the opreator-related api without 'core.ops'.\n"
     echo_line="${echo_line}For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/paddle_api_development_manual.md]\n"
-    echo_line="${echo_line}Related APIs: ${api_spec_diff}\n"
     check_approval 1 6888866 43953930
 fi
 
diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py
new file mode 100644
index 0000000000000000000000000000000000000000..970f89551c579b1554db8f878c63dace0d715930
--- /dev/null
+++ b/tools/get_pr_ut.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" For the PR that only modified the unit test, get cases in pull request. """
+
+import os
+import json
+from github import Github
+
+PADDLE_ROOT = os.getenv('PADDLE_ROOT', '/paddle/')
+
+
+class PRChecker(object):
+    """ PR Checker. """
+
+    def __init__(self):
+        self.github = Github(os.getenv('GITHUB_API_TOKEN'), timeout=60)
+        self.repo = self.github.get_repo('PaddlePaddle/Paddle')
+        self.pr = None
+
+    def init(self):
+        """ Get pull request. """
+        pr_id = os.getenv('GIT_PR_ID')
+        if not pr_id:
+            print('No PR ID')
+            exit(0)
+        self.pr = self.repo.get_pull(int(pr_id))
+
+    def get_pr_files(self):
+        """ Get files in pull request. """
+        page = 0
+        file_list = []
+        while True:
+            files = self.pr.get_files().get_page(page)
+            if not files:
+                break
+            for f in files:
+                file_list.append(PADDLE_ROOT + f.filename)
+            page += 1
+        return file_list
+
+    def get_pr_ut(self):
+        """ Get unit tests in pull request. """
+        ut_list = []
+        file_ut_map = None
+        cmd = 'wget -q --no-check-certificate https://sys-p0.bj.bcebos.com/prec/file_ut.json'
+        os.system(cmd)
+        with open('file_ut.json') as jsonfile:
+            file_ut_map = json.load(jsonfile)
+        for f in self.get_pr_files():
+            if f not in file_ut_map:
+                return ''
+            if f.endswith('.h') or f.endswith('.cu'):
+                return ''
+            else:
+                ut_list.extend(file_ut_map.get(f))
+        ut_list = list(set(ut_list))
+        return ' '.join(ut_list)
+
+
+if __name__ == '__main__':
+    pr_checker = PRChecker()
+    pr_checker.init()
+    print(pr_checker.get_pr_ut())