diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 8a927d8e282a03e8a74c0814ee8d9b247451a091..07fe7d245ef57814d704d11be6f6fe45cf514b2d 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -4,7 +4,7 @@ endif() INCLUDE(ExternalProject) SET(XPU_PROJECT "extern_xpu") -SET(XPU_URL "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE) +SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu.tar.gz" CACHE STRING "" FORCE) SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}") SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu") diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 21080fbe8fd2e14cf7fd805e01948f2f28535c22..7aa2766763ce9441b0e4de969930af50fb7a55e0 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -62,9 +62,9 @@ function(op_library TARGET) endif() endif() if(WITH_XPU) - string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc) - list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc) + string(REPLACE "_op" "_op_xpu" XPU_FILE "${TARGET}") + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU_FILE}.cc) + list(APPEND xpu_cc_srcs ${XPU_FILE}.cc) endif() endif() else() @@ -83,7 +83,7 @@ function(op_library TARGET) list(APPEND mkldnn_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cu.cc$") list(APPEND cu_cc_srcs ${src}) - elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$") + elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$") list(APPEND xpu_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cc$") list(APPEND cc_srcs ${src}) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index ffd32cc78f087504dde0e9f5b2b1f39a8bff557e..1eb2096af91dc99ac22b000d2de269bde2efcbbf 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -270,6 +270,10 @@ if(WITH_PSLIB) endif() endif(WITH_PSLIB) +if(NOT WIN32 AND NOT APPLE) + include(external/gloo) + list(APPEND third_party_deps extern_gloo) +endif() if(WITH_BOX_PS) include(external/box_ps) @@ -277,10 +281,6 @@ if(WITH_BOX_PS) endif(WITH_BOX_PS) if(WITH_DISTRIBUTE) - if(WITH_GLOO) - include(external/gloo) - list(APPEND third_party_deps extern_gloo) - endif() if(WITH_GRPC) list(APPEND third_party_deps extern_grpc) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 939a2fc8fc9c73472ff5c25633610fa70c7cec6d..78887f3ac5195893ca304ea97d5bf4218c5952f8 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -76,7 +76,7 @@ void AllReduceOpHandle::AllReduceImpl( platform::errors::InvalidArgument( "The NoDummyInputSize should be equal " "to the number of places, but got NoDummyInputSize is " - "%d and the number of place is %d.", + "%d and the number of places is %d.", in_var_handles.size(), num_places)); PADDLE_ENFORCE_EQ( in_var_handles.size(), out_var_handles.size(), @@ -89,7 +89,7 @@ void AllReduceOpHandle::AllReduceImpl( platform::errors::InvalidArgument( "The number of local scopes should be equal " "to the number of places, but got the number of local scopes is " - "%d and the number of place is %d.", + "%d and the number of places is %d.", in_var_handles.size(), num_places)); std::vector lod_tensor_data; diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 4c3b0a7c6a44ca0d304113e57ebb3be9e1a7de27..35b106606740556481cd98ce76955e953f7e0ee7 
100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/details/broadcast_op_handle.h" + #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/platform/profiler.h" @@ -31,10 +32,15 @@ void BroadcastOpHandle::RunImpl() { auto out_var_handles = DynamicCast(outputs_); PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL, - "The number of input should be one."); - PADDLE_ENFORCE_EQ( - out_var_handles.size(), places_.size(), - "The number of output should equal to the number of places."); + platform::errors::PreconditionNotMet( + "The number of inputs should be 1, but got %d.", + in_var_handles.size())); + PADDLE_ENFORCE_EQ(out_var_handles.size(), places_.size(), + platform::errors::PreconditionNotMet( + "The number of outputs and the number of places should " + "be equal, but got the number of outputs is %d and the " + "number of places is %d.", + out_var_handles.size(), places_.size())); VarHandle *in_var_handle = in_var_handles[0]; @@ -47,7 +53,9 @@ void BroadcastOpHandle::BroadcastOneVar( const std::vector &var_scopes) { auto *in_var = var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name()); - PADDLE_ENFORCE_NOT_NULL(in_var); + PADDLE_ENFORCE_NOT_NULL( + in_var, platform::errors::NotFound("Variable %s is not found in scopes.", + in_var_handle.name())); Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var); if (UNLIKELY(!in_tensor.IsInitialized())) { VLOG(3) << "in var " << in_var_handle.name() << "not inited, return!"; @@ -103,7 +111,7 @@ void BroadcastOpHandle::BroadcastOneVar( broadcast_calls.emplace_back( [send_recv_buffer, numel, type, root_id, &nccl_ctx] { - PADDLE_ENFORCE(platform::dynload::ncclBcast( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( send_recv_buffer, numel, static_cast(type), root_id, nccl_ctx.comm_, nccl_ctx.stream())); }); @@ -131,7 +139,8 @@ void BroadcastOpHandle::BroadcastOneVar( nccl_ctxs_->DevCtx(p)->Wait(); } #else - PADDLE_THROW("CUDA is not enabled."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with NCCL.")); #endif } } @@ -154,10 +163,13 @@ void BroadcastOpHandle::InitOutputValue( auto t_out_p = out_var_handle->place(); auto *out_var = var_scopes.at(out_var_handle->scope_idx()) ->FindVar(out_var_handle->name()); - PADDLE_ENFORCE_NOT_NULL(out_var); + PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound( + "Variable %s is not found in scopes.", + out_var_handle->name())); if (is_gpu_place(in_tensor.place())) { - PADDLE_ENFORCE(platform::is_gpu_place(t_out_p), - "Places of input and output must be all on GPU."); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true, + platform::errors::PreconditionNotMet( + "Places of input and output must be all on GPU.")); } else { t_out_p = platform::CPUPlace(); } diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index e455879a68f70b3f4f33fb5e6ede0fd9e9f22d5f..4fdc420e1e0752ac3122d25db5ed1423bb47c69e 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -79,7 +79,8 @@ struct TestBroadcastOpHandle { } nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_)); #else - PADDLE_THROW("CUDA is not support."); + PADDLE_THROW( + 
platform::errors::PreconditionNotMet("Not compiled with NCLL.")); #endif } else { int count = 8; @@ -113,7 +114,8 @@ struct TestBroadcastOpHandle { op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else - PADDLE_THROW("CUDA is not support."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with NCLL.")); #endif } else { #if defined(PADDLE_WITH_NCCL) @@ -171,7 +173,9 @@ struct TestBroadcastOpHandle { float val_scalar = 0.0) { auto var = param_scopes_[input_scope_idx]->FindVar(varname); - PADDLE_ENFORCE_NOT_NULL(var); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable %s is not found in scope.", + varname)); auto lod_tensor = var->GetMutable(); std::vector send_vector(static_cast(f::product(kDims))); for (size_t k = 0; k < send_vector.size(); ++k) { @@ -194,7 +198,9 @@ struct TestBroadcastOpHandle { } auto var = param_scopes_[input_scope_idx]->FindVar(varname); - PADDLE_ENFORCE_NOT_NULL(var); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable %s is not found in scope.", + varname)); auto selected_rows = var->GetMutable(); auto value = selected_rows->mutable_value(); value->mutable_data(kDims, place_list_[input_scope_idx]); @@ -211,13 +217,24 @@ struct TestBroadcastOpHandle { const std::vector& send_vector, const std::vector& rows, int height) { auto var = param_scopes_[input_scope_idx]->FindVar(varname); - PADDLE_ENFORCE_NOT_NULL(var); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable %s is not found in scope.", + varname)); auto& selected_rows = var->Get(); auto rt = selected_rows.value(); - PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal."); + PADDLE_ENFORCE_EQ(selected_rows.height(), height, + platform::errors::InvalidArgument( + "The height of SelectedRows is not equal to " + "the expected, expect %d, but got %ld.", + height, selected_rows.height())); for (size_t k = 0; k < selected_rows.rows().size(); ++k) { - PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]); + PADDLE_ENFORCE_EQ( + selected_rows.rows()[k], rows[k], + platform::errors::InvalidArgument( + "The item at position %zu of rows of SelectedRows " + "is not equal to the expected, expect %ld, but got %ld.", + k, rows[k], selected_rows.rows()[k])); } p::CPUPlace cpu_place; @@ -235,9 +252,15 @@ struct TestBroadcastOpHandle { framework::Scope* scope) { p::CPUPlace cpu_place; auto var = scope->FindVar(varname); - PADDLE_ENFORCE_NOT_NULL(var); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable %s is not found in scope.", + varname)); auto tensor = var->Get(); - PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal."); + PADDLE_ENFORCE_EQ(tensor.lod(), lod, + platform::errors::InvalidArgument( + "The LoD of tensor is not equal to " + "the expected, expect %s, but got %s.", + lod, tensor.lod())); f::Tensor result_tensor; f::TensorCopySync(tensor, cpu_place, &result_tensor); float* ct = result_tensor.mutable_data(cpu_place); diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index ecdb8cc9b8cdf38385f9bc6005bf6a889dfa7741..962f968c84ea46f0476feec70a5d3f53ba8b65ea 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -235,7 +235,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { AppendPass("reduce_mode_multi_devices_pass").get(); break; default: - PADDLE_THROW("Unknown reduce strategy."); + PADDLE_THROW( + 
platform::errors::Unimplemented("Unknown reduce strategy.")); } } multi_devices_pass->SetNotOwned("strategy", diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 7735f9720c109407249ea19f0e5e609a02cfd22e..266557cb8554ae310ae27046075cfcb35b05d9da 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" + #include #include #include -#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" @@ -47,15 +48,19 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( if (dynamic_cast(gc_)) { platform::CUDADeviceGuard guard( BOOST_GET_CONST(platform::CUDAPlace, place).device); - PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - PADDLE_ENFORCE_NOT_NULL(event_); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument( + "The cuda envet created is NULL.")); } } #endif - PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument( - "Variable names are empty.")); + PADDLE_ENFORCE_NE(vars.empty(), true, + platform::errors::InvalidArgument( + "The variables to be deleted are empty.")); for (auto *var : var_infos_) { - PADDLE_ENFORCE_NOT_NULL(var); + PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument( + "The memory optimization info is NULL.")); } } @@ -64,7 +69,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { if (event_) { auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace()); platform::CUDADeviceGuard guard(gpu_place.device); - PADDLE_ENFORCE(cudaEventDestroy(event_)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_)); } #endif } @@ -78,12 +83,17 @@ void EagerDeletionOpHandle::InitCUDA() { } void EagerDeletionOpHandle::CallOnce() { - PADDLE_ENFORCE(vars_.empty(), "vars_ must be initialized here"); + PADDLE_ENFORCE_EQ( + vars_.empty(), true, + platform::errors::InvalidArgument( + "The variables to be deleted should be initialized here.")); Scope *exec_scope = local_exec_scopes_[0]; for (auto *var_info : var_infos_) { auto *var = exec_scope->FindVar(var_info->Name()); - PADDLE_ENFORCE_NOT_NULL(var, "Variable %s should not be nullptr", - var_info->Name()); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + "The variable(%s) to be inplaced is not found in scope.", + var_info->Name())); vars_.emplace_back(var); } } @@ -119,8 +129,9 @@ void EagerDeletionOpHandle::RunImpl() { garbages.emplace_back(t.MoveMemoryHolder()); } } else { - PADDLE_THROW("Type %s of %s is not supported eager deletion", - framework::ToTypeName(var->Type()), var_info->Name()); + PADDLE_THROW(platform::errors::Unimplemented( + "The variable(%s) of type %s is not supported in eager deletion.", + framework::ToTypeName(var->Type()), var_info->Name())); } } @@ -137,8 +148,9 @@ void EagerDeletionOpHandle::ClearGarbages( auto callback_stream = reinterpret_cast(gc_)->stream(); auto callback_func = [=]() { - PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); - 
PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamWaitEvent(callback_stream, event_, 0)); }; gc_->Add(std::move(*garbages), callback_func); } else { diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index c67e21d5c4728110a800d4fd0367ee33441ce3c7..c538811669924aae2e33a6c18d7b1eb1ca9268cb 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h" + #include #include + #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" @@ -56,10 +58,20 @@ void FusedAllReduceOpHandle::RunImpl() { size_t place_num = places_.size(); PADDLE_ENFORCE_EQ( in_var_handles.size(), place_num * num_of_all_reduce_, - "The NoDummyInputSize should be equal to the number of places."); + platform::errors::PreconditionNotMet( + "The number of input variable handles should be equal to the number " + "of places multiplied by the number of all reduce handles, " + "but got the number of input variable handles is %d, the " + "number of places is %d, and the number of all reduce handles " + "is %d.", + in_var_handles.size(), place_num, num_of_all_reduce_)); PADDLE_ENFORCE_EQ( in_var_handles.size(), out_var_handles.size(), - "The NoDummyInputSize and NoDummyOutputSize should be equal."); + platform::errors::PreconditionNotMet( + "The number of input variable handles should be equal to the number " + "of output variable handles, but got the number of input variable " + "handles is %d, and the number of output variable handles is %d.", + in_var_handles.size(), out_var_handles.size())); // Note: some gradient op doesn't have CUDAKernel, so the gradients of // those op are in CPUPlace, in this case, the all reduce should not be fused. @@ -106,7 +118,13 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc( dtype = ele_dtype; } - PADDLE_ENFORCE_EQ(ele_dtype, dtype); + PADDLE_ENFORCE_EQ( + ele_dtype, dtype, + platform::errors::InvalidArgument( + "The DataType of grad tensors of fused_all_reduce_op_handle " + "must be consistent. The current dtype is %s, but the " + "previous dtype is %s.", + DataTypeToString(ele_dtype), DataTypeToString(dtype))); // Check whether the address space is contiguous. std::sort( @@ -130,16 +148,29 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc( "input[%d] address: 0X%02x. The offset: %d", k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k, next_address, k, infer_next_address, offset); - PADDLE_ENFORCE_EQ(infer_next_address, next_address, - "The address is not consistent."); + PADDLE_ENFORCE_EQ( + infer_next_address, next_address, + platform::errors::InvalidArgument( + "The inferred address of the next tensor should be equal to the " + "real address of the next tensor. 
But got inferred address is %p " + "and real address is %p.", + infer_next_address, next_address)); } } if (!FLAGS_skip_fused_all_reduce_check) { for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) { for (size_t j = 1; j < num_of_all_reduce_; ++j) { - PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first, - grads_tensor.at(scope_idx).at(j).first); + PADDLE_ENFORCE_EQ( + grads_tensor.at(0).at(j).first, + grads_tensor.at(scope_idx).at(j).first, + platform::errors::InvalidArgument( + "The variable name of grad tensors of " + "fused_all_reduce_op_handle " + "must be consistent. The current name is %s, but the " + "previous name is %s.", + grads_tensor.at(0).at(j).first, + grads_tensor.at(scope_idx).at(j).first)); } } } @@ -167,7 +198,9 @@ bool FusedAllReduceOpHandle::InputIsInDifferentPlace( for (size_t j = 0; j < in_var_handles.size(); j += place_num) { auto var_name = in_var_handles[j]->name(); auto var = local_scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + "The variable '%s' is not found in local scope.", var_name)); auto &lod_tensor = var->Get(); if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) { return true; @@ -185,14 +218,24 @@ void FusedAllReduceOpHandle::GetGradLoDTensor( size_t place_num = places_.size(); for (size_t j = 0; j < in_var_handles.size(); j += place_num) { auto var_name = in_var_handles[j]->name(); - PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name()); + PADDLE_ENFORCE_EQ( + var_name, out_var_handles[j]->name(), + platform::errors::InvalidArgument( + "The name of input variable should be equal " + "to the name of output variable. But got the name of input " + "variable is %s and the name of output variable is %s.", + var_name, out_var_handles[j]->name())); auto var = local_scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + "The variable '%s' is not found in local scope.", var_name)); auto &lod_tensor = var->Get(); PADDLE_ENFORCE_EQ( platform::is_same_place(lod_tensor.place(), places_.at(scope_idx)), - true, "%s(%d) is not in the right place.", var_name, scope_idx); + true, platform::errors::InvalidArgument( + "The variable '%s' at scope %d is not in the right place.", + var_name, scope_idx)); grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor)); } } @@ -204,16 +247,26 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel( size_t size_of_dtype = 0; for (size_t i = 0; i < grad_tensor.size(); ++i) { // Get dtype - auto ele_type = grad_tensor.at(i).second->type(); + auto ele_dtype = grad_tensor.at(i).second->type(); if (i == 0) { - *dtype = ele_type; - size_of_dtype = framework::SizeOfType(ele_type); + *dtype = ele_dtype; + size_of_dtype = framework::SizeOfType(ele_dtype); } - PADDLE_ENFORCE_EQ(ele_type, *dtype); + PADDLE_ENFORCE_EQ( + ele_dtype, *dtype, + platform::errors::InvalidArgument( + "The DataType of grad tensors of fused_all_reduce_op_handle " + "must be consistent. 
The current dtype is %s, but the " + "previous dtype is %s.", + DataTypeToString(ele_dtype), DataTypeToString(*dtype))); // Get element number int64_t len = grad_tensor.at(i).second->numel(); - PADDLE_ENFORCE_GT(len, 0); + PADDLE_ENFORCE_GT( + len, 0, platform::errors::InvalidArgument( + "The size of grad tensors of fused_all_reduce_op_handle " + "must be > 0, but got %d.", + len)); *numel += platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype; } } diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc index 59c5da0de8c114823a1cad3e6d65c92081b5a2b6..1ae09dcde9fc8e4a413f8876dd33d0ac53f181c3 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" + #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/platform/profiler.h" @@ -32,7 +33,15 @@ void FusedBroadcastOpHandle::RunImpl() { WaitInputVarGenerated(); size_t place_num = places_.size(); - PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size()); + PADDLE_ENFORCE_EQ( + in_var_handles.size() * place_num, out_var_handles.size(), + platform::errors::PreconditionNotMet( + "The number of input variable handles multiplied by the number " + "of places should be equal to the number of output variable handles, " + "but got the number of input variable handles is %d, the " + "number of places is %d, and the number of output variable handles " + "is %d.", + in_var_handles.size(), place_num, out_var_handles.size())); for (size_t i = 0; i < in_var_handles.size(); ++i) { BroadcastOneVar( diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc index 761a5b5a30a0e04690a7dc94752179130c85320a..ce7621d4e35a3f139047b543c4e77d805841e459 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" + #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/details/broadcast_op_handle_test.h" #include "paddle/fluid/framework/details/op_handle_base.h" @@ -58,7 +60,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { op_handle_ = new FusedBroadcastOpHandle( nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else - PADDLE_THROW("CUDA is not supported."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); #endif } else { #if defined(PADDLE_WITH_NCCL) diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc index a039c6200e394eebf6c44846ce2b0bf5d773e764..2d3b2fb39afbe7be680704e63e52d1951f3d8946 100644 --- a/paddle/fluid/framework/details/gather_op_handle.cc +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/gather_op_handle.h" + #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/variable_visitor.h" @@ -32,13 +33,20 @@ void GatherOpHandle::RunImpl() { PADDLE_ENFORCE_EQ( in_var_handles.size(), places_.size(), - "The number of output should equal to the number of places."); + platform::errors::InvalidArgument( + "The number of input variables should be equal " + "to the number of places, but got the number of input variables is " + "%d and the number of places is %d.", + in_var_handles.size(), places_.size())); VarHandle *out_var_handle; { auto out_var_handles = DynamicCast(this->Outputs()); - PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, - "The number of output should be one."); + PADDLE_ENFORCE_EQ( + out_var_handles.size(), 1, + platform::errors::InvalidArgument( + "The number of output variables should be 1, but got %d.", + out_var_handles.size())); out_var_handle = out_var_handles.front(); } @@ -47,10 +55,14 @@ void GatherOpHandle::RunImpl() { auto in_0_handle = in_var_handles[0]; auto pre_in_var = var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name()); - PADDLE_ENFORCE_NOT_NULL(pre_in_var); + PADDLE_ENFORCE_NOT_NULL( + pre_in_var, + platform::errors::NotFound("The variable '%s' is not found in the scope.", + in_0_handle->name())); - PADDLE_ENFORCE(pre_in_var->IsType(), - "Currently, gather_op only can gather SelectedRows."); + PADDLE_ENFORCE_EQ(pre_in_var->IsType(), true, + platform::errors::Unimplemented( + "Currently, gather_op only supports SelectedRows.")); // Wait input done, this Wait is asynchronous operation WaitInputVarGenerated(); @@ -63,7 +75,10 @@ void GatherOpHandle::RunImpl() { for (auto *in_handle : in_var_handles) { auto *in_var = var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name()); - PADDLE_ENFORCE_NOT_NULL(in_var); + PADDLE_ENFORCE_NOT_NULL( + in_var, + platform::errors::NotFound( + "The variable '%s' is not found in the scope.", in_handle->name())); VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var); auto &in_sr_value = in_var->Get(); @@ -76,15 +91,19 @@ void GatherOpHandle::RunImpl() { // NOTE: The Places of all input tensor must be all on CPU or all on GPU. platform::Place t_out_p = out_var_handle->place(); if (platform::is_gpu_place(pre_in_value.place())) { - PADDLE_ENFORCE(platform::is_gpu_place(t_out_p), - "Places of input and output must be all on GPU."); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true, + platform::errors::PreconditionNotMet( + "Places of input and output must be all on GPU.")); } else { t_out_p = platform::CPUPlace(); } auto out_var = var_scopes.at(out_var_handle->scope_idx()) ->FindVar(out_var_handle->name()); - PADDLE_ENFORCE_NOT_NULL(out_var); + PADDLE_ENFORCE_NOT_NULL( + out_var, + platform::errors::NotFound("The variable '%s' is not found in the scope.", + out_var_handle->name())); auto out_value = out_var->GetMutable(); out_value->set_height(pre_in_value.height()); out_value->set_rows(out_rows); diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index f3fcc1a436df38986e1202755cd88f14069028a8..60c1d0d39a551fb1cec523109e76c309d11ea248 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -13,8 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/gather_op_handle.h" + #include #include + #include "gtest/gtest.h" namespace paddle { @@ -60,7 +62,8 @@ struct TestGatherOpHandle { ctxs_.emplace_back(new p::CUDADeviceContext(p)); } #else - PADDLE_THROW("CUDA is not support."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); #endif } else { int count = 8; @@ -141,7 +144,9 @@ struct TestGatherOpHandle { for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); ++input_scope_idx) { auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input"); - PADDLE_ENFORCE_NOT_NULL(in_var); + PADDLE_ENFORCE_NOT_NULL( + in_var, platform::errors::NotFound( + "The variable '%s' is not found in the scope.", "input")); auto in_selected_rows = in_var->GetMutable(); auto value = in_selected_rows->mutable_value(); value->mutable_data(kDims, gpu_list_[input_scope_idx]); @@ -155,7 +160,9 @@ struct TestGatherOpHandle { } auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out"); - PADDLE_ENFORCE_NOT_NULL(out_var); + PADDLE_ENFORCE_NOT_NULL( + out_var, platform::errors::NotFound( + "The variable '%s' is not found in the scope.", "out")); auto out_selected_rows = out_var->GetMutable(); auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input"); @@ -173,9 +180,19 @@ struct TestGatherOpHandle { auto& out_select_rows = out_var->Get(); auto rt = out_select_rows.value(); - PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); + PADDLE_ENFORCE_EQ(out_select_rows.height(), height, + platform::errors::InvalidArgument( + "The height of SelectedRows is not equal to " + "the expected, expect %d, but got %d.", + height, out_select_rows.height())); + for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { - PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]); + PADDLE_ENFORCE_EQ( + out_select_rows.rows()[k], rows[k % rows.size()], + platform::errors::InvalidArgument( + "The item at position %d of rows of SelectedRows is not equal to " + "the expected, expect %d, but got %d.", + k, rows[k % rows.size()], out_select_rows.rows()[k])); } f::Tensor result_tensor; @@ -207,6 +224,7 @@ TEST(GatherTester, TestGPUGatherTestSelectedRows) { test_op.TestGatherSelectedRows(input_scope_idx); } #endif + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h index 2d4d4122a3c0f60e5e207556d20886985f72a30a..22a059773f513f1a4a3aef0d3ca9b603fcd30bf8 100644 --- a/paddle/fluid/framework/details/nccl_op_handle.h +++ b/paddle/fluid/framework/details/nccl_op_handle.h @@ -46,14 +46,17 @@ class NCCLOpHandleBase : public OpHandleBase { } virtual ~NCCLOpHandleBase() { for (auto& ev : inter_events_) { - PADDLE_ENFORCE(cudaEventDestroy(ev.second)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second)); } for (auto& ev : exter_events_) { - PADDLE_ENFORCE(cudaEventDestroy(ev.second)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second)); } } void SetRunEnv(int run_order, bool use_hierarchical_allreduce) { - PADDLE_ENFORCE(run_order >= 0, "run_order must >= 0"); + PADDLE_ENFORCE_GE( + run_order, 0, + platform::errors::InvalidArgument( + "The argument run_order must be >= 0, but got %d.", run_order)); run_order_ = run_order; use_hierarchical_allreduce_ = use_hierarchical_allreduce; @@ -74,8 +77,11 @@ class NCCLOpHandleBase : public OpHandleBase { return; } - PADDLE_ENFORCE(places_.size() == 1, - "HierarchicalAllReduce run one proc 
with one card mode."); + PADDLE_ENFORCE_EQ(places_.size(), 1, + platform::errors::InvalidArgument( + "HierarchicalAllReduce can only run " + "one proccess with one card mode, but got %d cards.", + places_.size())); for (auto& p : places_) { auto ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order); @@ -88,11 +94,11 @@ class NCCLOpHandleBase : public OpHandleBase { continue; } - PADDLE_ENFORCE(cudaSetDevice(dev_id)); - PADDLE_ENFORCE(cudaEventCreateWithFlags(&inter_events_[dev_id], - cudaEventDisableTiming)); - PADDLE_ENFORCE(cudaEventCreateWithFlags(&exter_events_[dev_id], - cudaEventDisableTiming)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags( + &inter_events_[dev_id], cudaEventDisableTiming)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags( + &exter_events_[dev_id], cudaEventDisableTiming)); VLOG(10) << "Create events on dev_id:" << dev_id << ", inter_event:" << &inter_events_[dev_id] << ", exter_event:" << &exter_events_[dev_id]; @@ -102,7 +108,10 @@ class NCCLOpHandleBase : public OpHandleBase { void FlatNCCLAllReduce(platform::Place place, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op) { - PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0"); + PADDLE_ENFORCE_GE( + run_order_, 0, + platform::errors::InvalidArgument( + "The argument run_order_ must be >= 0, but got %d.", run_order_)); auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_); int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; auto& nccl_ctx = flat_nccl_ctxs->at(dev_id); @@ -113,14 +122,17 @@ class NCCLOpHandleBase : public OpHandleBase { << ", dev_id:" << dev_id << ", dtype:" << datatype << ", place:" << place; - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); } void NCCLAllReduce(platform::Place place, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op) { - PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0"); + PADDLE_ENFORCE_GE( + run_order_, 0, + platform::errors::InvalidArgument( + "The argument run_order_ must be >= 0, but got %d.", run_order_)); if (!use_hierarchical_allreduce_) { FlatNCCLAllReduce(place, sendbuff, recvbuff, count, datatype, op); return; @@ -132,7 +144,10 @@ class NCCLOpHandleBase : public OpHandleBase { void HierarchicalAllReduce(platform::Place place, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op) { - PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0"); + PADDLE_ENFORCE_GE( + run_order_, 0, + platform::errors::InvalidArgument( + "The argument run_order_ must be >= 0, but got %d.", run_order_)); InterReduce(place, sendbuff, recvbuff, count, datatype, op); // When a trainer is not in exter allreduce ring // they need not to call this. 
@@ -157,14 +172,13 @@ class NCCLOpHandleBase : public OpHandleBase { << ", dtype:" << datatype << ", place:" << place << ", stream:" << stream; - PADDLE_ENFORCE(platform::dynload::ncclReduce( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce( sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream)); cudaEventRecord(inter_events_.at(dev_id), stream); if (FLAGS_sync_nccl_allreduce) { - PADDLE_ENFORCE(cudaStreamSynchronize(stream), - "sync HierarchicalAllReduce inter stream error"); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); } } @@ -172,7 +186,9 @@ class NCCLOpHandleBase : public OpHandleBase { void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op) { auto nccl_ctxs = nccl_ctxs_->GetHierarchicalExterCtx(run_order_); - PADDLE_ENFORCE(nccl_ctxs_, "can't get exter %d nccl_ctxs", run_order_); + PADDLE_ENFORCE_NOT_NULL( + nccl_ctxs_, platform::errors::NotFound( + "Can't get exter %d nccl contexts.", run_order_)); int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; auto& nccl_ctx = nccl_ctxs->at(dev_id); auto stream = nccl_ctx.stream(); @@ -185,14 +201,13 @@ class NCCLOpHandleBase : public OpHandleBase { cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0); - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); cudaEventRecord(exter_events_.at(dev_id), stream); if (FLAGS_sync_nccl_allreduce) { - PADDLE_ENFORCE(cudaStreamSynchronize(stream), - "sync HierarchicalAllReduce exter stream error"); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); } } @@ -210,8 +225,8 @@ class NCCLOpHandleBase : public OpHandleBase { << ", stream:" << stream; cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0); - PADDLE_ENFORCE(platform::dynload::ncclBcast(sendbuff, count, datatype, 0, - comm, stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + sendbuff, count, datatype, 0, comm, stream)); } protected: diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 459bcff5c0b740be0d495a6ad648da7424bd1a42..105c37192f57c365abc1429afa7e6627b95eef90 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -47,8 +47,8 @@ void OpHandleBase::InitCUDA() { #ifdef PADDLE_WITH_CUDA for (auto &p : dev_ctxes_) { int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device; - PADDLE_ENFORCE(cudaSetDevice(dev_id)); - PADDLE_ENFORCE( + PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id)); + PADDLE_ENFORCE_CUDA_SUCCESS( cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); } if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) { @@ -62,17 +62,22 @@ void OpHandleBase::InitCUDA() { } } } else { - PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL, - "%s should have only one dev_ctx.", Name()); + PADDLE_ENFORCE_EQ( + dev_ctxes_.size(), 1UL, + platform::errors::InvalidArgument( + "Operator %s should have only one dev_ctx, but got %d.", Name(), + dev_ctxes_.size())); auto &place = dev_ctxes_.begin()->first; int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; for (auto &out_var : outputs_) { auto *out_var_handle = dynamic_cast(out_var); if (out_var_handle) { - PADDLE_ENFORCE(platform::is_same_place(place, out_var_handle->place()), - "The place of output(%s) is not consistent with the " - "place of current op(%s).", - out_var_handle->Name(), Name()); + PADDLE_ENFORCE_EQ( + 
platform::is_same_place(place, out_var_handle->place()), true, + platform::errors::InvalidArgument( + "The place of output(%s) is not consistent with the " + "place of current op(%s).", + out_var_handle->Name(), Name())); out_var_handle->SetGenerateEvent(events_.at(dev_id)); } } @@ -86,7 +91,10 @@ void OpHandleBase::Run(bool use_cuda) { InitCUDA(); } #else - PADDLE_ENFORCE(!use_cuda); + PADDLE_ENFORCE_EQ(use_cuda, false, + platform::errors::InvalidArgument( + "Argument use_cuda should be false when Paddle is not " + "compiled with CUDA.")); #endif // skip running current op, used with inplace_addto_op_pass @@ -100,17 +108,20 @@ void OpHandleBase::Run(bool use_cuda) { void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_NOT_NULL(waited_ctx); + PADDLE_ENFORCE_NOT_NULL(waited_ctx, platform::errors::InvalidArgument( + "Argument waited_ctx is NULL.")); if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) { for (auto &dev_ctx : dev_ctxes_) { - PADDLE_ENFORCE_NOT_NULL(dev_ctx.second); + PADDLE_ENFORCE_NOT_NULL( + dev_ctx.second, + platform::errors::InvalidArgument("The device context is NULL.")); dev_ctx.second->Wait(); } } else { auto stream = static_cast(waited_ctx)->stream(); for (auto &ev : events_) { - PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0)); } } #else @@ -145,10 +156,11 @@ void OpHandleBase::WaitInputVarGenerated() { auto stream = static_cast(dev_ctxes_.at(place)) ->stream(); - PADDLE_ENFORCE( + PADDLE_ENFORCE_CUDA_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else - PADDLE_THROW("Doesn't compile the GPU."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); #endif } // There are nothing to do when the place is CPUPlace. @@ -169,10 +181,11 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { auto stream = static_cast( dev_ctxes_.at(in_var_handle->place())) ->stream(); - PADDLE_ENFORCE( + PADDLE_ENFORCE_CUDA_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else - PADDLE_THROW("Doesn't compile the GPU."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); #endif } // There are nothing to do when the place is CPUPlace. @@ -242,7 +255,9 @@ void OpHandleBase::SetLocalExecScopes( auto scopes = GetLocalScopes(); for (auto *scope : scopes) { auto iter = scope_map.find(scope); - PADDLE_ENFORCE(iter != scope_map.end(), "Local scope not found"); + PADDLE_ENFORCE_NE( + iter, scope_map.end(), + platform::errors::NotFound("Local scope not found in scope map.")); local_exec_scopes_.emplace_back(iter->second); } } diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index 1e608000e0ac93bb9be3308a89362c321d6a11f9..453a25166b56e9755528fa17f203557680ebabe6 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/grad_op_desc_maker.h" #include "paddle/fluid/framework/inplace_op_inference.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" @@ -186,19 +187,20 @@ struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { PADDLE_ENFORCE_EQ(info->proto_, nullptr, platform::errors::AlreadyExists( - "OpProto of %s has been registered", op_type)); + "OpProto of %s has been registered.", op_type)); PADDLE_ENFORCE_EQ(info->checker_, nullptr, platform::errors::AlreadyExists( - "OpAttrChecker of %s has been registered", op_type)); + "OpAttrChecker of %s has been registered.", op_type)); info->proto_ = new proto::OpProto; info->checker_ = new OpAttrChecker(); T maker; maker(info->proto_, info->checker_); info->proto_->set_type(op_type); - PADDLE_ENFORCE( - info->proto_->IsInitialized(), - "Fail to initialize %s's OpProto, because %s is not initialized", - op_type, info->proto_->InitializationErrorString()); + PADDLE_ENFORCE_EQ( + info->proto_->IsInitialized(), true, + platform::errors::PreconditionNotMet( + "Failed to initialize %s's OpProto, because %s is not initialized.", + op_type, info->proto_->InitializationErrorString())); } }; diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index 11c4621fde394057144462bb513aab63187512e3..9ecb2d8dbdd1c26e827961cbc9b569d92992c8e3 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows.h" @@ -32,9 +33,13 @@ struct ReduceLoDTensor { template void apply() const { - PADDLE_ENFORCE(!src_tensors_.empty()); + PADDLE_ENFORCE_NE(src_tensors_.empty(), true, + platform::errors::InvalidArgument( + "The number of tensors to be reduced is 0.")); auto &t0 = *src_tensors_[0]; - PADDLE_ENFORCE_NE(t0.numel(), 0); + PADDLE_ENFORCE_NE(t0.numel(), 0, + platform::errors::InvalidArgument( + "The size of first tensor to be reduced is 0.")); dst_tensor_.Resize(t0.dims()); T *dst = dst_tensor_.mutable_data(platform::CPUPlace()); @@ -45,8 +50,19 @@ struct ReduceLoDTensor { continue; } - PADDLE_ENFORCE_EQ(t.dims(), t0.dims()); - PADDLE_ENFORCE_EQ(t.type(), t0.type()); + PADDLE_ENFORCE_EQ(t.dims(), t0.dims(), + platform::errors::InvalidArgument( + "The shape of tensors to be reduced must be " + "consistent. The shape of current tensor is %s, " + "but the shape of the first tensor is %s.", + t.dims(), t0.dims())); + + PADDLE_ENFORCE_EQ(t.type(), t0.type(), + platform::errors::InvalidArgument( + "The type of tensors to be reduced must be " + "consistent. 
The type of current tensor is %s, " + "but the type of the first tensor is %s.", + t.type(), t0.type())); std::transform(t.data(), t.data() + t.numel(), dst, dst, [](T a, T b) -> T { return a + b; }); } @@ -88,7 +104,9 @@ struct GatherLocalSelectedRowsFunctor { in_places_(in_places), out_place_(out_place), dst_selected_rows_(dst_selected_rows) { - PADDLE_ENFORCE_EQ(src_selected_rows.empty(), false); + PADDLE_ENFORCE_NE(src_selected_rows.empty(), true, + platform::errors::InvalidArgument( + "The number of selected_rows to be gathered is 0.")); std::vector out_rows; diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index d8f8cc994c080953c92adc02943bc6828aa645a6..d7f13f79f68ebe629cc67d5e6c868d331d3c917c 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/details/reduce_op_handle.h" + #include + #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" @@ -116,8 +118,15 @@ void ReduceOpHandle::GatherSelectedRows( merged_dev_ctx->Wait(); scope->EraseVars(std::vector{gathered_var_name}); - PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope)); - PADDLE_ENFORCE(remote.size() == vars.size()); + PADDLE_ENFORCE_EQ( + client->Gather(vars, &remote, *merged_dev_ctx, scope), true, + platform::errors::PreconditionNotMet("Gather SelectedRows failed.")); + PADDLE_ENFORCE_EQ(remote.size(), vars.size(), + platform::errors::PreconditionNotMet( + "The number of remotes should be equal to the number " + "of variables to be gathered, but got the number of " + "remotes is %d and the number of variables is %d.", + remote.size(), vars.size())); // 4. merged local selected rows. std::vector all; @@ -151,14 +160,19 @@ void ReduceOpHandle::RunImpl() { PADDLE_ENFORCE_EQ( in_var_handles.size(), places_.size(), - "The number of output should equal to the number of places."); + platform::errors::InvalidArgument( + "The number of inputs should be equal to the number of places, but " + "got the number of inputs is %d and the number of places is %d.", + in_var_handles.size(), places_.size())); VarHandle *out_var_handle; { auto out_var_handles = DynamicCast(outputs_); PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL, - "The number of output should be one."); + platform::errors::InvalidArgument( + "The number of outputs should be one, but got %d.", + out_var_handles.size())); out_var_handle = out_var_handles.front(); } @@ -168,7 +182,10 @@ void ReduceOpHandle::RunImpl() { auto pre_in_var = var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name()); - PADDLE_ENFORCE_NOT_NULL(pre_in_var); + + PADDLE_ENFORCE_NOT_NULL(pre_in_var, platform::errors::NotFound( + "Variable %s is not found in scope.", + in_0_handle->name())); // NOTE: The Places of all input tensor must be all on CPU or all on GPU. 
std::vector in_places; // used to get dev_ctx @@ -176,21 +193,29 @@ void ReduceOpHandle::RunImpl() { in_places.emplace_back(in_handle->place()); auto in_var = var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name()); - PADDLE_ENFORCE_NOT_NULL(in_var); + + PADDLE_ENFORCE_NOT_NULL( + in_var, platform::errors::NotFound("Variable %s is not found in scope.", + in_handle->name())); + VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var); } auto out_var = var_scopes.at(out_var_handle->scope_idx()) ->FindVar(out_var_handle->name()); - PADDLE_ENFORCE_NOT_NULL(out_var); + + PADDLE_ENFORCE_NOT_NULL( + out_var, platform::errors::NotFound("Variable %s is not found in scope.", + out_var_handle->name())); // NOTE: The tensors' Place of input and output must be all on GPU or all on // CPU. auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place(); platform::Place t_out_p; if (platform::is_gpu_place(in_p)) { - PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place()), - "Places of input and output must be all on GPU."); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_var_handle->place()), true, + platform::errors::PreconditionNotMet( + "Places of input and output must be all on GPU.")); t_out_p = out_var_handle->place(); } else { t_out_p = platform::CPUPlace(); @@ -229,7 +254,10 @@ void ReduceOpHandle::RunImpl() { in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, out_var->GetMutable()); } else { - PADDLE_THROW("only support double or float when gather SelectedRows"); + PADDLE_THROW(platform::errors::Unimplemented( + "Only support double or float when gathering SelectedRows, but got " + "%s.", + framework::DataTypeToString(in_selected_rows[0]->value().type()))); } #endif }); @@ -292,7 +320,7 @@ void ReduceOpHandle::RunImpl() { size_t numel = static_cast(lod_tensor.numel()); all_reduce_calls.emplace_back( [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] { - PADDLE_ENFORCE(platform::dynload::ncclReduce( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce( buffer, recvbuffer, numel, static_cast(type), ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream())); }); @@ -306,10 +334,13 @@ void ReduceOpHandle::RunImpl() { } }); #else - PADDLE_THROW("CUDA is not enabled."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); #endif } else { - PADDLE_THROW("Place should be CPUPlace or CUDAPlace."); + PADDLE_THROW(platform::errors::InvalidArgument( + "The place of tensor should be CPUPlace or CUDAPlace, but got %s.", + lod_tensors[0]->place())); } } } diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index d71251b76c75b08e35c6b2ba3af2f8ab2e53308c..ba03c3a267aec821f83f70e694b79833989743c4 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/reduce_op_handle.h" + #include + #include "gtest/gtest.h" #include "paddle/fluid/platform/device_context.h" @@ -69,7 +71,8 @@ struct TestReduceOpHandle { } nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_)); #else - PADDLE_THROW("CUDA is not support."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with NCLL.")); #endif } else { int count = 8; @@ -103,7 +106,8 @@ struct TestReduceOpHandle { op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_, nccl_ctxs_.get())); #else - PADDLE_THROW("CUDA is not support."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with NCLL.")); #endif } else { #if defined(PADDLE_WITH_NCCL) @@ -164,7 +168,10 @@ struct TestReduceOpHandle { for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); ++input_scope_idx) { auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); - PADDLE_ENFORCE_NOT_NULL(in_var); + + PADDLE_ENFORCE_NOT_NULL( + in_var, platform::errors::NotFound( + "Variable %s is not found in scope.", "input")); auto in_selected_rows = in_var->GetMutable(); auto value = in_selected_rows->mutable_value(); value->mutable_data(kDims, gpu_list_[input_scope_idx]); @@ -178,7 +185,9 @@ struct TestReduceOpHandle { } auto out_var = param_scopes_[output_scope_idx]->FindVar("out"); - PADDLE_ENFORCE_NOT_NULL(out_var); + PADDLE_ENFORCE_NOT_NULL(out_var, + platform::errors::NotFound( + "Variable %s is not found in scope.", "out")); auto out_selected_rows = out_var->GetMutable(); auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); @@ -196,9 +205,18 @@ struct TestReduceOpHandle { auto &out_select_rows = out_var->Get(); auto rt = out_select_rows.value(); - PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); + PADDLE_ENFORCE_EQ(out_select_rows.height(), height, + platform::errors::InvalidArgument( + "The height of SelectedRows is not equal to " + "the expected, expect %d, but got %d.", + height, out_select_rows.height())); for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { - PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]); + PADDLE_ENFORCE_EQ( + out_select_rows.rows()[k], rows[k % rows.size()], + platform::errors::InvalidArgument( + "The item at position %d of rows of SelectedRows is not equal to " + "the expected, expect %d, but got %d.", + k, rows[k % rows.size()], out_select_rows.rows()[k])); } f::Tensor result_tensor; @@ -208,7 +226,7 @@ struct TestReduceOpHandle { for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) { ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5); } - } + } // namespace details void TestReduceLodTensors(size_t output_scope_idx) { std::vector send_vector(static_cast(f::product(kDims))); @@ -220,7 +238,9 @@ struct TestReduceOpHandle { for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); ++input_scope_idx) { auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); - PADDLE_ENFORCE_NOT_NULL(in_var); + PADDLE_ENFORCE_NOT_NULL( + in_var, platform::errors::NotFound( + "Variable %s is not found in scope.", "input")); auto in_lod_tensor = in_var->GetMutable(); in_lod_tensor->mutable_data(kDims, gpu_list_[input_scope_idx]); in_lod_tensor->set_lod(lod); @@ -230,7 +250,9 @@ struct TestReduceOpHandle { } auto out_var = param_scopes_[output_scope_idx]->FindVar("out"); - PADDLE_ENFORCE_NOT_NULL(out_var); + PADDLE_ENFORCE_NOT_NULL(out_var, + platform::errors::NotFound( + "Variable %s is not found in scope.", "out")); 
auto out_lodtensor = out_var->GetMutable(); auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); @@ -254,7 +276,7 @@ struct TestReduceOpHandle { ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5); } } }; TEST(ReduceTester, TestCPUReduceTestSelectedRows) { TestReduceOpHandle test_op; diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc index bf93d8f85b16cbe47373c1982c4eff2d678158c8..079e9abc895ca560c2f607b225dc6f6587b75dfb 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc @@ -111,13 +111,12 @@ void ShareTensorBufferFunctor::CallOnce() { auto *out_var = exec_scope_->FindVar(out_var_names_[i]); PADDLE_ENFORCE_NOT_NULL( in_var, platform::errors::NotFound( - "The input variable(%s)to be inplaced should not be NULL.", + "The variable(%s) to be inplaced is not found in scope.", in_var_infos_[i]->Name())); PADDLE_ENFORCE_NOT_NULL( - out_var, - platform::errors::NotFound( - "The output variable(%s) to be inplaced should not be NULL.", - out_var_names_[i])); + out_var, platform::errors::NotFound( + "The variable(%s) to be inplaced is not found in scope.", + out_var_names_[i])); PADDLE_ENFORCE_NE( in_var, out_var, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index 3f9af1c3a1289cc3453d850a2ffa9f78900f3367..37399e5ddc09d9c8237319682e80d352d658411d 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h" + #include #include + #include "dgc/dgc.h" #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" @@ -38,18 +40,23 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle( is_encoded_(is_encoded), nranks_(nranks) { // TODO(gongwb) :polish them! 
- PADDLE_ENFORCE_EQ(is_encoded, true); + PADDLE_ENFORCE_EQ(is_encoded, true, platform::errors::InvalidArgument( + "The argument is_encoded is false.")); VLOG(1) << "Use dgc allreduce mode" << ", nranks:" << nranks_; - PADDLE_ENFORCE_GT(local_scopes_.size(), 0); + PADDLE_ENFORCE_GT(local_scopes_.size(), 0, + platform::errors::PreconditionNotMet( + "The number of local scopes should be > 0, but got %zu.", + local_scopes_.size())); auto nranks_name = g_dgc_nranks; for (size_t i = 0; i < local_scopes_.size(); ++i) { auto *local_scope = local_scopes_[i]; auto nranks_var = local_scope->FindVar(nranks_name); - if (nranks_var == nullptr) { - PADDLE_THROW("not find nranks_var:%s", nranks_name); - } + + PADDLE_ENFORCE_NOT_NULL( + nranks_var, platform::errors::NotFound( + "Variable %s is not found in scope.", nranks_name)); float *dgc_nranks = nranks_var->GetMutable()->data(); *dgc_nranks = nranks; @@ -64,10 +71,18 @@ void SparseAllReduceOpHandle::RunImplEncoded() { auto out_var_handles = DynamicCast(this->Outputs()); PADDLE_ENFORCE_EQ( in_var_handles.size(), places_.size(), - "The NoDummyInputSize should be equal to the number of places."); + platform::errors::PreconditionNotMet( + "The number of input variables should be equal to the number of " + "places, but got the number of input variables is %zu and the " + "number of places is %zu.", + in_var_handles.size(), places_.size())); PADDLE_ENFORCE_EQ( in_var_handles.size(), out_var_handles.size(), - "The NoDummyInputSize and NoDummyOutputSize should be equal."); + platform::errors::PreconditionNotMet( + "The number of input variables should be equal to the number of " + "output variables, but got the number of input variables is %zu and " + "the number of output variables is %zu.", + in_var_handles.size(), out_var_handles.size())); std::vector ins; std::vector gathers; @@ -80,14 +95,17 @@ void SparseAllReduceOpHandle::RunImplEncoded() { auto encode_var_name = original_name + g_dgc_encoded; auto *in_var = local_scope->FindVar(encode_var_name); - PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name); + PADDLE_ENFORCE_NOT_NULL( + in_var, platform::errors::NotFound("Variable %s is not found in scope.", + encode_var_name)); auto &in = in_var->Get(); ins.emplace_back(&in); auto gather_var_name = original_name + g_dgc_gather; auto *gather_var = local_scope->FindVar(gather_var_name); - PADDLE_ENFORCE_NOT_NULL(gather_var, "%s should not be null", - gather_var_name); + PADDLE_ENFORCE_NOT_NULL( + gather_var, platform::errors::NotFound( + "Variable %s is not found in scope.", gather_var_name)); auto *gather = gather_var->GetMutable(); gathers.emplace_back(gather); @@ -100,14 +118,26 @@ void SparseAllReduceOpHandle::RunImplEncoded() { } } - PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place())); - PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place())); - PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ins[0]->place()), true, + platform::errors::InvalidArgument( + "The place of input variable should be CUDAPlace, but got %s.", + ins[0]->place())); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(outs[0]->place()), true, + platform::errors::InvalidArgument( + "The place of output variable should be CUDAPlace, but got %s.", + outs[0]->place())); + PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, platform::errors::PreconditionNotMet( + "The nccl contexts are NULL.")); int dtype = -1; size_t in_numel = 0; size_t out_numel = 0; - PADDLE_ENFORCE(nranks_ > 1); + PADDLE_ENFORCE_GT( + nranks_, 1, + 
platform::errors::PreconditionNotMet( + "The number of ranks should be > 1, but got %d.", nranks_)); std::vector> all_gather_calls; std::vector> sparse_reduce_calls; @@ -123,8 +153,16 @@ void SparseAllReduceOpHandle::RunImplEncoded() { dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype; in_numel = (in_numel == 0) ? static_cast(in.numel()) : in_numel; - PADDLE_ENFORCE(in_numel % 2 == 0); - PADDLE_ENFORCE(in_numel / 2 == static_cast(k)); + PADDLE_ENFORCE_EQ(in_numel % 2, 0, + platform::errors::InvalidArgument( + "The number of elements of input variable should be " + "even, but got %zu.", + in_numel)); + PADDLE_ENFORCE_EQ(in_numel / 2, static_cast(k), + platform::errors::InvalidArgument( + "The number of elements of input variable should be " + "2 * k, but got %zu.", + in_numel)); out_numel = (out_numel == 0) ? static_cast(out.numel()) : out_numel; int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; @@ -154,7 +192,8 @@ void SparseAllReduceOpHandle::RunImplEncoded() { PADDLE_ENFORCE_EQ(paddle::communication::dgc::sparseReduce( gather_buff, k, out_tensor_buf, static_cast(out_numel), nranks_, stream), - true); + true, platform::errors::Unavailable( + "Calling sparseReduce() failed.")); }); } @@ -187,11 +226,16 @@ void SparseAllReduceOpHandle::SparseAllReduceFunc( int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) { auto original_name = paddle::framework::GradOriginalVarName(grad_name); auto var_name = original_name + g_dgc_k; - PADDLE_ENFORCE(local_scopes_.size() > 0); + PADDLE_ENFORCE_GT(local_scopes_.size(), 0, + platform::errors::PreconditionNotMet( + "The number of local scopes should be > 0, but got %zu.", + local_scopes_.size())); auto *scope = local_exec_scopes_[0]; auto var = scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable %s is not found in scope.", + var_name)); auto tensor = var->Get().data(); return *tensor; } @@ -202,15 +246,22 @@ bool SparseAllReduceOpHandle::IsEncoded() { } auto counter_name = g_dgc_counter_name; auto step_name = g_dgc_rampup_begin_step; - PADDLE_ENFORCE(local_scopes_.size() > 0); + + PADDLE_ENFORCE_GT(local_scopes_.size(), 0, + platform::errors::PreconditionNotMet( + "The number of local scopes should be > 0, but got %zu.", + local_scopes_.size())); auto *local_scope = local_exec_scopes_[0]; auto count_var = local_scope->FindVar(counter_name); auto step_var = local_scope->FindVar(step_name); - if (count_var == nullptr || step_var == nullptr) { - PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name, - step_var); - } + + PADDLE_ENFORCE_NOT_NULL( + count_var, platform::errors::NotFound( + "Variable %s is not found in scope.", counter_name)); + PADDLE_ENFORCE_NOT_NULL( + step_var, platform::errors::NotFound("Variable %s is not found in scope.", + step_name)); float count = *count_var->Get().data(); float step = *step_var->Get().data(); diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index ee2ef9a0c3d3595a26c248e63a9e19af784f8bf7..f6f3098613ba194bea90a36efc3153cf63d2db5b 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -74,7 +74,9 @@ class PullDenseWorker { virtual void Initialize(const TrainerDesc& param); #ifdef PADDLE_WITH_CUDA void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); } +#endif +#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU) void AddPlace(const paddle::platform::Place place) {
places_.push_back(place); } @@ -135,9 +137,9 @@ class PullDenseWorker { #ifdef PADDLE_WITH_CUDA std::vector copy_streams_; +#endif std::vector places_; std::vector thread_scopes_; -#endif }; // should incorporate different type of device @@ -161,6 +163,7 @@ class DeviceWorker { virtual void SetDataFeed(DataFeed* data_feed); virtual void SetWorkerNum(int num) {} virtual void CacheProgram(const ProgramDesc& main_program) {} + virtual void GetXpuOpIndex() {} virtual void SetNeedDumpField(bool need_dump_field) { need_dump_field_ = need_dump_field; } diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index c9ae5a67950c8d305cedf23654a56d4f48d8dcba..21e28d7ac86d06571e49a522db13c17c0ebf33be 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -97,6 +97,7 @@ message AsyncConfig { optional int32 thread_pool_size = 6 [ default = 1 ]; optional int32 send_wait_times = 7 [ default = 1 ]; optional bool runtime_split_send_recv = 8 [ default = false ]; + optional bool launch_barrier = 9 [ default = true ]; } message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; } diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 3c076805932d6489676bb9290468821c96931c3c..693073d1fc73a65bd17e34da864f9d8df019043c 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -745,7 +745,57 @@ void FleetWrapper::PushDenseVarsAsync( push_sparse_status->push_back(std::move(status)); } } +#endif + +#ifdef PADDLE_WITH_XPU +void FleetWrapper::PushDenseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, + std::vector<::std::future>* push_sparse_status, + float scale_datanorm, int batch_size, + const paddle::platform::Place& place) { +#ifdef PADDLE_WITH_PSLIB + std::vector regions; + for (auto& t : var_names) { + Variable* var = scope.FindVar(t); + LoDTensor* tensor = var->GetMutable(); + int count = tensor->numel(); + float* g_data = tensor->data(); + + Variable* pin_var = scope.FindVar(t + "pin"); + LoDTensor* pin_tensor = pin_var->GetMutable(); + float* pin_g = + pin_tensor->mutable_data(tensor->dims(), platform::CPUPlace()); + memory::Copy(platform::CPUPlace(), pin_g, + BOOST_GET_CONST(platform::XPUPlace, place), g_data, + sizeof(float) * count); + + float* g = pin_g; + if (scale_datanorm >= 0) { + if (t.find(".batch_size@GRAD") != std::string::npos || + t.find(".batch_sum@GRAD") != std::string::npos) { + Eigen::Map mat(g, 1, count); + float scale = 1.0 / batch_size; + mat *= scale; + } else if (t.find(".batch_square_sum@GRAD") != std::string::npos) { + VLOG(3) << "epsilon: " << scale_datanorm; + for (int i = 0; i < count; ++i) { + g[i] = (g[i] - batch_size * scale_datanorm) / batch_size + + batch_size * scale_datanorm; + } + } + } + paddle::ps::Region reg(g, count); + regions.emplace_back(std::move(reg)); + } + auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(), + regions.size(), table_id); + if (push_sparse_status) { + push_sparse_status->push_back(std::move(status)); + } +#endif +} #endif void FleetWrapper::PushDenseVarsAsync( const Scope& scope, const uint64_t table_id, diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index be87bdf1e755c63e55ba59d702f43f4aeba42130..ae86835f38df77a3a7661433501cdd2440553d17 100644 --- 
a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -160,6 +160,14 @@ class FleetWrapper { float scale_datanorm, int batch_size, const paddle::platform::Place& place, cudaStream_t stream, cudaEvent_t event); +#endif +#ifdef PADDLE_WITH_XPU + void PushDenseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, + std::vector<::std::future>* push_sparse_status, + float scale_datanorm, int batch_size, + const paddle::platform::Place& place); #endif void PushDenseVarsAsync( const Scope& scope, const uint64_t table_id, diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index 7a27b6a9d7a7b82e84438d101774591dd2e732af..8e232560ab6876995a735b6901a5459265f9cb05 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -113,30 +113,66 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope, if (platform::is_cpu_place(tensor->place())) { memcpy(data_ptr, tensor->data(), tensor->numel() * SizeOfType(tensor->type())); -#ifdef PADDLE_WITH_CUDA } else { +#ifdef PADDLE_WITH_CUDA memory::Copy(platform::CPUPlace(), data_ptr, BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(), tensor->numel() * SizeOfType(tensor->type()), nullptr); - } -#else - } #endif +#ifdef PADDLE_WITH_XPU + memory::Copy(platform::CPUPlace(), data_ptr, + BOOST_GET_CONST(platform::XPUPlace, tensor->place()), + tensor->data(), + tensor->numel() * SizeOfType(tensor->type())); +#endif + } } -// void HeterWrapper::DeSerializeToTensor(Scope* scope, -// const HeterRequest* request) { #ifdef PADDLE_WITH_CUDA void HeterWrapper::DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place, cudaStream_t stream) { + // const VariableMessage& req_var = request->vars(); + auto* var = scope->FindVar(req_var.varname()); + auto* tensor = var->GetMutable(); + + std::vector vec_dim; + for (auto& x : req_var.dims()) { + vec_dim.push_back(x); + } + tensor->Resize(make_ddim(vec_dim)); + + LoD lod; + for (int i = 0; i < req_var.lod_level(); ++i) { + framework::Vector v; + for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) { + v.push_back(req_var.lod(i).lod_data(j)); + } + lod.push_back(v); + } + tensor->set_lod(lod); + + void* tensor_data = + tensor->mutable_data(place, ToVarType(req_var.data_type())); + +#ifdef PADDLE_WITH_CUDA + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, + platform::CPUPlace(), req_var.data().data(), + tensor->numel() * SizeOfType(tensor->type()), stream); #else + memcpy(tensor_data, req_var.data().data(), + tensor->numel() * SizeOfType(tensor->type())); +#endif +} +#endif + +// void HeterWrapper::DeSerializeToTensor(Scope* scope, +// const HeterRequest* request) { void HeterWrapper::DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place) { -#endif // const VariableMessage& req_var = request->vars(); auto* var = scope->FindVar(req_var.varname()); auto* tensor = var->GetMutable(); @@ -160,10 +196,10 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, void* tensor_data = tensor->mutable_data(place, ToVarType(req_var.data_type())); -#ifdef PADDLE_WITH_CUDA - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, +#ifdef PADDLE_WITH_XPU + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place), tensor_data, platform::CPUPlace(), req_var.data().data(), - tensor->numel() * SizeOfType(tensor->type()), 
stream); + tensor->numel() * SizeOfType(tensor->type())); #else memcpy(tensor_data, req_var.data().data(), tensor->numel() * SizeOfType(tensor->type())); @@ -184,7 +220,8 @@ framework::proto::VarType::Type HeterWrapper::ToVarType( case VariableMessage::BOOL: return framework::proto::VarType::BOOL; // NOLINT default: - VLOG(0) << "Not support type " << type; + PADDLE_THROW(platform::errors::InvalidArgument( + "ToVarType:Unsupported type %d", type)); } } diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index fbed74800b4ccd10d67f3d6d8212eca73a566629..6bbbaacdde3b30a8956794b650e2ff7b1f503a59 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -12,9 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB) +#include +#include +#include +#include +#include "io/fs.h" +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/data_set.h" +#include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/framework/fleet/fleet_wrapper.h" +#include "paddle/fluid/framework/trainer.h" +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \ + (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" - +#endif namespace paddle { namespace framework { @@ -34,6 +46,7 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, int place_num = trainer_desc.worker_places_size(); for (int i = 0; i < place_num; ++i) { int num = trainer_desc.worker_places(i); +#ifdef PADDLE_WITH_CUDA platform::CUDAPlace place = platform::CUDAPlace(num); platform::CUDADeviceGuard guard(place.device); cudaStream_t stream; @@ -44,6 +57,11 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, PADDLE_ENFORCE_CUDA_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); events_.push_back(event); +#endif +#ifdef PADDLE_WITH_XPU + platform::XPUPlace place = platform::XPUPlace(num); + places_.push_back(place); +#endif } // thread_num_ = trainer_desc.thread_num(); // SetDataset(dataset); @@ -95,11 +113,17 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) { auto place = places_[num]; Scope* scope = place_scopes_[num]; +#ifdef PADDLE_WITH_CUDA auto stream = copy_streams_[num]; auto event = events_[num]; - auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; platform::CUDADeviceGuard guard(dev_id); +#endif + +#ifdef PADDLE_WITH_XPU + xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device); +#endif + auto& block = program.Block(0); for (auto& var : block.AllVars()) { if (var->Persistable()) { @@ -116,13 +140,28 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) { HeterMemCpy(thread_tensor, root_tensor, place, stream); \ } \ } while (0) + +#define HeterMemcpyXpuFunc(cpp_type, proto_type) \ + do { \ + if (root_tensor->type() == proto_type) { \ + HeterMemCpy(thread_tensor, root_tensor, place); \ + } \ + } while (0) +#ifdef PADDLE_WITH_CUDA _ForEachDataType_(HeterMemcpyFunc); +#endif +#ifdef PADDLE_WITH_XPU + _ForEachDataType_(HeterMemcpyXpuFunc); +#endif } } +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); cudaEventSynchronize(event); 
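The trainer changes here and below split device-specific work into parallel preprocessor branches rather than a runtime dispatch. A minimal sketch of the copy pattern, assuming hypothetical names dst_place, dst_ptr, src_ptr and num_bytes for the tensors involved:

// CUDA host-to-device copies are asynchronous on a stream; the XPU copy used
// in this PR takes no stream, so the two overloads differ in arity and the
// CUDA path must later synchronize (cudaEventRecord/cudaEventSynchronize).
#ifdef PADDLE_WITH_CUDA
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr,
             platform::CPUPlace(), src_ptr, num_bytes, stream);
#endif
#ifdef PADDLE_WITH_XPU
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
             platform::CPUPlace(), src_ptr, num_bytes);
#endif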
+#endif } +#ifdef PADDLE_WITH_CUDA template void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor, @@ -141,6 +180,27 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, root_ptr, sizeof(T) * root_tensor->numel(), stream); } } +#endif + +#ifdef PADDLE_WITH_XPU +template +void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, + LoDTensor* root_tensor, + const paddle::platform::Place& thread_place) { + T* thread_ptr = + thread_tensor->mutable_data(root_tensor->dims(), thread_place); + T* root_ptr = root_tensor->data(); + if (platform::is_cpu_place(root_tensor->place())) { + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr, + platform::CPUPlace(), root_ptr, + sizeof(T) * root_tensor->numel()); + } else { + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr, + BOOST_GET_CONST(platform::XPUPlace, root_tensor->place()), + root_ptr, sizeof(T) * root_tensor->numel()); + } +} +#endif void HeterXpuTrainer::DumpWork(int tid) {} @@ -171,13 +231,16 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) { CreateThreadParam(main_program, i); pull_dense_worker_->AddThreadScope(scope); pull_dense_worker_->AddPlace(places_[i]); +#ifdef PADDLE_WITH_CUDA pull_dense_worker_->AddStream(copy_streams_[i]); +#endif } - pull_dense_worker_->Start(); +#ifdef PADDLE_WITH_CUDA for (auto& stream : copy_streams_) { cudaStreamSynchronize(stream); } +#endif op_names_.clear(); for (auto& op_desc : block.AllOps()) { std::unique_ptr local_op = OpRegistry::CreateOp(*op_desc); @@ -220,10 +283,12 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) { OperatorBase* local_op_ptr = local_op.release(); (context->ops_).push_back(local_op_ptr); } +#ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; platform::CUDADeviceGuard guard(dev_id); PADDLE_ENFORCE_CUDA_SUCCESS( cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); +#endif object_pool_.Push(context); } } @@ -267,12 +332,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, } \ } while (0) _ForEachDataType_(MergeCallback); - if (platform::is_gpu_place(thread_tensor->place())) { + if (!platform::is_cpu_place(thread_tensor->place())) { +#ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device; platform::CUDADeviceGuard guard(dev_id); cudaMemset(thread_tensor->data(), 0, thread_tensor->numel() * SizeOfType(thread_tensor->type())); +#endif +#ifdef PADDLE_WITH_XPU + auto place = thread_tensor->place(); + xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device); + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + platform::DeviceContext* dev_ctx = pool.Get(place); + const platform::XPUDeviceContext* xpu_ctx = + reinterpret_cast(dev_ctx); + xpu::memset(xpu_ctx->x_context(), thread_tensor->data(), 0, + thread_tensor->numel() * SizeOfType(thread_tensor->type())); +#endif } else { memset(thread_tensor->data(), 0, thread_tensor->numel() * SizeOfType(thread_tensor->type())); @@ -281,12 +359,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, auto* merge_var = response->add_vars(); heter_ptr_->SerializeToReq(need_merge_var_names_[i], root_scope_, merge_var); - if (platform::is_gpu_place(root_tensor->place())) { + if (!platform::is_cpu_place(root_tensor->place())) { +#ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device; platform::CUDADeviceGuard 
guard(dev_id); cudaMemset(root_tensor->data(), 0, root_tensor->numel() * SizeOfType(root_tensor->type())); +#endif +#ifdef PADDLE_WITH_XPU + auto place = root_tensor->place(); + xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device); + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + platform::DeviceContext* dev_ctx = pool.Get(place); + const platform::XPUDeviceContext* xpu_ctx = + reinterpret_cast(dev_ctx); + xpu::memset(xpu_ctx->x_context(), root_tensor->data(), 0, + root_tensor->numel() * SizeOfType(root_tensor->type())); +#endif } else { memset(root_tensor->data(), 0, root_tensor->numel() * SizeOfType(root_tensor->type())); @@ -346,11 +437,12 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, OperatorBase* local_op_ptr = local_op.release(); (context->ops_).push_back(local_op_ptr); } - +#ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; platform::CUDADeviceGuard guard(dev_id); PADDLE_ENFORCE_CUDA_SUCCESS( cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); +#endif } context->Reset(); @@ -359,15 +451,22 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, auto deserial_timer = std::make_shared("xpu_service_deserial"); for (int i = 0; i < request->vars_size(); ++i) { +#ifdef PADDLE_WITH_CUDA heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place, copy_streams_[context->place_num_]); +#endif +#ifdef PADDLE_WITH_XPU + heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place); +#endif } +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS( cudaEventRecord(context->event_, copy_streams_[context->place_num_])); while (cudaEventQuery(context->event_) != cudaSuccess) { VLOG(3) << "wait for kernel"; bthread_yield(); } +#endif } { @@ -378,6 +477,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, op->Run(*(context->scope_), place); } } +#ifdef PADDLE_WITH_CUDA auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); PADDLE_ENFORCE_CUDA_SUCCESS( @@ -391,6 +491,10 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, bthread_yield(); } } +#endif +#ifdef PADDLE_WITH_XPU + xpu_wait(); +#endif for (int i = 0; i < trainer_desc_.xpu_send_list_size(); ++i) { const std::string& varname = trainer_desc_.xpu_send_list(i); @@ -407,11 +511,19 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, ++i) { uint64_t tid = static_cast(param_.program_config(0).push_dense_table_id(i)); +#ifdef PADDLE_WITH_CUDA fleet_ptr_->PushDenseVarsAsync( *(context->scope_), tid, dense_grad_names_[tid], &(context->push_dense_status_), scale_datanorm_, request->cur_batch(), places_[context->place_num_], copy_streams_[context->place_num_], context->event_); +#endif +#ifdef PADDLE_WITH_XPU + fleet_ptr_->PushDenseVarsAsync( + *(context->scope_), tid, dense_grad_names_[tid], + &(context->push_dense_status_), scale_datanorm_, request->cur_batch(), + places_[context->place_num_]); +#endif } for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); ++i) { @@ -453,7 +565,6 @@ void HeterXpuTrainer::Finalize() { pull_dense_worker_->Stop(); root_scope_->DropKids(); } - } // namespace framework } // namespace paddle #endif diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 9c1eaa99a3ca04ddbeecab639d5587d5509e3f00..96952e20c2158453df0d94c9e43c64bb6bb1e04f 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ 
b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1894,8 +1894,7 @@ PDNode *patterns::QuantizePlacement::operator()( PDNode *patterns::Bfloat16Placement::operator()( const std::unordered_set &bfloat16_enabled_op_types) { - std::unordered_set supported_op_types = - std::unordered_set(); + std::unordered_set supported_op_types{"conv2d"}; if (!bfloat16_enabled_op_types.empty()) { supported_op_types = bfloat16_enabled_op_types; } diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index c399c5d02eb19d08f929168de0804ecea18cde37..6aeef8a39b53342a1c7eb99ba0892bda29a8fbcd 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -62,13 +62,15 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { fleet_ptr_ = FleetWrapper::GetInstance(); #ifdef PADDLE_WITH_CUDA copy_streams_.clear(); +#endif +#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU) places_.clear(); thread_scopes_.clear(); #endif } void PullDenseWorker::CreatePinVar() { -#ifdef PADDLE_WITH_CUDA +#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_PSLIB) // for (auto& v : dense_value_names_) { // for (auto& name : v.second) { for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size(); @@ -83,8 +85,13 @@ void PullDenseWorker::CreatePinVar() { auto* ptr = root_scope_->Var(name + "pin"); InitializeVariable(ptr, proto::VarType::LOD_TENSOR); LoDTensor* pin_tensor = ptr->GetMutable(); +#ifdef PADDLE_WITH_CUDA pin_tensor->mutable_data(tensor->dims(), platform::CUDAPinnedPlace()); +#endif +#ifdef PADDLE_WITH_XPU + pin_tensor->mutable_data(tensor->dims(), platform::CPUPlace()); +#endif } } #endif @@ -107,7 +114,7 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { exit(-1); } status_vec->resize(0); -#ifdef PADDLE_WITH_CUDA +#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU) for (size_t i = 0; i < places_.size(); ++i) { // for (auto& v : dense_value_names_) { @@ -125,9 +132,16 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { Variable* var = thread_scopes_[i]->FindVar(name); LoDTensor* tensor = var->GetMutable(); float* w = tensor->data(); +#ifdef PADDLE_WITH_CUDA memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w, platform::CUDAPinnedPlace(), pin_w, sizeof(float) * tensor->numel(), copy_streams_[i]); +#endif +#ifdef PADDLE_WITH_XPU + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, places_[i]), w, + platform::CPUPlace(), pin_w, + sizeof(float) * tensor->numel()); +#endif } } } @@ -148,7 +162,7 @@ void PullDenseWorker::PullDense(bool force_update) { uint64_t tid = static_cast( dwp_param_.program_config(0).pull_dense_table_id(i)); if (force_update || CheckUpdateParam(tid)) { -#ifdef PADDLE_WITH_CUDA +#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU) VLOG(3) << "pull dense " << force_update << " " << tid; fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid], &pull_dense_status_, false); diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index d041ef48e2c04a8e6db6bee7fe1c762de89bc9eb..ecaec49aa461cd6134cb60b7af7adb50f1a94686 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -138,7 +138,8 @@ class DistMultiTrainer : public MultiTrainer { std::shared_ptr pull_dense_worker_; }; -#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB) +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \ + (defined PADDLE_WITH_PSLIB) class 
HeterServiceContext { public: HeterServiceContext() {} @@ -151,7 +152,9 @@ class HeterServiceContext { void Reset() { push_dense_status_.clear(); } int place_num_; Scope* scope_{nullptr}; +#ifdef PADDLE_WITH_CUDA cudaEvent_t event_; +#endif std::vector ops_; std::vector<::std::future> push_dense_status_; }; @@ -178,10 +181,18 @@ class HeterXpuTrainer : public TrainerBase { virtual void CacheProgram(const ProgramDesc& main_program) { new (&program_) ProgramDesc(main_program); } + virtual std::string GetDumpPath(int tid) { return ""; } + virtual void InitDumpEnv() {} template +#ifdef PADDLE_WITH_CUDA void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor, const paddle::platform::Place& thread_place, cudaStream_t stream); +#endif +#ifdef PADDLE_WITH_XPU + void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor, + const paddle::platform::Place& thread_place); +#endif void CreateThreadParam(const ProgramDesc& program, int num); template void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); @@ -207,9 +218,11 @@ class HeterXpuTrainer : public TrainerBase { std::vector op_names_; std::vector place_scopes_; BtObjectPool object_pool_; - std::vector copy_streams_; std::vector places_; +#ifdef PADDLE_WITH_CUDA + std::vector copy_streams_; std::vector events_; +#endif }; #endif diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index 15584620d86b62c05e5fef841fc26058e8610c21..cc92c50cc428a59905ce3864a0b89f591d1b2390 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -63,7 +63,8 @@ std::shared_ptr TrainerFactory::CreateTrainer( REGISTER_TRAINER_CLASS(MultiTrainer); REGISTER_TRAINER_CLASS(DistMultiTrainer); -#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB) +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \ + (defined PADDLE_WITH_PSLIB) REGISTER_TRAINER_CLASS(HeterXpuTrainer); #endif #if defined(PADDLE_WITH_NCCL) diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 7caeb4378ce3d1ca1d1557054642c9fa184bea39..07f1868b7fa29914b4d362cf2c71d9380ca446be 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/fluid/imperative/gradient_accumulator.h" + #include #include #include + #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows.h" @@ -136,9 +138,13 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { return; } - PADDLE_ENFORCE_EQ(dst_tensor->numel() == numel, true, - "dst_numel %d vs. 
src_numel %d", dst_tensor->numel(), - numel); + PADDLE_ENFORCE_EQ( + dst_tensor->numel(), numel, + platform::errors::PreconditionNotMet( + "The number of elements of source tensor and destination tensor " + "should be equal, but got the number of elements of source tensor is " + "%zu and the number of elements of destination tensor is %zu.", + numel, dst_tensor->numel())); auto data_type = src_tensor.type(); auto place = src_tensor.place(); diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.cc b/paddle/fluid/imperative/jit/program_desc_tracer.cc index 9f4cf713f7c19c0d8bebdaf023d512a4f593d8e4..59ff5b4eae4419274412160632ed78c02b298867 100644 --- a/paddle/fluid/imperative/jit/program_desc_tracer.cc +++ b/paddle/fluid/imperative/jit/program_desc_tracer.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/imperative/jit/program_desc_tracer.h" + #include #include @@ -203,7 +204,8 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc( void ProgramDescTracer::InsertVarIfNotExist( const std::shared_ptr &new_var, bool is_input) { - PADDLE_ENFORCE_NOT_NULL(new_var); + PADDLE_ENFORCE_NOT_NULL(new_var, platform::errors::InvalidArgument( + "The variable to insert is NULL.")); if (vars_.count(new_var) != 0) return; auto new_var_desc = new framework::VarDesc(""); @@ -220,7 +222,9 @@ void ProgramDescTracer::InsertVarIfNotExist( } const auto &inner_var = new_var->Var(); - PADDLE_ENFORCE_EQ(inner_var.IsInitialized(), true); + PADDLE_ENFORCE_EQ(inner_var.IsInitialized(), true, + platform::errors::InvalidArgument( + "The variable to insert is not initialized.")); if (inner_var.IsType()) { const auto &tensor = inner_var.Get(); new_var_desc->SetType(framework::proto::VarType::LOD_TENSOR); @@ -232,8 +236,9 @@ void ProgramDescTracer::InsertVarIfNotExist( new_var_desc->SetDataType(framework::proto::VarType::FP32); } } else { - PADDLE_THROW("Not support variable type %s", - framework::ToTypeName(inner_var.Type())); + PADDLE_THROW(platform::errors::InvalidArgument( + "Not support variable type %s.", + framework::ToTypeName(inner_var.Type()))); } } diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 115078e7eadfc03153fc95ef05a9d4bb6cd40369..c8fd31fcbffe680da36d03276ec0d4c1095030bc 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/imperative/nccl_context.h" + #include "paddle/fluid/platform/collective_helper.h" namespace paddle { @@ -21,8 +22,10 @@ namespace imperative { void NCCLParallelContext::RecvNCCLID(const std::string &ep, ncclUniqueId *nccl_id) { auto addr = paddle::string::Split(ep, ':'); - PADDLE_ENFORCE_EQ(addr.size(), 2UL, - "The endpoint should contain host and port: %s", ep); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); std::string host = addr[0]; int port = std::stoi(addr[1]); @@ -32,27 +35,41 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep, char buffer[1024] = {0}; int opt = 0; // creating socket fd - if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0) - PADDLE_THROW("create server fd failed"); - if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt))) - PADDLE_THROW("set socket opt failed"); + if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0) { + PADDLE_THROW( + platform::errors::Unavailable("Create server file descriptor failed.")); + } + + if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt))) { + PADDLE_THROW(platform::errors::Unavailable("Set socket options failed.")); + } address.sin_family = AF_INET; address.sin_addr.s_addr = INADDR_ANY; address.sin_port = htons(port); - if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) - PADDLE_THROW("binding failed on ep: %s", ep); + if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) { + PADDLE_THROW( + platform::errors::Unavailable("Bind on endpoint %s failed.", ep)); + } + VLOG(3) << "listening on: " << ep; - if (listen(server_fd, 3) < 0) PADDLE_THROW("listen on server fd failed"); + if (listen(server_fd, 3) < 0) { + PADDLE_THROW(platform::errors::Unavailable( + "Listen on server file descriptor failed.")); + } if ((new_socket = accept(server_fd, reinterpret_cast(&address), - reinterpret_cast(&addrlen))) < 0) - PADDLE_THROW("accept the new socket fd failed"); + reinterpret_cast(&addrlen))) < 0) { + PADDLE_THROW(platform::errors::Unavailable( + "Accept the new socket file descriptor failed.")); + } + + if (read(new_socket, buffer, 1024) < 0) { + PADDLE_THROW(platform::errors::Unavailable("Read from socket failed.")); + } - if (read(new_socket, buffer, 1024) < 0) - PADDLE_THROW("reading the ncclUniqueId from socket failed"); VLOG(3) << "recevived the ncclUniqueId"; memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES); @@ -63,8 +80,10 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep, void NCCLParallelContext::SendNCCLID(const std::string &ep, ncclUniqueId *nccl_id) { auto addr = paddle::string::Split(ep, ':'); - PADDLE_ENFORCE_EQ(addr.size(), 2UL, - "The endpoint should contain host and port: %s", ep); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); std::string host = addr[0]; int port = std::stoi(addr[1]); // struct sockaddr_in address; @@ -73,15 +92,17 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep, char buffer[1024] = {0}; memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES); - if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) - PADDLE_THROW("create socket failed"); + if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + PADDLE_THROW(platform::errors::Unavailable("Create socket failed.")); + } memset(&serv_addr, '0', sizeof(serv_addr)); serv_addr.sin_family = AF_INET; serv_addr.sin_port = htons(port); - if (inet_pton(AF_INET, 
host.c_str(), &serv_addr.sin_addr) <= 0) - PADDLE_THROW("invalied address: %s", ep); + if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0) { + PADDLE_THROW(platform::errors::Unavailable("Open address %s failed.", ep)); + } int try_times = 0; while (true) { diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 92e3933a072832fa42520e67f455d3dc90118518..c661c9f9c37509f6b55f6ce8b67b11752c68418a 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -127,11 +127,10 @@ void *Alloc(const platform::XPUPlace &place, size_t size) { "Baidu Kunlun Card is properly installed.", ret)); ret = xpu_malloc(reinterpret_cast(&p), size); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], not enough memory", ret)); if (FLAGS_init_allocated_mem) { PADDLE_THROW(platform::errors::Unimplemented( "xpu memory FLAGS_init_allocated_mem is not implemented.")); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 5a3660cee85762f3d76129dfb694eeb6d87bb52c..a640a6c745ccb7e7cda0e47b17104ec990fce0dd 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -763,10 +763,28 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { } }; -// +// AbsGrad: dx=dy if x >=0 else -dy +// AbsDoubleGrad: ddy = ddx if x >=0 else -ddx +template +class AbsDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("abs_grad_grad"); + // input1: x + op->SetInput("X", this->Input("X")); + // input2: ddx + op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + op->SetAttrMap(this->Attrs()); + // output: ddy + op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); + } +}; + // ReluGrad: dx = dy if y >= 0 else 0 // ReluGradGrad: ddy = ddx if y >= 0 else 0 -// template class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { public: @@ -873,6 +891,28 @@ class SquareDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { } }; +// log Grad: dx = dout / x +// log Grad Grad: ddout = ddx / x; dx = -(dout / x) * (ddx / x) +template +class LogDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("log_grad_grad"); + op->SetInput("X", this->Input("X")); + // X@GRAD@GRAD: ddx + op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + op->SetInput("DOut", this->Input(framework::GradVarName("Out"))); + op->SetAttrMap(this->Attrs()); + // X@GRAD: dx + op->SetOutput("DX", this->InputGrad("X")); + // Out@GRAD@GRAD: ddy + op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); + } +}; + DECLARE_INPLACE_OP_INFERER(ActivationGradOpInplaceInferer, {framework::GradVarName("Out"), framework::GradVarName("X")}); @@ -1214,7 +1254,13 @@ REGISTER_OPERATOR( std::conditional>(), ops::ActFwdInplaceInferer, void>::type); REGISTER_OPERATOR(abs_grad,
ops::ActivationOpGrad, - ops::ActivationGradOpInplaceInferer); + ops::ActivationGradOpInplaceInferer, + ops::AbsDoubleGradMaker, + ops::AbsDoubleGradMaker); +REGISTER_OPERATOR( + abs_grad_grad, + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInferer); REGISTER_OP_CPU_KERNEL(abs, ops::ActivationKernel>, ops::ActivationGradKernel>); +REGISTER_OP_CPU_KERNEL( + abs_grad_grad, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>); +/* ========================================================================== */ + +/* ========================== Log register ==================================*/ +REGISTER_OPERATOR( + log, ops::ActivationOp, ops::LogOpMaker, ops::ActivationOpInferVarType, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::framework::OpDesc>, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::imperative::OpBase>, + ops::ActFwdInplaceInferer); +REGISTER_OPERATOR(log_grad, ops::ActivationOpGrad, + ops::ActivationGradOpInplaceInferer, + ops::LogDoubleGradMaker, + ops::LogDoubleGradMaker); + +REGISTER_OPERATOR( + log_grad_grad, + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInferer); + +REGISTER_ACTIVATION_CPU_KERNEL(log, Log, LogFunctor, LogGradFunctor); + +REGISTER_OP_CPU_KERNEL( + log_grad_grad, ops::LogDoubleGradKernel>, + ops::LogDoubleGradKernel>, + ops::LogDoubleGradKernel>); /* ========================================================================== */ /* ========================== register checkpoint ===========================*/ diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 48ec90471f0becf921e7e68eb8722544885aaa7a..839776ad58d0352cd9bdd59530951e4eea1120b3 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -160,7 +160,7 @@ REGISTER_OP_CUDA_KERNEL( ops::ExpGradFunctor>); /* ========================================================================== */ -/* ========================== exp register ============================ */ +/* ========================== abs register ============================ */ REGISTER_OP_CUDA_KERNEL( abs, ops::ActivationKernel>, @@ -180,4 +180,28 @@ REGISTER_OP_CUDA_KERNEL( ops::AbsGradFunctor>, ops::ActivationGradKernel>); +REGISTER_OP_CUDA_KERNEL( + abs_grad_grad, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>); +/* ========================================================================== */ + +/* ========================== Log register ==================================*/ +REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, LogFunctor, LogGradFunctor); + +REGISTER_OP_CUDA_KERNEL( + log_grad_grad, ops::LogDoubleGradKernel>, + ops::LogDoubleGradKernel>, + ops::LogDoubleGradKernel>); /* ========================================================================== */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 00a7c063c9155488d117332d5ef3541d16d76bdb..a5c613297a473c326a2d239ad57ac0cca5a165f3 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -1430,6 +1430,27 @@ class ActivationDoubleGradKernel } }; +template +struct AbsGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* X, + 
const framework::Tensor* Out, const framework::Tensor* ddX, + framework::Tensor* ddOut, framework::Tensor* dOut, + framework::Tensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "AbsGradGrad")); + auto x = framework::EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "AbsGradGrad")); + if (ddOut) { + auto ddout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "AbsGradGrad")); + ddout.device(*d) = ddx * x.sign(); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + template struct ReluGradGradFunctor : public BaseActivationFunctor { template @@ -1642,6 +1663,10 @@ class SquareDoubleGradKernel } }; +template +class LogDoubleGradKernel + : public SquareDoubleGradKernel {}; + template class ELUDoubleGradKernel : public framework::OpKernel { @@ -1831,6 +1856,37 @@ class PowGradKernel functor(*place, x, out, dout, dx); } }; + +template +struct LogGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* X, + const framework::Tensor* ddX, framework::Tensor* ddOut, + const framework::Tensor* dOut, framework::Tensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "LogGradGrad")); + auto x = framework::EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "LogGradGrad")); + // ddout = ddx / x; dx = -(dout / x) * (ddx / x) + // calculate dx first, so ddout can inplace ddx + if (dX) { + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Output", "DOut", "LogGradGrad")); + auto dx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "DX", "LogGradGrad")); + dx.device(*d) = dout * static_cast(-1) * ddx / (x * x); + } + if (ddOut) { + auto ddout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "LogGradGrad")); + ddout.device(*d) = ddx * static_cast(1) / x; + } + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + } // namespace operators } // namespace paddle @@ -1851,7 +1907,6 @@ class PowGradKernel __macro(cosh, Cosh, CoshFunctor, CoshGradFunctor); \ __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ - __macro(log, Log, LogFunctor, LogGradFunctor); \ __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \ __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..49b7a08a7b52b0a1dd110215aad301f1b484317e --- /dev/null +++ b/paddle/fluid/operators/activation_op_xpu.cc @@ -0,0 +1,179 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/activation_op.h" +#include +#include "paddle/fluid/platform/xpu_header.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +template +class XPUActivationKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto &attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(context); + } +}; + +template +class XPUActivationGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto &attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(context); + } +}; + +template +void xpu_activation_forward(const framework::ExecutionContext &ctx, + xpu::Activation_t type) { + const auto *x = ctx.Input("X"); + auto *y = ctx.Output("Out"); + const T *x_data = x->data(); + T *y_data = y->mutable_data(ctx.GetPlace()); + int r = 0; + if (xpu::Activation_t::ACT_POW == type.type) { + type.pow_factor = ctx.Attr("factor"); + } + auto xpu_context = ctx.device_context().x_context(); + r = xpu::activation_forward(xpu_context, type, x->numel(), + reinterpret_cast(x_data), + reinterpret_cast(y_data)); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); +} + +template +void xpu_activation_backward(const framework::ExecutionContext &ctx, + xpu::Activation_t type) { + /* TODO: relu tanh sigmoid are inplace */ + const auto *x = ctx.Input("X"); + auto *y = ctx.Input("Out"); + auto *dOut = ctx.Input(framework::GradVarName("Out")); + auto *dX = ctx.Output(framework::GradVarName("X")); + const T *x_data = nullptr; + const T *y_data = nullptr; + const T *y_grad = nullptr; + if (x != nullptr) x_data = x->data(); + if (y != nullptr) y_data = y->data(); + if (dOut != nullptr) y_grad = dOut->data(); + T *x_grad = dX->mutable_data(ctx.GetPlace()); + auto xpu_context = ctx.device_context().x_context(); + int r = xpu::activation_backward(xpu_context, type, dX->numel(), + reinterpret_cast(x_data), + reinterpret_cast(y_data), + reinterpret_cast(y_grad), + reinterpret_cast(x_grad)); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); +} + +template +struct XPUActivationFunc : public BaseActivationFunctor { + void operator()(const framework::ExecutionContext &ctx) const { + xpu_activation_forward(ctx, + algorithm); + } +}; + +template +struct XPUActivationGradFunc : public BaseActivationFunctor { + void operator()(const framework::ExecutionContext &ctx) const { + xpu_activation_backward(ctx, + algorithm); + } +}; + +template +using XPUReluFunctor = XPUActivationFunc; +template +using XPUSigmoidFunctor = XPUActivationFunc; +template +using XPUTanhFunctor = XPUActivationFunc; +template +using XPUGeluFunctor = XPUActivationFunc; +template +using XPULogFunctor = XPUActivationFunc; +template +using XPUSquareFunctor = XPUActivationFunc; +template +using XPUSquareGradFunctor = XPUActivationGradFunc; +template +using XPUReluGradFunctor = XPUActivationGradFunc; +template +using XPUSigmoidGradFunctor = + XPUActivationGradFunc; +template +using XPUTanhGradFunctor = XPUActivationGradFunc; +template +using
XPUGeluGradFunctor = XPUActivationGradFunc; +template +using XPUSqrtFunctor = XPUActivationFunc; +template +using XPUSqrtGradFunctor = XPUActivationGradFunc; +template +using XPUACTPowFunctor = XPUActivationFunc; +template +using XPUABSFunctor = XPUActivationFunc; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, functor, grad_functor) \ + REGISTER_OP_XPU_KERNEL(act_type, \ + ops::XPUActivationKernel>); \ + REGISTER_OP_XPU_KERNEL( \ + act_type##_grad, \ + ops::XPUActivationGradKernel>); + +REGISTER_ACTIVATION_XPU_KERNEL(relu, XPUReluFunctor, XPUReluGradFunctor) +REGISTER_ACTIVATION_XPU_KERNEL(tanh, XPUTanhFunctor, XPUTanhGradFunctor) +REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, XPUSigmoidFunctor, + XPUSigmoidGradFunctor) +REGISTER_ACTIVATION_XPU_KERNEL(gelu, XPUGeluFunctor, XPUGeluGradFunctor) +REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor) +REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSquareGradFunctor) +REGISTER_OP_XPU_KERNEL(log, + ops::XPUActivationKernel>); +REGISTER_OP_XPU_KERNEL(pow, + ops::XPUActivationKernel>); +REGISTER_OP_XPU_KERNEL(abs, + ops::XPUActivationKernel>); + +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index dcfe8bb1bb48a505f5526f6471e8ce9ba848b5b3..7a88403aa9daa78f1093115124eb19167bf6e99d 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -839,6 +839,7 @@ void BatchNormDoubleGradMaker::Apply(GradOpPtr op) const { op->SetInput("SavedMean", this->Input("SavedMean")); op->SetInput("SavedVariance", this->Input("SavedVariance")); if (BOOST_GET_CONST(bool, this->GetAttr("use_global_stats"))) { + op->SetInput("Mean", this->Input("Mean")); op->SetInput("Variance", this->Input("Variance")); } op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); @@ -868,14 +869,19 @@ void BatchNormDoubleGradOp::InferShape( "BatchNormDoubleGrad"); } - OP_INOUT_CHECK(ctx->HasInput("DDX"), "Input", "DDX", "BatchNormDoubleGrad"); OP_INOUT_CHECK(ctx->HasInput("DY"), "Input", "DY", "BatchNormDoubleGrad"); // check output OP_INOUT_CHECK(ctx->HasOutput("DX"), "Output", "DX", "BatchNormDoubleGrad"); const auto x_dims = ctx->GetInputDim("X"); - const int C = x_dims[1]; + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + const int C = + ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW) + ?
x_dims[1] + : x_dims[x_dims.size() - 1]); + if (ctx->HasOutput("DX")) { ctx->SetOutputDim("DX", x_dims); } @@ -957,7 +963,9 @@ class BatchNormDoubleGradKernel Tensor inv_var_tensor; if (use_global_stats) { + const auto *running_mean = ctx.Input("Mean"); const auto *running_variance = ctx.Input("Variance"); + mean_data = running_mean->data(); inv_var_tensor.Resize({C}); T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); @@ -1077,12 +1085,12 @@ class BatchNormDoubleGradKernel // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * // np.sum(dy, // axis=(n,h,w)) * (x - mean) * - // (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var - + // (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var - // inv_var // * // np.mean(dy, axis=(n,h,w)) - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), - // axis=(n,h,w)))) + // axis=(n,h,w))) if (ddX) { dx_arr += @@ -1176,7 +1184,8 @@ class BatchNormDoubleGradKernel C, sample_size); ddy_arr.setZero(); if (use_global_stats) { - // math: ddy = r * ddx * inv_var + // math: ddy = r * ddx * inv_var + ddbias + + // ddscale * (x - mean) * inv_var if (ddX) { ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data; } @@ -1196,25 +1205,29 @@ class BatchNormDoubleGradKernel .replicate(1, sample_size) / sample_size); } - if (ddScale && ddBias) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + } + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + Tensor ddscale_tile; + ddscale_tile.Resize({C, sample_size}); + EigenArrayMap ddscale_tile_data( + ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); + ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + + ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; + } - ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); - Tensor ddbias_tile; - ddbias_tile.Resize({C, sample_size}); - EigenArrayMap ddbias_tile_data( - ddbias_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddbias_tile_data = ddbias_arr.replicate(1, sample_size); + if (ddBias) { + ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); + Tensor ddbias_tile; + ddbias_tile.Resize({C, sample_size}); + EigenArrayMap ddbias_tile_data( + ddbias_tile.mutable_data(ctx.GetPlace()), C, sample_size); + ddbias_tile_data = ddbias_arr.replicate(1, sample_size); - ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; - ddy_arr += ddbias_tile_data; - } + ddy_arr += ddbias_tile_data; } + if (data_layout == DataLayout::kNCHW) { VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; TransToChannelFirst( diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc index 5ec34e574504e1058021a0623d09d4d33cf75c66..654df5ccd5e9df324f6e127addadd4e71a641d94 100644 --- a/paddle/fluid/operators/benchmark/op_tester.cc +++ b/paddle/fluid/operators/benchmark/op_tester.cc @@ -47,8 +47,8 @@ void OpTester::Init(const OpTesterConfig &config) { CreateInputVarDesc(); CreateOutputVarDesc(); } else { - PADDLE_THROW(platform::errors::NotFound("Operator '%s' is not registered.", - config_.op_type)); + PADDLE_THROW(platform::errors::NotFound( + "Operator '%s' is not registered in OpTester.", config_.op_type)); } if (config_.device_id >= 0) { @@ -81,7 +81,8 @@ void OpTester::Run() { 
platform::EnableProfiler(platform::ProfilerState::kAll); platform::SetDeviceId(config_.device_id); #else - PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); + PADDLE_THROW(platform::errors::PermissionDenied( + "'CUDAPlace' is not supported in CPU only device.")); #endif } @@ -162,7 +163,8 @@ framework::proto::VarType::Type OpTester::TransToVarType(std::string str) { } else if (str == "fp64") { return framework::proto::VarType::FP64; } else { - PADDLE_THROW("Unsupported dtype %s.", str.c_str()); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported dtype %s in OpTester.", str.c_str())); } } @@ -233,8 +235,8 @@ void OpTester::CreateOpDesc() { case framework::proto::AttrType::INTS: case framework::proto::AttrType::FLOATS: case framework::proto::AttrType::STRINGS: - PADDLE_THROW( - platform::errors::Unimplemented("Not supported STRINGS type yet.")); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported STRINGS type in OpTester yet.")); break; case framework::proto::AttrType::LONG: { int64_t value = StringTo(value_str); @@ -242,7 +244,8 @@ void OpTester::CreateOpDesc() { } break; case framework::proto::AttrType::LONGS: default: - PADDLE_THROW("Unsupport attr type %d", type); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported attr type %d in OpTester.", type)); } } } @@ -299,7 +302,8 @@ void OpTester::SetupTensor(framework::LoDTensor *tensor, } is.close(); } else { - PADDLE_THROW("Unsupported initializer %s.", initializer.c_str()); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported initializer %s in OpTester.", initializer.c_str())); } if (!platform::is_cpu_place(place_)) { @@ -351,7 +355,8 @@ void OpTester::CreateVariables(framework::Scope *scope) { static_cast(1.0), item.second.initializer, item.second.filename); } else { - PADDLE_THROW("Unsupported dtype %d.", data_type); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported dtype %d in OpTester.", data_type)); } VLOG(3) << "Set lod for tensor " << var_name; @@ -473,7 +478,8 @@ std::string OpTester::DebugString() { << "\n"; } break; default: - PADDLE_THROW("Unsupport attr type %d", attr_type); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported attr type %d in OpTester.", attr_type)); } ss << GenSpaces(--count) << "}\n"; } @@ -484,8 +490,10 @@ std::string OpTester::DebugString() { TEST(op_tester, base) { if (!FLAGS_op_config_list.empty()) { std::ifstream fin(FLAGS_op_config_list, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", - FLAGS_op_config_list.c_str()); + PADDLE_ENFORCE_EQ( + static_cast(fin), true, + platform::errors::InvalidArgument("OpTester cannot open file %s.", + FLAGS_op_config_list.c_str())); std::vector op_configs; while (!fin.eof()) { VLOG(4) << "Reading config " << op_configs.size() << "..."; diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc index 818e5f64edc2c1d213659c48d282df75625676ca..e9477798858d13e7a2862081561634011f9156c8 100644 --- a/paddle/fluid/operators/benchmark/op_tester_config.cc +++ b/paddle/fluid/operators/benchmark/op_tester_config.cc @@ -78,7 +78,8 @@ void OpInputConfig::ParseDType(std::istream& is) { } else if (dtype_str == "fp64" || dtype_str == "double") { dtype = "fp64"; } else { - PADDLE_THROW("Unsupported dtype %s", dtype_str.c_str()); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported dtype %s in OpInputConfig.", dtype_str.c_str())); } VLOG(4) << "dtype of input " << name << " is: " << dtype; } @@ -91,7
+92,9 @@ void OpInputConfig::ParseInitializer(std::istream& is) { const std::vector supported_initializers = {"random", "natural", "zeros", "file"}; if (!Has(supported_initializers, initializer_str)) { - PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str()); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported initializer %s in OpInputConfig.", + initializer_str.c_str())); } initializer = initializer_str; @@ -126,7 +129,12 @@ void OpInputConfig::ParseLoD(std::istream& is) { } } EraseEndSep(&lod_str); - PADDLE_ENFORCE_GE(lod_str.length(), 4U); + PADDLE_ENFORCE_GE( + lod_str.length(), 4U, + platform::errors::InvalidArgument( + "The length of lod string should be " + "equal to or larger than 4. But length of lod string is %zu.", + lod_str.length())); VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length(); // Parse the lod_str @@ -153,8 +161,10 @@ void OpInputConfig::ParseLoD(std::istream& is) { OpTesterConfig::OpTesterConfig(const std::string& filename) { std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", - filename.c_str()); + PADDLE_ENFORCE_EQ( + static_cast(fin), true, + platform::errors::InvalidArgument("OpTesterConfig cannot open file %s.", + filename.c_str())); Init(fin); } diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index bf97b9d03c455182a8d95b6987896b9a580c84fe..ef8a2b38f20b99f0b1e41ddc1976f88dd8d1f5ab 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -166,7 +166,8 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( #endif if (input_data_type != framework::proto::VarType::INT8 && - input_data_type != framework::proto::VarType::UINT8) { + input_data_type != framework::proto::VarType::UINT8 && + input_data_type != framework::proto::VarType::BF16) { auto filter_data_type = ctx.Input("Filter")->type(); PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, platform::errors::InvalidArgument( @@ -455,6 +456,11 @@ void Conv3DOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr( + "mkldnn_data_type", + "(string, default \"float32\"). 
Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "int8", "bfloat16"}); AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); AddAttr("fuse_activation", diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index a91df5b3c471e234dd1ae72771c287e21ebf7af0..51b13bc2c569d0078199e6ede42bfecb2e33148b 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -175,10 +175,6 @@ void RecvGeoSparseRecords(const CommContext &rpc_ctx, template void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto cpu_place = platform::CPUPlace(); - auto &cpu_ctx = *pool.Get(cpu_place); - distributed::RPCClient *rpc_client = distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); @@ -188,8 +184,13 @@ void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) { if (rpc_ctx.origin_varnames.size() == 1 && rpc_ctx.splited_varnames.size() == 1) { auto varname = rpc_ctx.origin_varnames[0]; - VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0]; - rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], cpu_ctx, + const auto place = + scope.FindVar(varname)->Get().place(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(place); + VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0] << " in gpu? " + << platform::is_gpu_place(place); + rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], ctx, scope, varname, varname)); for (size_t i = 0; i < rets.size(); i++) { diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..9ff7a71d7f03a4e91cea42926f23c7b270857ba9 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc @@ -0,0 +1,162 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include +#include +#include "paddle/fluid/operators/elementwise/elementwise_op.h" + +#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" + +namespace paddle { +namespace operators { + +template +class ElementwiseAddXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + XPUElementwise>(ctx); + } +}; + +template +class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + ElemwiseGradKernel::Compute(ctx); + using Tensor = framework::Tensor; + + auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); + auto *dy = ctx.Output(framework::GradVarName("Y")); + + auto dx_dims = dout->dims(); + auto dy_dims_untrimed = dout->dims(); + T *dx_data = NULL; + T *dy_data = NULL; + + int axis = ctx.Attr("axis"); + PADDLE_ENFORCE_GE(dx_dims.size(), dy_dims_untrimed.size(), + "Rank of the first input must be >= rank of the second input."); + + if (dx != nullptr) { + dx->mutable_data(ctx.GetPlace()); + dx_dims = dx->dims(); + dx_data = dx->data(); + } + + if (dy != nullptr) { + dy->mutable_data(ctx.GetPlace()); + dy_dims_untrimed = dy->dims(); + dy_data = dy->data(); + } + + int pre, n, post, is_common_broadcast; + if (dx_dims == dy_dims_untrimed) { + pre = post = 1; + n = dout->numel(); + } else { + axis = (axis == -1 ? dx_dims.size() - dy_dims_untrimed.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < dx_dims.size(), + "Axis should be in range [0, dx_dims.size())"); + auto dy_dims = trim_trailing_singular_dims(dy_dims_untrimed); + axis = (dy_dims.size() == 0) ? dx_dims.size() : axis; + get_mid_dims(dx_dims, dy_dims, axis, &pre, &n, &post, + &is_common_broadcast); + } + int len = pre * n * post; + + auto &dev_ctx = + ctx.template device_context(); + if (post == 1) { + int r = xpu::matrix_vector_add_grad( + dev_ctx.x_context(), dout->data(), dout->data(), + dout->data(), dout->data(), dx_data, dy_data, pre, n); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + return; + } + + if (dx == nullptr) { + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&dx_data), len * sizeof(float)), + XPU_SUCCESS, platform::errors::External("XPU does not have enough memory.")); + } + + if (dy == nullptr) { + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&dy_data), len * sizeof(float)), + XPU_SUCCESS, platform::errors::External("XPU does not have enough memory.")); + } else { + if (len != n) { + PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&dy_data), + len * sizeof(float)), + XPU_SUCCESS, platform::errors::External( + "XPU does not have enough memory.")); + } + } + + int r = xpu::elementwise_add_grad( + dev_ctx.x_context(), dout->data() /*x*/, dout->data() /*y*/, + dout->data() /*out*/, dout->data(), dx_data, dy_data, len); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + + if ((dy != nullptr) && (len != n)) { + r = xpu::reduce_ew(dev_ctx.x_context(), dy_data, dy->data(), pre, n, + post, xpu::ElementwiseOp::ASSIGN); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r));
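+ // When Y was broadcast against X, dy was computed above at the broadcast + // length len = pre * n * post, while the real dY has length n, so the + // reduce_ew collapses the broadcast axes again (the gradient of a + // broadcast is a sum over the broadcast axes). For example, X of shape + // [2, 3, 4] and Y of shape [3] with axis = 1 give pre = 2, n = 3, + // post = 4, so len = 24 and dy is reduced from 24 back to 3 values.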
+ dev_ctx.Wait(); + xpu_free(dy_data); + } + + if ((dx == nullptr || dy == nullptr) && !(dy != nullptr && len != n)) { + dev_ctx.Wait(); + } + + if (dx == nullptr) { + xpu_free(dx_data); + } + if (dy == nullptr) { + xpu_free(dy_data); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + elementwise_add, + ops::ElementwiseAddXPUKernel); +REGISTER_OP_XPU_KERNEL(elementwise_add_grad, + ops::ElementwiseAddGradXPUKernel< + paddle::platform::XPUDeviceContext, float>); +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_xpu.h b/paddle/fluid/operators/elementwise/elementwise_xpu.h new file mode 100644 index 0000000000000000000000000000000000000000..53c4332e9190de131b48d68c30b84b035ec66e67 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_xpu.h @@ -0,0 +1,113 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +template +struct XPUAddFunctor { + int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) { + return xpu::elementwise_add(ctx, x, y, z, len); + } +}; + +template +struct XPUMulFunctor { + int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) { + return xpu::elementwise_mul(ctx, x, y, z, len); + } +}; + +template +void XPUElementwise(const framework::ExecutionContext& ctx) { + PADDLE_ENFORCE(platform::is_xpu_place(ctx.GetPlace()), + "This kernel only runs on XPU device."); + auto x_var = ctx.InputVar("X"); + PADDLE_ENFORCE_NE(x_var, nullptr, + platform::errors::Fatal("Cannot get input Variable X")); + PADDLE_ENFORCE(x_var->IsType(), + "XPU only supports LoDTensor"); + + auto x = x_var->Get(); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + + int axis = ctx.Attr("axis"); + auto x_dims = x.dims(); + auto y_dims_untrimed = y->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(), + "Rank of the first input must be >= rank of the second input."); + axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims.size())"); + auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); + axis = (y_dims.size() == 0) ? x_dims.size() : axis; + int pre, n, post, is_common_broadcast; + get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post, &is_common_broadcast); + int len = pre * n * post; + + const T* x_data = x.data(); + const T* y_data = y->data(); + T* z_data = z->data(); + T* y_broadcast = nullptr; + + auto& dev_ctx = + ctx.template device_context(); + + if (post == 1) { + if (std::is_same>::value) { + int res = xpu::matrix_vector_add(dev_ctx.x_context(), x_data, y_data, + z_data, pre, n); + PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! 
res = %d", + res); + return; + } + if (std::is_same>::value) { + int res = xpu::matrix_vector_mul(dev_ctx.x_context(), x_data, y_data, + z_data, pre, n); + PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d", + res); + return; + } + } + + if (pre != 1 || post != 1) { + PADDLE_ENFORCE(xpu_malloc(reinterpret_cast(&y_broadcast), + len * sizeof(T)) == XPU_SUCCESS); + int res = xpu::broadcast_ew(dev_ctx.x_context(), y_data, y_broadcast, pre, + n, post, xpu::ElementwiseOp::ASSIGN); + PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d", + res); + y_data = y_broadcast; + } + + Functor functor; + int res = functor(dev_ctx.x_context(), x_data, y_data, z_data, len); + PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d", + res); + + if (pre != 1 || post != 1) { + dev_ctx.Wait(); + xpu_free(y_broadcast); + } +} + +} // namespace operators +} // namespace paddle +#endif diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index a5b270c1dfef14bc92697c29bfeafa0fe08211d7..03279a9b2c15b8d918333fd61c07ed636f11d889 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -520,11 +520,11 @@ class InstanceNormDoubleGradKernel // (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW * // np.sum(dy, // axis=(h,w)) * (x - mean) * - // (np.mean(ddx, axis=(h,w)) - ddx) + ddr * (dy * inv_var - inv_var - // * + // (np.mean(ddx, axis=(h,w)) - ddx)) + ddr * (dy * inv_var - + // inv_var * // np.mean(dy, axis=(h,w)) - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), - // axis=(h,w)))) + // axis=(h,w))) Tensor x_sub_mean_mul_invstd; x_sub_mean_mul_invstd.Resize({sample_size, NxC}); diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 898f27f9afef9ca13a9f24ab1b61a50f745d40f7..d65cdc6c150ec6b9e5e4ed3e469069b3beffc819 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -136,7 +136,6 @@ void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) { } using Tensor = paddle::framework::Tensor; - template void BenchKernelXYZN() { using T = typename KernelTuple::data_type; @@ -320,8 +319,15 @@ void BenchKernelSgd() { const T lr = 0.1; auto UnDuplicatedRandomVec = [](int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); + PADDLE_ENFORCE_LE( + static_cast(upper - lower), n - 1, + paddle::platform::errors::InvalidArgument( + "The range of Sgd (upper - lower) should be equal to or lower " + "than n-1 (Sgd size -1). But upper - lower is %d and n-1 is %d.", + static_cast(upper - lower), (n - 1))); + PADDLE_ENFORCE_GT( + n, 0, paddle::platform::errors::InvalidArgument( + "The Sgd size should be larger than 0. 
But the n is %d.", n)); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); diff --git a/paddle/fluid/operators/jit/gen/embseqpool.cc b/paddle/fluid/operators/jit/gen/embseqpool.cc index b4e63d87eac064c7f29855cefc3dbe875ccee28f..c549fec0970cb235bed77105c4297669c163c5e7 100644 --- a/paddle/fluid/operators/jit/gen/embseqpool.cc +++ b/paddle/fluid/operators/jit/gen/embseqpool.cc @@ -132,11 +132,31 @@ class EmbSeqPoolCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const emb_seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.table_height, 0); - PADDLE_ENFORCE_GT(attr.table_width, 0); - PADDLE_ENFORCE_GT(attr.index_height, 0); - PADDLE_ENFORCE_GT(attr.index_width, 0); - PADDLE_ENFORCE_GT(attr.out_width, 0); + PADDLE_ENFORCE_GT(attr.table_height, 0, + platform::errors::InvalidArgument( + "The attribute table_height of EmbSeqPool should " + "be larger than 0. But it is %d.", + attr.table_height)); + PADDLE_ENFORCE_GT(attr.table_width, 0, + platform::errors::InvalidArgument( + "The attribute table_width of EmbSeqPool should " + "be larger than 0. But it is %d.", + attr.table_width)); + PADDLE_ENFORCE_GT(attr.index_height, 0, + platform::errors::InvalidArgument( + "The attribute index_height of EmbSeqPool should " + "be larger than 0. But it is %d.", + attr.index_height)); + PADDLE_ENFORCE_GT(attr.index_width, 0, + platform::errors::InvalidArgument( + "The attribute index_width of EmbSeqPool should " + "be larger than 0. But it is %d.", + attr.index_width)); + PADDLE_ENFORCE_GT(attr.out_width, 0, + platform::errors::InvalidArgument( + "The attribute out_width of EmbSeqPool should be " + "larger than 0. But it is %d.", + attr.out_width)); return make_unique(attr, CodeSize(attr)); } }; diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc index 047d0d3e1caa2290111104d5799e67cea7b7eed2..3139b252cadbc37d6ffbe2af023bd5e836f15ab7 100644 --- a/paddle/fluid/operators/jit/gen/matmul.cc +++ b/paddle/fluid/operators/jit/gen/matmul.cc @@ -29,7 +29,11 @@ void MatMulJitCode::genCode() { preCode(); int block, rest; const auto groups = packed_groups(n_, k_, &block, &rest); - PADDLE_ENFORCE_GT(groups.front(), 0); + PADDLE_ENFORCE_GT( + groups.front(), 0, + platform::errors::InvalidArgument("The number of rest registers should " + "be larger than 0. But it is %d.", + groups.front())); const int block_len = sizeof(float) * block; const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1; @@ -118,9 +122,21 @@ class MatMulCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const matmul_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.m, 0); - PADDLE_ENFORCE_GT(attr.n, 0); - PADDLE_ENFORCE_GT(attr.k, 0); + PADDLE_ENFORCE_GT( + attr.m, 0, platform::errors::InvalidArgument( + "The attribute m (first matrix's row) of MatMul should " + "be larger than 0. But it is %d.", + attr.m)); + PADDLE_ENFORCE_GT( + attr.n, 0, platform::errors::InvalidArgument( + "The attribute n (first matrix's col) of MatMul should " + "be larger than 0. But it is %d.", + attr.n)); + PADDLE_ENFORCE_GT( + attr.k, 0, platform::errors::InvalidArgument( + "The attribute k (second matrix's col) of MatMul should " + "be larger than 0. 
But it is %d.", + attr.k)); return make_unique(attr, CodeSize(attr)); } }; diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h index 4f04f7606d2deb8ba58809f9e07e60ba19182a71..eb7328d7e069cf05a22ec1ecee70f36280e6d231 100644 --- a/paddle/fluid/operators/jit/gen/matmul.h +++ b/paddle/fluid/operators/jit/gen/matmul.h @@ -33,7 +33,10 @@ class MatMulJitCode : public JitCode { size_t code_size = 256 * 1024, void* code_ptr = nullptr) : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) { - PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet"); + PADDLE_ENFORCE_EQ(m_, 1, platform::errors::Unimplemented( + "Jitcode of matmul only support m==1 (first " + "matrix's row) now. But m is %d.", + m_)); this->genCode(); } diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc index ec8e4e9827441bc0a817c6da455cb9e530c8c1bf..d8c7b3cdb7b1f36125b76feab19ab4369d491219 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.cc +++ b/paddle/fluid/operators/jit/gen/seqpool.cc @@ -70,8 +70,14 @@ class SeqPoolCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.w, 0); - PADDLE_ENFORCE_GT(attr.h, 0); + PADDLE_ENFORCE_GT(attr.w, 0, platform::errors::InvalidArgument( + "The attribute width of SeqPool should " + "be larger than 0. But it is %d.", + attr.w)); + PADDLE_ENFORCE_GT(attr.h, 0, platform::errors::InvalidArgument( + "The attribute height of SeqPool should " + "be larger than 0. But it is %d.", + attr.h)); return make_unique(attr, CodeSize(attr)); } }; diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h index cb562c4c9a6c6be5c5881cb6273ca7426a6c2a10..d4e7b2e29ce22705c0ef7320495f55483d9bfef1 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.h +++ b/paddle/fluid/operators/jit/gen/seqpool.h @@ -127,8 +127,13 @@ class SeqPoolJitCode : public JitCode { vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); reg_idx++; } - PADDLE_ENFORCE_EQ(reg_idx, rest_used_num_regs, - "All heights should use same regs"); + PADDLE_ENFORCE_EQ( + reg_idx, rest_used_num_regs, + platform::errors::InvalidArgument( + "All heights of SeqPool should use the same number of registers." + "It equals to the numbr of rest registers. But use %d registers " + "and the numbr of rest registers is %d.", + reg_idx, rest_used_num_regs)); for (int i = 0; i < reg_idx; ++i) { vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs)); } diff --git a/paddle/fluid/operators/jit/gen/sgd.cc b/paddle/fluid/operators/jit/gen/sgd.cc index 1452d4139b0d7994e5304680d63a63cb28bb606b..7fe93fdb6a51a811a6e60ba5af31d9a91aadd336 100644 --- a/paddle/fluid/operators/jit/gen/sgd.cc +++ b/paddle/fluid/operators/jit/gen/sgd.cc @@ -116,9 +116,24 @@ class SgdCreator : public JitCodeCreator { size_t CodeSize(const sgd_attr_t& attr) const override { return 96 + 32 * 8; } std::unique_ptr CreateJitCode( const sgd_attr_t& attr) const override { - PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width); - PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height); - PADDLE_ENFORCE_GE(attr.selected_rows_size, 0); + PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width, + platform::errors::InvalidArgument( + "The attribute param_width of Sgd should be " + "equal to the attribute grad_width. 
But param_width " + "is %d and grad_width is %d.", + attr.param_width, attr.grad_width)); + PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height, + platform::errors::InvalidArgument( + "The attribute selected_rows_size of Sgd should be " + "equal to or less than the attribute grad_height. " + "But selected_rows_size is %d and grad_height is %d.", + attr.selected_rows_size, attr.grad_height)); + PADDLE_ENFORCE_GE( + attr.selected_rows_size, 0, + platform::errors::InvalidArgument( + "The attribute selected_rows_size of Sgd should be " + "equal to or larger than 0. But selected_rows_size is %d.", + attr.selected_rows_size)); return make_unique(attr, CodeSize(attr)); } }; diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.cc b/paddle/fluid/operators/jit/gen/vbroadcast.cc index 66a8d75fd4de5bae3ba37cf7fe7b1645938aa855..4084d68c2a840812358ec13f33d99fbb1f592c9f 100644 --- a/paddle/fluid/operators/jit/gen/vbroadcast.cc +++ b/paddle/fluid/operators/jit/gen/vbroadcast.cc @@ -76,7 +76,11 @@ class VBroadcastCreator : public JitCodeCreator { return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8; } std::unique_ptr CreateJitCode(const int64_t& w) const override { - PADDLE_ENFORCE_GT(w, 0); + PADDLE_ENFORCE_GT( + w, 0, + platform::errors::InvalidArgument( + "The width of VBroadcast should be larger than 0. But w is %d.", + w)); return make_unique(w, CodeSize(w)); } }; diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index 4c49eff49e3efc0664a084f9fa2bb897db0c6f1d..2ae71256cddcb172edb24488d559fe788e99ada5 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -49,9 +49,14 @@ void GenBase::dumpCode(const unsigned char* code) const { void* GenBase::operator new(size_t size) { void* ptr; constexpr size_t alignment = 32ul; - PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), 0, - "GenBase Alloc %ld error!", size); - PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); + PADDLE_ENFORCE_EQ( + posix_memalign(&ptr, alignment, size), 0, + platform::errors::InvalidArgument( + "Jitcode generator (GenBase) failed to allocate %ld bytes of memory!", size)); + PADDLE_ENFORCE_NOT_NULL(ptr, platform::errors::InvalidArgument( + "Failed to allocate jitcode generator " + "(GenBase) CPU memory: size = %d.", + size)); return ptr; } diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 2952cdb87146ec01a366abaf332ce1099c425966..c66e8092d5e4221767100c94174210af24a43abc 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -66,7 +66,8 @@ const char* to_string(KernelType kt) { ONE_CASE(kEmbSeqPool); ONE_CASE(kSgd); default: - PADDLE_THROW("Not support type: %d, or forget to add it.", kt); + PADDLE_THROW(platform::errors::Unimplemented( + "JIT kernel does not support type: %d.", kt)); return "NOT JITKernel"; } return nullptr; @@ -79,7 +80,8 @@ const char* to_string(SeqPoolType tp) { ONE_CASE(kAvg); ONE_CASE(kSqrt); default: - PADDLE_THROW("Not support type: %d, or forget to add it.", tp); + PADDLE_THROW(platform::errors::Unimplemented( + "SeqPool JIT kernel does not support type: %d.", tp)); return "NOT PoolType"; } return nullptr; @@ -100,7 +102,8 @@ KernelType to_kerneltype(const std::string& act) { } else if (lower == "tanh" || lower == "vtanh") { return kVTanh; } - PADDLE_THROW("Not support type: %s, or forget to add this case", act); + PADDLE_THROW(platform::errors::Unimplemented( + "Act JIT kernel does not support type: %s.", act)); return kNone; } 
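All of the JIT changes above follow a single error-reporting pattern: a bare PADDLE_THROW or PADDLE_ENFORCE gains a typed category from platform::errors plus a message that names the checked constraint and the offending value. A minimal before/after sketch of the pattern (the macros and error categories are Paddle's own, as used throughout this patch; the size check on n is illustrative):

// Before: no error category and no context in the message.
PADDLE_ENFORCE_GT(n, 0);
// After: a typed category, and a message stating the expected
// constraint together with the actual value.
PADDLE_ENFORCE_GT(n, 0,
                  platform::errors::InvalidArgument(
                      "The size n should be larger than 0. But n is %d.", n));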
@@ -109,12 +112,19 @@ void pack_weights(const float* src, float* dst, int n, int k) { int block, rest; const auto groups = packed_groups(n, k, &block, &rest); std::for_each(groups.begin(), groups.end(), [&](int i) { - PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0."); + PADDLE_ENFORCE_GT(i, 0, platform::errors::InvalidArgument( + "Each element of groups should be larger than " + "0. However, the element %d does not satisfy it.", + i)); }); int sum = std::accumulate(groups.begin(), groups.end(), 0); std::memset(dst, 0, k * sum * block * sizeof(float)); PADDLE_ENFORCE_GE(sum * block, n, + platform::errors::InvalidArgument( + "The packed n (sum * block) should be equal to or " + "larger than n (matmul row size). " + "However, the packed n is %d and n is %d.", + sum * block, n)); const int block_len = sizeof(float) * block; int n_offset = 0; @@ -136,7 +146,8 @@ void pack_weights(const float* src, float* dst, int n, int k) { template typename std::enable_if::value>::type pack_weights( const T* src, T* dst, int n, int k) { - PADDLE_THROW("Only support pack with float type."); + PADDLE_THROW(platform::errors::Unimplemented( + "Only packing weights with float type is supported.")); } } // namespace jit diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index b6dd49b77728c48c1075109934699545bb282420..0791bb5810526cb930fe1869a60913d4239f72a3 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -85,8 +85,10 @@ inline const Kernel* GetReferKernel() { auto& ref_pool = ReferKernelPool::Instance().AllKernels(); KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace()); auto ref_iter = ref_pool.find(kkey); - PADDLE_ENFORCE(ref_iter != ref_pool.end(), - "Every Kernel should have reference function."); + PADDLE_ENFORCE_NE( + ref_iter, ref_pool.end(), + platform::errors::PreconditionNotMet( + "Every Refer Kernel of jitcode should have a reference function.")); auto& ref_impls = ref_iter->second; for (auto& impl : ref_impls) { auto i = dynamic_cast*>(impl.get()); @@ -101,7 +103,9 @@ template inline typename KernelTuple::func_type GetReferFunc() { auto ker = GetReferKernel(); auto p = dynamic_cast*>(ker); - PADDLE_ENFORCE(p, "The Refer kernel should exsit"); + PADDLE_ENFORCE_NOT_NULL(p, platform::errors::InvalidArgument( + "Getting the reference code of the kernel in CPU " + "failed. The Refer kernel should exist.")); return p->GetFunc(); } @@ -132,7 +136,9 @@ std::vector GetAllCandidateKernels( // The last implementation should be reference function on CPUPlace. auto ref = GetReferKernel(); - PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty."); + PADDLE_ENFORCE_NOT_NULL(ref, platform::errors::InvalidArgument( + "Getting all candidate kernels in CPU failed. 
" + "The Refer Kernel can not be empty.")); res.emplace_back(ref); return res; } @@ -147,11 +153,14 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) { std::string name = k->ImplType(); if (name == "JitCode") { auto i = dynamic_cast(k); - PADDLE_ENFORCE(i, "jitcode kernel cast can not fail."); + PADDLE_ENFORCE_NOT_NULL(i, + platform::errors::InvalidArgument( + "Generate jitcode kernel (GenBase) failed.")); res.emplace_back(std::make_pair(name, i->template getCode())); } else { auto i = dynamic_cast*>(k); - PADDLE_ENFORCE(i, "kernel cast can not fail."); + PADDLE_ENFORCE_NOT_NULL(i, platform::errors::InvalidArgument( + "Kernel cast (KernelMore) failed.")); res.emplace_back(std::make_pair(name, i->GetFunc())); } } @@ -173,7 +182,9 @@ template typename KernelTuple::func_type GetDefaultBestFunc( const typename KernelTuple::attr_type& attr) { auto funcs = GetAllCandidateFuncs(attr); - PADDLE_ENFORCE_GE(funcs.size(), 1UL); + PADDLE_ENFORCE_GE(funcs.size(), 1UL, + platform::errors::InvalidArgument( + "The candicate jit kernel is at least one in CPU.")); // Here could do some runtime benchmark of this attr and return the best one. // But yet just get the first one as the default best one, // which is searched in order and tuned by offline. diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index f5b7bfff89825bfcd6cbe4b1008628d3e1093f4c..5d63f4848e6165bfb84c1bfe301d20cc24cfc7b0 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -95,7 +95,8 @@ void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT } else if (type == kVIdentity) { return KernelFuncs, CPUPlace>::Cache().At(d); } - PADDLE_THROW("Not support type: %s", type); + PADDLE_THROW(platform::errors::Unimplemented( + "Act JIT kernel do not support type: %s", type)); return nullptr; } diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index ee31c8df2f882d092783cf0408564a39ca6fafa1..5f3c29ad5efb848f1fa12236ffe36a9f654864a3 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -103,11 +103,24 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { template void EmbSeqPool(const T* table, const int64_t* idx, T* out, const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); + PADDLE_ENFORCE_EQ( + attr->table_width * attr->index_width, attr->out_width, + platform::errors::InvalidArgument( + "The attribute table_width * index_width of EmbSeqPool should " + "be equal to out_width. But table_width * index_width is %d, " + "out_width is %d.", + attr->table_width * attr->index_width, attr->out_width)); auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d", - idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); + PADDLE_ENFORCE_LT( + idx[i], attr->table_height, + platform::errors::InvalidArgument( + "The idx shoud be lower than the attribute table_height of " + "EmbSeqPool. But %dth of idx is %d and table_height is %d.", + i, idx[i], attr->table_height)); + PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument( + "The idx shoud be equal to or larger than " + "the 0. 
But %dth of idx is %d.", + i, idx[i])); }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -168,22 +181,50 @@ void Softmax(const T* x, T* y, int n, int bs, int remain = 1) { template void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, T* out, const sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); + PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width, + platform::errors::InvalidArgument( + "The attribute param_width of Sgd should be " + "equal to the attribute grad_width. But param_width " + "is %d and grad_width is %d.", + attr->param_width, attr->grad_width)); + PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height, + platform::errors::InvalidArgument( + "The attribute selected_rows_size of Sgd should be " + "equal to or less than the attribute grad_height. " + "But selected_rows_size is %d and grad_height is %d.", + attr->selected_rows_size, attr->grad_height)); T scalar = -lr[0]; int width = attr->grad_width; if (out == param) { for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + PADDLE_ENFORCE_LT(h_idx, attr->param_height, + platform::errors::InvalidArgument( + "The rows of Sgd should be " + "less than the attribute. But %dth of rows " + "is %d and grad_width is %d.", + i, h_idx, attr->param_height)); + PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument( + "The rows of Sgd should be " + "larger than 0. But %dth of rows " + "is %d.", + i, h_idx)); VAXPY(scalar, grad + i * width, out + h_idx * width, width); } } else { for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + PADDLE_ENFORCE_LT(h_idx, attr->param_height, + platform::errors::InvalidArgument( + "The rows of Sgd should be " + "less than the attribute. But %dth of rows " + "is %d and grad_width is %d.", + i, h_idx, attr->param_height)); + PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument( + "The rows of Sgd should be " + "larger than 0. But %dth of rows " + "is %d.", + i, h_idx)); VScal(&scalar, grad + i * width, out + h_idx * width, width); VAdd(param + h_idx * width, out + h_idx * width, out + h_idx * width, width); diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index b8d5e2c24071f6a14b070f4120f8c987110ed4f7..42fb7b4f279c225fb38a49d23e9d76ac1854d12d 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -147,7 +147,8 @@ void (*getActFunc(KernelType type))(const T*, T*, int) { // NOLINT } else if (type == kVIdentity) { return VIdentity; } - PADDLE_THROW("Not support type: %s", type); + PADDLE_THROW(platform::errors::Unimplemented( + "Act JIT kernel do not support type: %s.", type)); return nullptr; } @@ -465,12 +466,25 @@ void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) { template void EmbSeqPool(const T* table, const int64_t* idx, T* out, const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); + PADDLE_ENFORCE_EQ( + attr->table_width * attr->index_width, attr->out_width, + platform::errors::InvalidArgument( + "The attribute table_width * index_width of EmbSeqPool should " + "be equal to out_width. 
But table_width * index_width is %d and " + "out_width is %d.", + attr->table_width * attr->index_width, attr->out_width)); auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d", - idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); + PADDLE_ENFORCE_LT( + idx[i], attr->table_height, + platform::errors::InvalidArgument( + "The idx shoud be lower than the attribute table_height of " + "EmbSeqPool. But %dth of idx is %d and table_height is %d.", + i, idx[i], attr->table_height)); + PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument( + "The idx shoud be equal to or larger than " + "the 0. But %dth of idx is %d.", + i, idx[i])); }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -505,12 +519,31 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out, template void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, T* out, const sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); + PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width, + platform::errors::InvalidArgument( + "The attribute param_width of Sgd should be " + "equal to the attribute grad_width. But param_width " + "is %d and grad_width is %d.", + attr->param_width, attr->grad_width)); + PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height, + platform::errors::InvalidArgument( + "The attribute selected_rows_size of Sgd should be " + "equal to or less than the attribute grad_height. " + "But selected_rows_size is %d and grad_height is %d.", + attr->selected_rows_size, attr->grad_height)); for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + PADDLE_ENFORCE_LT(h_idx, attr->param_height, + platform::errors::InvalidArgument( + "The rows of Sgd should be " + "less than the attribute. But %dth of rows " + "is %d and grad_width is %d.", + i, h_idx, attr->param_height)); + PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument( + "The rows of Sgd should be " + "larger than 0. But %dth of rows " + "is %d.", + i, h_idx)); for (int64_t j = 0; j < attr->grad_width; ++j) { out[h_idx * attr->grad_width + j] = param[h_idx * attr->grad_width + j] - diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index eb56f111f0880f1a884e8f7f7ca2edcebfac695a..0cc62720b87943c8d92e56a53705ec9e4b46e047 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -850,8 +850,15 @@ void TestKernelSgd() { const T lr = 0.1; auto UnDuplicatedRandomVec = [](int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); + PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1, + paddle::platform::errors::InvalidArgument( + "The range of Sgd (upper - lower) should be lower " + "than n-1 (Sgd size -1). But the upper - lower is %d " + "and n-1 is %d.", + static_cast(upper - lower), n - 1)); + PADDLE_ENFORCE_GT( + n, 0, paddle::platform::errors::InvalidArgument( + "The Sgd size should be larger than 0. 
But the n is %d.", n)); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index a0464cf70e2dcc44c42fc2ca7440680ef8a53e6e..aeafe22235c0954d16a73ac242ccb9e54a15413b 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -420,6 +420,22 @@ void Blas::GEMV(bool trans_a, int M, int N, }); } +template <> +template <> +inline void Blas::GEMV( + bool trans_a, int M, int N, platform::float16 alpha, + const platform::float16 *A, const platform::float16 *B, + platform::float16 beta, platform::float16 *C) const { + // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + template <> template void Blas::BatchedGEMM( @@ -479,6 +495,19 @@ void Blas::BatchedGEMM( } } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::float16 alpha, const platform::float16 **A, + const platform::float16 **B, platform::float16 beta, platform::float16 **C, + int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, A[k], + B[k], beta, C[k]); + } +} + template <> template void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu index bb2b6db100b65dae175af1738a9592b1c4212a9a..37155fa184e23a3582226eaef2d2813e79e7c61c 100644 --- a/paddle/fluid/operators/math/segment_pooling.cu +++ b/paddle/fluid/operators/math/segment_pooling.cu @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" +#include #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/segment_pooling.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/gpu_launch_param_config.h" -#include "paddle/fluid/platform/macros.h" namespace paddle { namespace operators { @@ -100,7 +99,7 @@ __global__ void SegmentOpsKernel(const Index* segment_ids, const T* input, CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) { Index segment_offset, dim_index_base, actual_height; Index inner_dim_size = h.inner_dim_size; - h.calculate(stripe_index, segment_offset, dim_index_base, actual_height); + h.calculate(stripe_index, &segment_offset, &dim_index_base, &actual_height); T minmax = pool.initial(); Index first_segment_id = segment_ids[dim_index_base]; @@ -154,7 +153,7 @@ __global__ void SegmentIndexGradKernel(const Index* segment_ids, const T* input, T* in_grad, Helper h) { CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) { Index segment_offset, dim_index_base, actual_height; - h.calculate(stripe_index, segment_offset, dim_index_base, actual_height); + h.calculate(stripe_index, &segment_offset, &dim_index_base, &actual_height); for (Index j = 0; j < actual_height; j++) { Index current_segment_id = segment_ids[dim_index_base + j]; @@ -217,11 +216,11 @@ class ArrangeHelper { total_stripe_count = inner_dim_size * input_outer_dim_num_stripe; } - DEVICE inline void calculate(T stripe_index, T& segment_offset, - T& dim_index_base, T& actual_height) { - segment_offset = stripe_index % inner_dim_size; - dim_index_base = stripe_index / inner_dim_size * DimTileSize; - actual_height = min(DimTileSize, input_length_size - dim_index_base); + DEVICE inline void calculate(T stripe_index, T* segment_offset, + T* dim_index_base, T* actual_height) { + *segment_offset = stripe_index % inner_dim_size; + *dim_index_base = stripe_index / inner_dim_size * DimTileSize; + *actual_height = min(DimTileSize, input_length_size - *dim_index_base); } }; diff --git a/paddle/fluid/operators/matmul_op_xpu.cc b/paddle/fluid/operators/matmul_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..ff038d7ef1223d7e7ddafe942492b845172d4986 --- /dev/null +++ b/paddle/fluid/operators/matmul_op_xpu.cc @@ -0,0 +1,343 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { + +static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) { + if (x_dim.size() > 1) { + return x_dim; + } + return framework::make_ddim({1, x_dim[0]}); +} + +static framework::Tensor FoldInitDims(const framework::Tensor &input) { + auto output = input; + auto in_dims = input.dims(); + if (in_dims.size() == 3) { + output.Resize({in_dims[0] * in_dims[1], in_dims[2]}); + } + return output; +} +/** + * Get column matrix shape from a vector shape. If the rank of y_dim > 1, the + * original y_dim is returned. + */ +static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) { + if (y_dim.size() > 1) { + return y_dim; + } + return framework::make_ddim({y_dim[0], 1}); +} + +static void ReshapeTensorIntoMatrixSequence( + framework::Tensor *x, const math::MatDescriptor &descriptor) { + int64_t h, w; + h = descriptor.height_; + w = descriptor.width_; + if (descriptor.trans_) { + std::swap(w, h); + } + if (descriptor.batch_size_) { + x->Resize({descriptor.batch_size_, h, w}); + } else { + x->Resize({h, w}); + } +} +/** + * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor + * Out = matmul(x, y) + * + * This method will first calculate X,Y matrix sequence, and then calculate + * the out shape. + * + * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2] + * The out = [BatchSize, H1, W2] + * + * If there is no batch size in `X` and `Y`, the out will be [H1, W2] + * If any of `X` and `Y` has batch size BatchSize, the out will have the + * BatchSize. + */ +static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x, + framework::Tensor *y, + framework::Tensor *out, bool trans_x, + bool trans_y) { + auto x_dim = RowMatrixFromVector(x->dims()); + auto y_dim = ColumnMatrixFromVector(y->dims()); + auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x); + auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y); + if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { + out->Resize({mat_dim_x.height_, mat_dim_y.width_}); + } else { + out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_), + mat_dim_x.height_, mat_dim_y.width_}); + } + + ReshapeTensorIntoMatrixSequence(x, mat_dim_x); + ReshapeTensorIntoMatrixSequence(y, mat_dim_y); +} + +template +class MatMulXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *y = context.Input("Y"); + auto *out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + auto mat_dim_a = math::CreateMatrixDescriptor( + RowMatrixFromVector(x->dims()), 0, context.Attr("transpose_X")); + auto mat_dim_b = + math::CreateMatrixDescriptor(ColumnMatrixFromVector(y->dims()), 0, + context.Attr("transpose_Y")); + PADDLE_ENFORCE_EQ( + mat_dim_a.width_, mat_dim_b.height_, + platform::errors::InvalidArgument("Shape mistake in matmul_op")); + PADDLE_ENFORCE_EQ( + mat_dim_a.batch_size_, mat_dim_b.batch_size_, + platform::errors::InvalidArgument("Shape mistake in matmul_op")); + T alpha = static_cast(context.Attr("alpha")); + + auto &dev_ctx = context.template device_context(); + float *data_c = out->data(); + if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) { + int r = + xpu::fc_int16(dev_ctx.x_context(), mat_dim_a.trans_, mat_dim_b.trans_, + mat_dim_a.height_, 
mat_dim_b.width_, mat_dim_a.width_, + alpha, x->data(), y->data(), 0.0f, data_c); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } else { + // batch matmul + int r = xpu::batched_gemm_int16(dev_ctx.x_context(), mat_dim_a.trans_, + mat_dim_b.trans_, mat_dim_a.batch_size_, + mat_dim_a.height_, mat_dim_b.width_, + mat_dim_a.width_, alpha, x->data(), + y->data(), data_c, nullptr, nullptr); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } + } +}; + +// Reshape a rank-3 tensor from P x M x N to M x (P * N). +// (Warning: This requires transposing data and writes into new memory.) +// Identity op if the tensor is not of rank 3. +template +static framework::Tensor XPUFoldHeadAndLastDims( + const DeviceContext &context, const framework::Tensor &input) { + auto in_dims = input.dims(); + if (in_dims.size() != 3) { + return input; + } + + framework::Tensor output; + output.Resize({in_dims[1], in_dims[0], in_dims[2]}); + output.mutable_data(context.GetPlace()); + std::vector in_shape_host = {static_cast(in_dims[0]), + static_cast(in_dims[1]), + static_cast(in_dims[2])}; + std::vector axis_host = {1, 0, 2}; + + int r = xpu::transpose(context.x_context(), input.data(), output.data(), + in_shape_host.data(), axis_host.data(), /*ndims=*/3); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); + + return output; +} + +// Using dimensional constraints on matrix multiplication, it is +// straight-forward to check the following table for when X and Y +// are both matrices. +// +// transpose_X | False | True | False | True +// transpose_Y | False | False | True | True +// -----------+----------+----------+----------+----------- +// dX = | dOut Y^T | Y dOut^T | dOut Y | Y^T dOut^T +// dY = | X^T dOut | X dOut | dOut^T X | dOut^T X^T +// +// When X is a vector of size K, we treat it instead as a matrix of shape +// (1, K). Similarly, when Y is a vector of size K, we treat it instead as +// a matrix of shape (K, 1). +// +// When X and Y are both 3-dimensional tensors, then the first dimension +// the batch dimension can be ignored and the exact same formulas apply +// as for two matrices. +// +// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end +// up with formulas like +// +// dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj} +// +// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N +// to X: (P * M) x K, dOut: (P * M) x N. 
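+// +// For the first column of the table (transpose_X = transpose_Y = false) the +// formulas follow from the chain rule in one line, using the same index +// convention as above: +// +// dX_{ik} = \sum_j dOut_{ij} Y_{kj} = (dOut Y^T)_{ik} +// dY_{kj} = \sum_i X_{ik} dOut_{ij} = (X^T dOut)_{kj} +// +// The remaining columns follow by substituting X^T or Y^T for X or Y +// before differentiating.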
+template +class MatMulGradXPUKernel : public framework::OpKernel { + public: + void MatMul(const framework::ExecutionContext &context, + const framework::Tensor &a, bool trans_a, + const framework::Tensor &b, bool trans_b, + framework::Tensor *out) const { + out->mutable_data(context.GetPlace()); + auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a); + auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b); + PADDLE_ENFORCE_EQ( + mat_dim_a.width_, mat_dim_b.height_, + platform::errors::InvalidArgument("Shape mistake in matmul_grad_op")); + PADDLE_ENFORCE_EQ( + mat_dim_a.batch_size_, mat_dim_b.batch_size_, + platform::errors::InvalidArgument("Shape mistake in matmul_grad_op")); + T alpha = static_cast(context.Attr("alpha")); + + auto &dev_ctx = context.template device_context(); + float *data_c = out->data(); + if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) { + int r = + xpu::fc_int16(dev_ctx.x_context(), mat_dim_a.trans_, mat_dim_b.trans_, + mat_dim_a.height_, mat_dim_b.width_, mat_dim_a.width_, + alpha, a.data(), b.data(), 0.0f, data_c); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } else { + // batch matmul + int r = xpu::batched_gemm_int16(dev_ctx.x_context(), mat_dim_a.trans_, + mat_dim_b.trans_, mat_dim_a.batch_size_, + mat_dim_a.height_, mat_dim_b.width_, + mat_dim_a.width_, alpha, a.data(), + b.data(), data_c, nullptr, nullptr); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } + } + + void CalcInputGrad(const framework::ExecutionContext &context, + const framework::Tensor &a, bool trans_a, + bool is_fold_init_dims_a, const framework::Tensor &b, + bool trans_b, bool is_fold_init_dims_b, + framework::Tensor *out) const { + if (out == nullptr) return; + bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) && + out->dims().size() == 2; + if (!need_combine) { + MatMul(context, a, trans_a, b, trans_b, out); + } else { + auto &dev_ctx = context.template device_context(); + MatMul( + context, is_fold_init_dims_a + ? FoldInitDims(a) + : XPUFoldHeadAndLastDims(dev_ctx, a), + trans_a, is_fold_init_dims_b + ? 
FoldInitDims(b) + : XPUFoldHeadAndLastDims(dev_ctx, b), + trans_b, out); + } + } + + void Compute(const framework::ExecutionContext &context) const override { + auto x = *context.Input("X"); + auto y = *context.Input("Y"); + auto dout = + *context.Input(framework::GradVarName("Out")); + auto *dx = context.Output(framework::GradVarName("X")); + auto *dy = context.Output(framework::GradVarName("Y")); + bool transpose_x = context.Attr("transpose_X"); + bool transpose_y = context.Attr("transpose_Y"); + + ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); + + framework::DDim dx_dims; + if (dx) { + dx_dims = dx->dims(); + if (dx_dims != x.dims()) { + dx->Resize(x.dims()); + } + } + + framework::DDim dy_dims; + if (dy) { + dy_dims = dy->dims(); + if (dy_dims != y.dims()) { + dy->Resize(y.dims()); + } + } + + if (transpose_x && transpose_y) { + CalcInputGrad(context, y, true, true, dout, true, false, dx); + CalcInputGrad(context, dout, true, true, x, true, false, dy); + } else if (transpose_x) { + CalcInputGrad(context, y, false, false, dout, true, false, dx); + CalcInputGrad(context, x, false, false, dout, false, true, dy); + } else if (transpose_y) { + CalcInputGrad(context, dout, false, false, y, false, true, dx); + CalcInputGrad(context, dout, true, true, x, false, true, dy); + } else { + CalcInputGrad(context, dout, false, false, y, true, false, dx); + CalcInputGrad(context, x, true, true, dout, false, true, dy); + } + + if (dx) { + if (dx_dims != x.dims()) { + dx->Resize(dx_dims); + } + } + + if (dy) { + if (dy_dims != y.dims()) { + dy->Resize(dy_dims); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + matmul, ops::MatMulXPUKernel); +REGISTER_OP_XPU_KERNEL( + matmul_grad, + ops::MatMulGradXPUKernel); +#endif diff --git a/paddle/fluid/operators/matmul_v2_op.cu b/paddle/fluid/operators/matmul_v2_op.cu index 64ec65a23419725c7cc481beadb9383402a426bd..91958513ddb3c9923487e5de86f188bc3a0a6f65 100644 --- a/paddle/fluid/operators/matmul_v2_op.cu +++ b/paddle/fluid/operators/matmul_v2_op.cu @@ -17,10 +17,12 @@ limitations under the License. 
*/ namespace ops = paddle::operators; namespace plf = paddle::platform; -REGISTER_OP_CUDA_KERNEL(matmul_v2, - ops::MatMulV2Kernel, - ops::MatMulV2Kernel); +REGISTER_OP_CUDA_KERNEL( + matmul_v2, ops::MatMulV2Kernel, + ops::MatMulV2Kernel, + ops::MatMulV2Kernel); REGISTER_OP_CUDA_KERNEL( matmul_v2_grad, ops::MatMulV2GradKernel, - ops::MatMulV2GradKernel); + ops::MatMulV2GradKernel, + ops::MatMulV2GradKernel); diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index 8cd4fa12be4065b3ece42e7525481f2f04f35bc8..ee485bd1711e21b86cdf65fdb2f5f0793e42beb4 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -163,17 +163,20 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, if (trans_y) { const int M = Y->numel() / N; VLOG(3) << "MatMul's case 2"; - blas.GEMV(false, M, N, 1., y_data, x_data, 0., Out->data()); + blas.GEMV(false, M, N, static_cast(1), y_data, x_data, + static_cast(0), Out->data()); } else { const int M = y_dims[y_ndim - 1]; const int batch_size = Y->numel() / (M * N); if (batch_size == 1) { VLOG(3) << "MatMul's case 3"; - blas.GEMV(true, N, M, 1., y_data, x_data, 0., Out->data()); + blas.GEMV(true, N, M, static_cast(1), y_data, x_data, + static_cast(0), Out->data()); } else { VLOG(3) << "MatMul's case 4"; - blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, y_data, - x_data, 0, Out->data(), batch_size, M * N, 0); + blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, static_cast(1), + y_data, x_data, static_cast(0), Out->data(), + batch_size, M * N, 0); } } return; @@ -205,16 +208,19 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, const int batch_size = X->numel() / (M * N); if (batch_size == 1) { VLOG(3) << "MatMul's case 5"; - blas.GEMV(true, N, M, 1.0f, x_data, y_data, 0.0f, Out->data()); + blas.GEMV(true, N, M, static_cast(1), x_data, y_data, + static_cast(0), Out->data()); } else { VLOG(3) << "MatMul's case 6"; - blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, x_data, - y_data, 0, Out->data(), batch_size, M * N, 0); + blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, static_cast(1), + x_data, y_data, static_cast(0), Out->data(), + batch_size, M * N, 0); } } else { const int M = X->numel() / N; VLOG(3) << "MatMul's case 7"; - blas.GEMV(false, M, N, 1.0f, x_data, y_data, 0.0f, Out->data()); + blas.GEMV(false, M, N, static_cast(1), x_data, y_data, + static_cast(0), Out->data()); } return; } @@ -263,37 +269,38 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, if (x_batch_size == 1 && y_batch_size == 1) { VLOG(3) << "MatMul's case 8"; blas.GEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data, - y_data, 0.0f, Out->data()); + trans_y ? CblasTrans : CblasNoTrans, M, N, K, static_cast(1), + x_data, y_data, static_cast(0), Out->data()); } else if (x_batch_size == 1) { if (M == 1 && trans_y) { VLOG(3) << "MatMul's case 9"; - blas.GEMV(false, y_batch_size * N, K, 1.0f, y_data, x_data, 0.0f, - Out->data()); + blas.GEMV(false, y_batch_size * N, K, static_cast(1), y_data, x_data, + static_cast(0), Out->data()); } else { VLOG(3) << "MatMul's case 10"; blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, - x_data, y_data, 0, Out->data(), out_batch_size, 0, - K * N); + trans_y ? 
CblasTrans : CblasNoTrans, M, N, K, + static_cast(1), x_data, y_data, static_cast(0), + Out->data(), out_batch_size, 0, K * N); } } else if (y_batch_size == 1) { if (!trans_x) { VLOG(3) << "MatMul's case 11"; blas.GEMM(CblasNoTrans, trans_y ? CblasTrans : CblasNoTrans, - x_batch_size * M, N, K, 1.0f, x_data, y_data, 0.0f, - Out->data()); + x_batch_size * M, N, K, static_cast(1), x_data, y_data, + static_cast(0), Out->data()); } else { VLOG(3) << "MatMul's case 12"; blas.BatchedGEMM(CblasTrans, trans_y ? CblasTrans : CblasNoTrans, M, N, K, - 1.0f, x_data, y_data, 0, Out->data(), out_batch_size, - M * K, 0); + static_cast(1), x_data, y_data, static_cast(0), + Out->data(), out_batch_size, M * K, 0); } } else if (!is_broadcast_dims) { VLOG(3) << "MatMul's case 13"; blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data, - y_data, 0, Out->data(), out_batch_size, M * K, K * N); + trans_y ? CblasTrans : CblasNoTrans, M, N, K, + static_cast(1), x_data, y_data, static_cast(0), + Out->data(), out_batch_size, M * K, K * N); } else { // in the case, can't use stridedgemm std::vector x_ptr(out_batch_size); @@ -314,9 +321,9 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, } VLOG(3) << "MatMul's case 14"; blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, - x_ptr.data(), y_ptr.data(), 0.0f, out_ptr.data(), - out_batch_size); + trans_y ? CblasTrans : CblasNoTrans, M, N, K, + static_cast(1), x_ptr.data(), y_ptr.data(), + static_cast(0), out_ptr.data(), out_batch_size); } } diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index a6cda154e55b972fc653cffc4815f9e0f6e975de..7a4e11091fd3a6d064f3c4d905bb65c61d62d882 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -55,12 +55,12 @@ inline MKLDNNMemoryFormat GetWeightsFormat(const MKLDNNMemoryFormat format, } } -static mkldnn::memory::data_type GetDstType(bool is_int8, +static mkldnn::memory::data_type GetDstType(bool is_int8, bool is_bfloat16, bool force_fp32_output, std::string fuse_activation, bool fuse_residual_conn, const Tensor* residual_param) { - auto dst_dt = mkldnn::memory::data_type::f32; // uint8_t, int8_t, float + auto dst_dt = mkldnn::memory::data_type::f32; if (is_int8) { dst_dt = (fuse_activation == "relu" || fuse_activation == "relu6") ? 
mkldnn::memory::data_type::u8 @@ -72,6 +72,13 @@ static mkldnn::memory::data_type GetDstType(bool is_int8, auto residual_dt = framework::ToMKLDNNDataType(residual_param->type()); if (dst_dt != residual_dt) dst_dt = residual_dt; } + } else { + if (!force_fp32_output && is_bfloat16) { + dst_dt = mkldnn::memory::data_type::bf16; + if (fuse_residual_conn && residual_param) { + dst_dt = framework::ToMKLDNNDataType(residual_param->type()); + } + } } return dst_dt; } @@ -224,12 +231,15 @@ class ConvMKLDNNHandlerT src_tz.size(), chosen_memory_format); } } - - const auto src_md = platform::MKLDNNMemDesc( - src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - const auto weights_md = - platform::MKLDNNMemDesc(weights_tz, platform::MKLDNNGetDataType(), - MKLDNNMemoryFormat::any); + auto data_type = mkldnn::memory::data_type::f32; + if (ctx.Attr("mkldnn_data_type") == "bfloat16" || + std::is_same::value) + data_type = mkldnn::memory::data_type::bf16; + + const auto src_md = + platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format); + const auto weights_md = platform::MKLDNNMemDesc(weights_tz, data_type, + MKLDNNMemoryFormat::any); const auto dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); @@ -241,8 +251,8 @@ class ConvMKLDNNHandlerT if (bias) { auto bias_tz = framework::vectorize(bias->dims()); - auto bias_md = platform::MKLDNNMemDesc( - bias_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::x); + auto bias_md = + platform::MKLDNNMemDesc(bias_tz, data_type, MKLDNNMemoryFormat::x); this->AcquireForwardPrimitiveDescriptor( conv_attr, fwd_prop_kind, dnnl::algorithm::convolution_direct, @@ -384,15 +394,21 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { "Operator DNNL Conv must use CPUPlace")); bool is_INT8 = std::is_same::value || std::is_same::value; + bool is_BFLOAT16 = ctx.Attr("mkldnn_data_type") == "bfloat16"; + auto residual_param = ctx.Input("ResidualData"); + bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); + std::string fuse_activation = ctx.Attr("fuse_activation"); + bool force_fp32_output = ctx.Attr("force_fp32_output"); + auto dst_dt = + GetDstType(is_INT8, is_BFLOAT16, force_fp32_output, fuse_activation, + fuse_residual_conn, residual_param); if (!is_INT8) { - ComputeFP32(ctx); + if (dst_dt == mkldnn::memory::data_type::f32) { + ComputeFP32(ctx); + } else if (dst_dt == mkldnn::memory::data_type::bf16) { + ComputeFP32(ctx); + } } else { - std::string fuse_activation = ctx.Attr("fuse_activation"); - bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); - bool force_fp32_output = ctx.Attr("force_fp32_output"); - auto residual_param = ctx.Input("ResidualData"); - auto dst_dt = GetDstType(true, force_fp32_output, fuse_activation, - fuse_residual_conn, residual_param); if (dst_dt == mkldnn::memory::data_type::f32) { ComputeINT8(ctx); } else if (dst_dt == mkldnn::memory::data_type::u8) { @@ -1103,6 +1119,10 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, ops::kConvMKLDNNFP32, ops::ConvMKLDNNOpKernel); +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + conv2d, MKLDNN, ::paddle::platform::CPUPlace, BF16, ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); + REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, ::paddle::platform::CPUPlace, U8, ops::kConvMKLDNNINT8, diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index 540642c7140e707441ad9c4d71ae9b777863a7bd..70d4c34d9c5c4d28e2705c85f56bc65f90fbb3cf 100644 --- 
a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -110,4 +110,5 @@ class DeQuantOpKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(dequantize, MKLDNN, ::paddle::platform::CPUPlace, - ops::DeQuantOpKernel, ops::DeQuantOpKernel); + ops::DeQuantOpKernel, ops::DeQuantOpKernel, + ops::DeQuantOpKernel); diff --git a/paddle/fluid/operators/xpu/mul_xpu_op.cc b/paddle/fluid/operators/mul_op_xpu.cc similarity index 100% rename from paddle/fluid/operators/xpu/mul_xpu_op.cc rename to paddle/fluid/operators/mul_op_xpu.cc index 79aae71c3045f938f4b8f0d3e05ce7cf358c41ea..0c8469101abf097629e2768ee35f72a0e0f72bc5 100644 --- a/paddle/fluid/operators/xpu/mul_xpu_op.cc +++ b/paddle/fluid/operators/mul_op_xpu.cc @@ -14,11 +14,11 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/mul_op.h" #include #include #include #include -#include "paddle/fluid/operators/mul_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index 07333f1ae11c3889b543ca6d327e480607a4bcea..02dcb4045f4cdee6840f5caef98d7329e706eaf2 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -40,12 +40,12 @@ using DataLayout = framework::DataLayout; // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * // np.sum(dy, // axis=(n,h,w)) * (x - mean) * -// (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var - +// (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var - // inv_var // * // np.mean(dy, axis=(n,h,w)) - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), -// axis=(n,h,w)))) +// axis=(n,h,w))) template __global__ void DoubleGradComputeDX(const T *x, const T *mean, @@ -138,7 +138,7 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean, ? (j / sample_size * C + i) * sample_size + j % sample_size : j * outer_size + i; dx[index] += (dy[index] * var_val - dy_sum_val / inner_size * var_val - - (x[index] - mean_val) * var_val * + (x[index] - mean_val) * var_val * var_val * dy_mul_x_sub_mean_sum_val * var_val / inner_size) * ddscale[i]; } @@ -326,19 +326,57 @@ __global__ void DoubleGradComputeDScaleWithGlobal( } // math: dx = ddscale * dy * inv_var -// math: ddy = scale * ddx * inv_var template -__global__ void DoubleGradComputeDataWithGlobal( - const T *dy, const T *scale, const T *variance, const double epsilon, - const int C, const int sample_size, const int num, T *dx) { +__global__ void DoubleGradComputeDXWithGlobal(const T *dy, const T *ddscale, + const T *variance, + const double epsilon, const int C, + const int sample_size, + const int num, T *dx) { int gid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; - if (scale != nullptr) { + if (ddscale != nullptr) { for (int i = gid; i < num; i += stride) { const int c = layout == framework::DataLayout::kNCHW ? 
i / sample_size % C : i % C; T inv_var = 1.0 / sqrt(variance[c] + epsilon); - dx[i] = dy[i] * scale[c] * inv_var; + dx[i] = dy[i] * ddscale[c] * inv_var; + } + } +} + +// math: ddy = scale * ddx * inv_var + ddbias + +// ddscale * (x - mean) * inv_var +template +__global__ void DoubleGradComputeDDYWithGlobal( + const T *ddx, const T *scale, const T *mean, const T *variance, const T *x, + const T *ddbias, const T *ddscale, const double epsilon, const int C, + const int sample_size, const int num, T *ddy) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + if (ddx != nullptr) { + for (int i = gid; i < num; i += stride) { + const int c = + layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C; + T inv_var = 1.0 / sqrt(variance[c] + epsilon); + ddy[i] += ddx[i] * scale[c] * inv_var; + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = gid; i < num; i += stride) { + const int c = + layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C; + T inv_var = 1.0 / sqrt(variance[c] + epsilon); + ddy[i] += (x[i] - mean[c]) * inv_var * ddscale[c]; + } + } + __syncthreads(); + if (ddbias != nullptr) { + for (int i = gid; i < num; i += stride) { + const int c = + layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C; + ddy[i] += ddbias[c]; } } } @@ -383,8 +421,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, const T *mean_data, *variance_data; if (use_global_stats) { + const auto *running_mean = ctx.Input("Mean"); const auto *running_var = ctx.Input("Variance"); + const auto *running_mean_data = running_mean->template data(); const auto *running_var_data = running_var->template data(); + mean_data = running_mean_data; variance_data = running_var_data; } else { const T *smean_data = Saved_mean->data(); @@ -398,12 +439,12 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, set_constant(dev_ctx, dX, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDataWithGlobal< + DoubleGradComputeDXWithGlobal< T, DataLayout::kNHWC><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); } else { - DoubleGradComputeDataWithGlobal< + DoubleGradComputeDXWithGlobal< T, DataLayout::kNCHW><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); @@ -456,15 +497,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, set_constant(dev_ctx, ddY, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDataWithGlobal< + DoubleGradComputeDDYWithGlobal< T, DataLayout::kNHWC><<>>( - ddx_data, scale_data, variance_data, epsilon, C, sample_size, num, - ddy_data); + ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, + ddscale_data, epsilon, C, sample_size, num, ddy_data); } else { - DoubleGradComputeDataWithGlobal< + DoubleGradComputeDDYWithGlobal< T, DataLayout::kNCHW><<>>( - ddx_data, scale_data, variance_data, epsilon, C, sample_size, num, - ddy_data); + ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, + ddscale_data, epsilon, C, sample_size, num, ddy_data); } } else { if (data_layout == DataLayout::kNHWC) { diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.cc b/paddle/fluid/operators/optimizers/dpsgd_op.cc index 3bcf17fc7b37f1c29e23ccccc7d4df92e705671e..bce00933420e4cfa750e93d563070cc48051d6cc 100644 --- a/paddle/fluid/operators/optimizers/dpsgd_op.cc +++ 
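
Note: with use_global_stats, the DoubleGradComputeDDYWithGlobal kernel above reduces to a per-channel affine expression, ddY = scale * ddX * inv_var + ddbias + ddscale * (x - mean) * inv_var, with inv_var = 1 / sqrt(variance + epsilon). A minimal NumPy sketch for NHWC layout (channel stats broadcast over the last axis; the function name and shapes are mine, not Paddle's):

import numpy as np

def ddy_with_global_stats(ddx, scale, mean, var, x, ddbias, ddscale, eps=1e-5):
    # per-channel inverse standard deviation
    inv_var = 1.0 / np.sqrt(var + eps)
    ddy = np.zeros_like(x)
    if ddx is not None:
        ddy += ddx * scale * inv_var
    if ddscale is not None:
        ddy += (x - mean) * inv_var * ddscale
    if ddbias is not None:
        ddy += ddbias
    return ddy

x = np.random.rand(2, 4, 4, 3)  # NHWC, C == 3
c = np.random.rand(3)
print(ddy_with_global_stats(np.ones_like(x), c, c, c, x, c, c).shape)
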
b/paddle/fluid/operators/optimizers/dpsgd_op.cc @@ -24,32 +24,45 @@ class DpsgdOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true, - "Input(Param) of DpsgdOp should not be null."); + platform::errors::NotFound( + "Input(Param) of DpsgdOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true, - "Input(Grad) of DpsgdOp should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true, - "Input(LearningRate) of DpsgdOp should not be null."); + platform::errors::NotFound( + "Input(Grad) of DpsgdOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("LearningRate"), true, + platform::errors::NotFound( + "Input(LearningRate) of DpsgdOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->GetInputsVarType("Param").front(), framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); + platform::errors::InvalidArgument( + "The input var's type should be LoDTensor, but the received is %s", + ctx->GetInputsVarType("Param").front())); PADDLE_ENFORCE_EQ( ctx->GetInputsVarType("Grad").front(), framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); + platform::errors::InvalidArgument( + "The input var's type should be LoDTensor, but the received is %s", + ctx->GetInputsVarType("Grad").front())); PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true, - "Output(ParamOut) of DpsgdOp should not be null."); + platform::errors::NotFound( + "Output(ParamOut) of DpsgdOp should not be null.")); auto lr_dims = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, - "Learning rate should have 1 dimension"); + platform::errors::InvalidArgument( + "Learning rate should have 1 dimension. But received " + "LearningRate's dims [%s].", + framework::product(lr_dims))); auto param_dims = ctx->GetInputDim("Param"); PADDLE_ENFORCE_EQ( param_dims, ctx->GetInputDim("Grad"), - "Param and Grad input of DpsgdOp should have same dimension"); + platform::errors::InvalidArgument( + "Param and Grad input of DpsgdOp should have the same dimension. 
But " + "received Para's dim [%s] and Grad's dim [%s].", + param_dims, ctx->GetInputDim("Grad"))); ctx->SetOutputDim("ParamOut", param_dims); } diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.h b/paddle/fluid/operators/optimizers/dpsgd_op.h index 4eb52feb85108637aa4878d3555bc3cbff674420..e52a1dd9db1791e0be82eb1ee47999d2b8f51175 100644 --- a/paddle/fluid/operators/optimizers/dpsgd_op.h +++ b/paddle/fluid/operators/optimizers/dpsgd_op.h @@ -28,17 +28,19 @@ class DpsgdOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext &ctx) const override { const auto *param_var = ctx.InputVar("Param"); PADDLE_ENFORCE_EQ(param_var->IsType(), true, - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type())); + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); const auto *grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type())); + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Grad").front(), + framework::ToTypeName(grad_var->Type()))); const auto *learning_rate = ctx.Input("LearningRate"); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 10b72524efd4a8f9174eab4f45e6173dc56f2c27..083bd91abfc47a4712563c739b333f7417ce21a0 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -40,43 +40,62 @@ class MomentumOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Param"), - "Input(param) of Momentum should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Grad"), - "Input(grad) of Momentum should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Velocity"), - "Input(velocity) of Momentum should not be null."); - PADDLE_ENFORCE(ctx->HasInput("LearningRate"), - "Input(LearningRate) of Momentum should not be null."); - PADDLE_ENFORCE( - ctx->GetInputsVarType("Param").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); - - PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), - "Output(ParamOut) of Momentum should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"), - "Output(VelocityOut) of Momentum should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true, + platform::errors::NotFound( + "Input(param) of Momentum should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true, + platform::errors::NotFound( + "Input(grad) of Momentum should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Velocity"), true, + platform::errors::NotFound( + "Input(velocity) of Momentum should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("LearningRate"), true, + platform::errors::NotFound( + "Input(LearningRate) of Momentum should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->GetInputsVarType("Param").front(), + framework::proto::VarType::LOD_TENSOR, + platform::errors::InvalidArgument( + "The input var's type should be LoDTensor, 
but the received is %s", + ctx->GetInputsVarType("Param").front())); + + PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true, + platform::errors::NotFound( + "Output(ParamOut) of Momentum should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("VelocityOut"), true, + platform::errors::NotFound( + "Output(VelocityOut) of Momentum should not be null.")); auto lr_dims = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, - "Maybe the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function."); + platform::errors::InvalidArgument( + "Maybe the Input variable LearningRate has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, - "Learning_rate should be a scalar"); + platform::errors::InvalidArgument( + "Learning_rate should be a scalar. But Received " + "LearningRate's dim [%s]", + framework::product(lr_dims))); auto param_dim = ctx->GetInputDim("Param"); if (ctx->GetInputsVarType("Grad")[0] == framework::proto::VarType::LOD_TENSOR) { PADDLE_ENFORCE_EQ( param_dim, ctx->GetInputDim("Grad"), - "Param and Grad input of MomentumOp should have the same dimension."); + platform::errors::InvalidArgument( + "Param and Grad input of MomentumOp should have the same " + "dimension. But received Param's dim [%s] and Grad's dim [%s].", + param_dim, ctx->GetInputDim("Grad"))); PADDLE_ENFORCE_EQ( param_dim, ctx->GetInputDim("Velocity"), - "Param and Velocity of MomentumOp should have the same dimension."); + platform::errors::InvalidArgument( + "Param and Velocity of MomentumOp should have the same " + "dimension. But received Param's dim [%s] and Velocity [%s].", + param_dim, ctx->GetInputDim("Velocity"))); } ctx->SetOutputDim("ParamOut", param_dim); @@ -398,10 +417,12 @@ class MomentumOpKernel : public framework::OpKernel { for_range(functor); } } else { - PADDLE_THROW( - string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows " - "gradient, but the received Variable Type is %s", - framework::ToTypeName(grad_var->Type()))); + PADDLE_ENFORCE_EQ(false, true, + platform::errors::PermissionDenied( + "Unsupported Variable Type of Grad " + "in MomentumOp. 
Expected LoDTensor " + "or SelectedRows, but received [%s]", + paddle::framework::ToTypeName(grad_var->Type()))); } } }; diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cc b/paddle/fluid/operators/optimizers/rmsprop_op.cc index eeee008cdc53c457146074060d526d8d0e8b43aa..9e7960c237fdec4d61ab8cf2663558b7b596d087 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op.cc @@ -22,47 +22,75 @@ class RmspropOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Param"), - "Input(Param) of RmspropOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("MeanSquare"), - "Input(MeanSquare) of RmspropOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("LearningRate"), - "Input(LearningRate) of RmspropOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Grad"), - "Input(Grad) of RmspropOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Moment"), - "Input(Moment) of RmspropOp should not be null."); - PADDLE_ENFORCE( - ctx->GetInputsVarType("Param").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); - - PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), - "Output(param_out) of RmspropOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), - "Output(MomentOut) of RmspropOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"), - "Output(MeanSquareOut) of RmspropOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true, + platform::errors::NotFound( + "Input(Param) of RmspropOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("MeanSquare"), true, + platform::errors::NotFound( + "Input(MeanSquare) of RmspropOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("LearningRate"), true, + platform::errors::NotFound( + "Input(LearningRate) of RmspropOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true, + platform::errors::NotFound( + "Input(Grad) of RmspropOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Moment"), true, + platform::errors::NotFound( + "Input(Moment) of RmspropOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Param").front(), + framework::proto::VarType::LOD_TENSOR, + platform::errors::InvalidArgument( + "The input var's type in RmspropOp should be " + "LoDTensor, but the received is %s", + ctx->GetInputsVarType("Param").front())); + + PADDLE_ENFORCE_EQ( + ctx->HasOutput("ParamOut"), true, + platform::errors::NotFound( + "Output(param_out) of RmspropOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("MomentOut"), true, + platform::errors::NotFound( + "Output(MomentOut) of RmspropOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("MeanSquareOut"), true, + platform::errors::NotFound( + "Output(MeanSquareOut) of RmspropOp should not be null.")); if (ctx->Attrs().Get("centered")) { - PADDLE_ENFORCE(ctx->HasOutput("MeanGradOut"), - "Output(MeanGradOut) of RmspropOp should not be null."); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("MeanGradOut"), true, + platform::errors::NotFound( + "Output(MeanGradOut) of RmspropOp should not be null.")); } auto param_dim = ctx->GetInputDim("Param"); PADDLE_ENFORCE_EQ( param_dim, ctx->GetInputDim("Grad"), - "Param and grad input of RmspropOp should have the
same dimension."); + platform::errors::InvalidArgument( + "Param and grad input of RmspropOp should have the same dimension. " + "But received Param's dim [%s] and Grad's dim [%s].", + param_dim, ctx->GetInputDim("Grad"))); PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Moment"), - "Param and Momentum input of RmspropOp " - "should have the same dimension.", + platform::errors::InvalidArgument( + "Param and Moment input of RmspropOp " + "should have the same dimension. But received " + "Param's dim [%s] and Moment's dim [%s]", + param_dim, ctx->GetInputDim("Moment"))); PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("MeanSquare"), - "Param and Momentum input of RmspropOp " - "should have the same dimension.", + platform::errors::InvalidArgument( + "Param and MeanSquare input of RmspropOp " + "should have the same dimension. But received " + "Param's dim [%s] and MeanSquare's dim [%s]", + param_dim, ctx->GetInputDim("MeanSquare"))); auto lr_dim = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1, - "Learning Rate should be a scalar."); + platform::errors::InvalidArgument( + "Learning Rate of RmspropOp should be a scalar. But " + "received LearningRate's dim [%s]", + framework::product(lr_dim))); ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("MomentOut", param_dim); diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.h b/paddle/fluid/operators/optimizers/rmsprop_op.h index 4550052b2d614ccbbb09f4a2b9e747708b2a2baa..1ec712a1431a4657cf1c1456da91fe5369914438 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.h +++ b/paddle/fluid/operators/optimizers/rmsprop_op.h @@ -148,11 +148,15 @@ class RmspropOpKernel : public framework::OpKernel { auto &mom_tensor = *ctx.Input("Moment"); PADDLE_ENFORCE_EQ(&p_tensor, param_out, - "Param and ParamOut must be the same Tensor"); + platform::errors::InvalidArgument( + "Param and ParamOut must be the same Tensor")); PADDLE_ENFORCE_EQ(&mom_tensor, moment_out, - "Moment and MomentOut must be the same Tensor"); - PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out, - "MeanSquare and MeanSquareOut must be the same Tensor"); + platform::errors::InvalidArgument( + "Moment and MomentOut must be the same Tensor")); + PADDLE_ENFORCE_EQ( + &ms_tensor, mean_square_out, + platform::errors::InvalidArgument( + "MeanSquare and MeanSquareOut must be the same Tensor")); auto &dev_ctx = ctx.template device_context(); size_t limit = static_cast(ms_tensor.numel()); @@ -179,8 +183,10 @@ class RmspropOpKernel : public framework::OpKernel { auto &mg_tensor = *ctx.Input("MeanGrad"); auto mg = EigenVector::Flatten(mg_tensor); auto *mean_grad_out = ctx.Output("MeanGradOut"); - PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out, - "MeanGrad and MeanGradOut must be the same Tensor"); + PADDLE_ENFORCE_EQ( + &mg_tensor, mean_grad_out, + platform::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); auto mg_out = EigenVector::Flatten(*mean_grad_out); mg_out.device(place) = rho * mg + (1 - rho) * g; @@ -198,8 +204,10 @@ class RmspropOpKernel : public framework::OpKernel { if (centered) { auto &mg_tensor = *ctx.Input("MeanGrad"); auto *mean_grad_out = ctx.Output("MeanGradOut"); - PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out, - "MeanGrad and MeanGradOut must be the same Tensor"); + PADDLE_ENFORCE_EQ( + &mg_tensor, mean_grad_out, + platform::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); for_range(CenteredRmspropFunctor>( param_out->mutable_data(ctx.GetPlace()),
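
Note: for reference, the dense update these checks guard is the usual RMSProp recurrence; a hedged NumPy sketch of one step (plain and centered variants, mirroring the Rmsprop/CenteredRmsprop functors as I read them):

import numpy as np

def rmsprop_step(p, g, ms, mom, mg, lr, rho, eps, momentum, centered):
    ms = rho * ms + (1 - rho) * g * g            # MeanSquareOut
    if centered:
        mg = rho * mg + (1 - rho) * g            # MeanGradOut
        denom = np.sqrt(ms - mg * mg + eps)
    else:
        denom = np.sqrt(ms + eps)
    mom = momentum * mom + lr * g / denom        # MomentOut
    return p - mom, ms, mom, mg                  # ParamOut and updated state

p, g = np.ones(4), np.full(4, 0.1)
p, ms, mom, mg = rmsprop_step(p, g, np.zeros(4), np.zeros(4), np.zeros(4),
                              lr=0.01, rho=0.9, eps=1e-6, momentum=0.9,
                              centered=True)
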
mean_square_out->mutable_data(ctx.GetPlace()), @@ -233,8 +241,10 @@ class RmspropOpKernel : public framework::OpKernel { if (centered) { auto &mg_tensor = *ctx.Input("MeanGrad"); auto *mean_grad_out = ctx.Output("MeanGradOut"); - PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out, - "MeanGrad and MeanGradOut must be the same Tensor"); + PADDLE_ENFORCE_EQ( + &mg_tensor, mean_grad_out, + platform::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); for_range(CenteredRmspropFunctor>( param_out->mutable_data(ctx.GetPlace()), mean_square_out->mutable_data(ctx.GetPlace()), @@ -249,7 +259,12 @@ class RmspropOpKernel : public framework::OpKernel { rho, epsilon, momentum, grad_func)); } } else { - PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient"); + PADDLE_ENFORCE_EQ(false, true, + platform::errors::PermissionDenied( + "Unsupported Variable Type of Grad " + "in RmspropOp. Expected LoDTensor " + "or SelectedRows, but received [%s]", + paddle::framework::ToTypeName(grad_var->Type()))); } } }; diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index aeff8da70b958a440953824c46a095a4f86b9379..569dbcd6a3ee105ae8dd8570bbb2215c32343d01 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -22,23 +22,31 @@ class SGDOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Param"), - "Input(Param) of SGDOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Grad"), - "Input(Grad) of SGDOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("LearningRate"), - "Input(LearningRate) of SGDOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), - "Output(ParamOut) of SGDOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true, + platform::errors::NotFound( + "Input(Param) of SGDOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Grad"), true, + platform::errors::NotFound("Input(Grad) of SGDOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true, + platform::errors::NotFound( + "Input(LearningRate) of SGDOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true, + platform::errors::NotFound( + "Output(ParamOut) of SGDOp should not be null.")); auto lr_dims = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, - "Maybe the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function."); + platform::errors::NotFound( + "Maybe the Input variable LearningRate has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, - "Learning rate should have 1 element"); + platform::errors::InvalidArgument( + "Learning rate should have 1 element. 
But received " + "LearningRate's dims [%s]", + framework::product(lr_dims))); auto param_dim = ctx->GetInputDim("Param"); if (ctx->GetInputsVarType("Grad")[0] == framework::proto::VarType::LOD_TENSOR) { diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index b70f24e0e5e8f2f6c6ac974942ccd4c4c3ad41bb..a5d9ad271f23a4d6a17f07a6c5b6a7d57aa7a3c5 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -57,11 +57,12 @@ class SGDOpKernel public: void Compute(const framework::ExecutionContext& ctx) const override { const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE(param_var->IsType(), - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type())); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + paddle::framework::ToTypeName(param_var->Type()))); auto* param = ctx.Input("Param"); auto* param_out = ctx.Output("ParamOut"); @@ -91,18 +92,30 @@ class SGDOpKernel // TODO(qijun): In Sparse SGD operator, in-place update is enforced. // This manual optimization brings difficulty to track data dependency. // It's better to find a more elegant solution. - PADDLE_ENFORCE_EQ(param, param_out); + PADDLE_ENFORCE_EQ( + param, param_out, + platform::errors::InvalidArgument( + "The input tensor Param of SgdOp should be equal to ParamOut " + "if variable's type is SelectedRows.")); auto* grad = ctx.Input("Grad"); auto in_height = grad->height(); auto out_dims = param_out->dims(); - PADDLE_ENFORCE_EQ(in_height, out_dims[0]); + PADDLE_ENFORCE_EQ(in_height, out_dims[0], + platform::errors::InvalidArgument( + "The input tensor Grad's height of SgdOp should be " + "equal to ParamOut's dims. But received Grad's " + "height [%s] and ParamOut's dims [%s]", + in_height, out_dims[0])); auto& in_value = grad->value(); auto& in_rows = grad->rows(); int64_t in_row_numel = in_value.numel() / in_rows.size(); - PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); + PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height, + platform::errors::InvalidArgument( + "The in_row_numel of SgdOp should be equal to " + "param_out's numel / in_height.")); auto* in_data = in_value.data(); auto* out_data = param_out->data(); @@ -118,7 +131,12 @@ class SGDOpKernel out_data, in_row_numel, in_rows.size()); } else { - PADDLE_THROW("Unsupported Variable Type of Grad"); + PADDLE_ENFORCE_EQ(false, true, + platform::errors::PermissionDenied( + "Unsupported Variable Type of Grad " + "in SgdOp. Expected LoDTensor or " + "SelectedRows, but received [%s]", + paddle::framework::ToTypeName(grad_var->Type()))); } } }; diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index 539d774a395d739ae98adeb6bd2679d0487d0f06..1aaf95efc3250747a4cb46ab79b4415a6b527907 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -44,8 +44,20 @@ class SGDOpKernel if (grad_var->IsType()) { const auto *grad = ctx.Input("Grad"); auto sz = param_out->numel(); - PADDLE_ENFORCE_EQ(param->numel(), sz); - PADDLE_ENFORCE_EQ(grad->numel(), sz); + PADDLE_ENFORCE_EQ(param->numel(), sz, + platform::errors::InvalidArgument( + "The input tensor Param's numel of SgdOp " + "should be equal to ParamOut's numel. 
" + "But received Param's " + "numel = [%s], ParamOut's numel = [%s]", + param->numel(), sz)); + PADDLE_ENFORCE_EQ(grad->numel(), sz, + platform::errors::InvalidArgument( + "The input tensor Grad's numel of SgdOp " + "should be equal with ParamOut's numel. " + "But received Grad's " + "numel = [%s], ParamOut's numel = [%s]", + grad->numel(), sz)); jit::sgd_attr_t attr(1, sz, 1, sz, 1); const T *lr = learning_rate->data(); @@ -62,7 +74,11 @@ class SGDOpKernel // TODO(qijun): In Sparse SGD operator, in-place update is enforced. // This manual optimization brings difficulty to track data dependency. // It's better to find a more elegant solution. - PADDLE_ENFORCE_EQ(param, param_out); + PADDLE_ENFORCE_EQ(param, param_out, + platform::errors::InvalidArgument( + "The input tensor Param of SgdOp " + "should be equal with ParamOut if variable's " + "type is SelectedRows. ")); const auto *grad = ctx.Input("Grad"); auto &grad_rows = grad->rows(); @@ -73,7 +89,13 @@ class SGDOpKernel } auto out_dims = param_out->dims(); - PADDLE_ENFORCE_EQ(grad->height(), out_dims[0]); + PADDLE_ENFORCE_EQ( + grad->height(), out_dims[0], + platform::errors::InvalidArgument( + "The input tensor Grad's height of SgdOp " + "should be equal with ParamOut's dims. But received Grad's " + "height [%s] and ParamOut's dims [%s]", + grad->height(), out_dims[0])); auto &grad_value = grad->value(); const T *param_data = param->data(); const T *grad_data = grad_value.data(); @@ -87,19 +109,31 @@ class SGDOpKernel attr.grad_height = grad_rows.size(); // note: it is not grad->height() attr.grad_width = grad_value.numel() / attr.grad_height; attr.selected_rows_size = grad_rows.size(); - PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width); + PADDLE_ENFORCE_EQ( + attr.grad_width, attr.param_width, + platform::errors::InvalidArgument( + "The grad_value's numel of SgdOp " + "should be equal with param_out's numel. But received " + "grad_value's numel [%s] and param_out's numel [%s]", + attr.grad_width, attr.param_width)); auto sgd = jit::KernelFuncs, platform::CPUPlace>::Cache().At( attr); sgd(lr, param_data, grad_data, rows_data, out_data, &attr); } else { - PADDLE_THROW("Unsupported Variable Type of Grad"); + PADDLE_ENFORCE_EQ( + false, true, + platform::errors::PermissionDenied( + "Unsupported Variable Type of Grad in SgdOp. Excepted " + "LodTensor or SelectedRows, But received [%s]", + paddle::framework::ToTypeName(grad_var->Type()))); } } else if (param_var->IsType()) { - PADDLE_ENFORCE(grad_var->IsType(), - "when param " - "is SelectedRows, gradient should also be SelectedRows"); + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + platform::errors::InvalidArgument( + "when param is SelectedRows, " + "gradient should also be SelectedRows")); const auto ¶m = param_var->Get(); auto *param_out = ctx.Output("ParamOut"); const auto &grad = grad_var->Get(); @@ -112,27 +146,36 @@ class SGDOpKernel auto param_row_width = param.value().dims()[1]; auto grad_row_width = grad.value().dims()[1]; - VLOG(4) << " param rows: " << param.rows().size() - << " param memory rows: " << param.value().dims()[0] - << " grad rows: " << grad.rows().size() - << " grad memory rows: " << grad.value().dims()[0]; - PADDLE_ENFORCE_EQ(param_row_width, grad_row_width, - "param_row should have the same size with grad_row"); + PADDLE_ENFORCE_EQ( + param_row_width, grad_row_width, + platform::errors::InvalidArgument( + "The param_row in SgdOP should have the same size with grad_row. 
" + "But received param_row's width is [%s], and grad_row's width is " + "[%s]", + param_row_width, grad_row_width)); const auto *lr = learning_rate->data(); const auto *grad_data = grad.value().data(); auto *out_data = param_out->mutable_value()->data(); for (size_t i = 0; i < grad.rows().size(); i++) { int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false); - PADDLE_ENFORCE_GE(id_index, static_cast(0), - "id should be in the table"); + PADDLE_ENFORCE_GE( + id_index, static_cast(0), + platform::errors::InvalidArgument( + "The id in SgdOp should be >= 0. But recevied id_index is [%s]", + id_index)); for (int64_t j = 0; j < grad_row_width; j++) { out_data[id_index * grad_row_width + j] -= lr[0] * grad_data[i * grad_row_width + j]; } } } else { - PADDLE_THROW("Unsupported Variable Type of Parameter"); + PADDLE_ENFORCE_EQ( + false, true, + platform::errors::PermissionDenied( + "Unsupported Variable Type of Parameter in SgdOp. Excepted " + "LodTensor or SelectedRows, But received [%s]", + paddle::framework::ToTypeName(param_var->Type()))); } } }; diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h index aec995304a77118ecbf788ca3984c7e9da531f18..05d077b173a13e457fd38187b832f9586926a2ee 100644 --- a/paddle/fluid/operators/utils.h +++ b/paddle/fluid/operators/utils.h @@ -41,7 +41,9 @@ inline std::vector GetDataFromTensor(const framework::Tensor* x) { // NOTE: Converting int64 to int32 may cause data overflow. vec_new_data = std::vector(data, data + x->numel()); } else { - PADDLE_THROW("The dtype of Tensor must be int32 or int64."); + PADDLE_THROW(platform::errors::InvalidArgument( + "The dtype of Tensor must be int32 or int64, but received: %s", + x->type())); } return vec_new_data; } @@ -53,10 +55,11 @@ inline std::vector GetDataFromTensorList( for (size_t i = 0; i < list_tensor.size(); ++i) { auto tensor = list_tensor[i]; PADDLE_ENFORCE_EQ(tensor->dims(), framework::make_ddim({1}), - "ShapeError: The shape of Tensor in list must be [1]. " - "But received the shape " - "is [%s]", - tensor->dims()); + platform::errors::InvalidArgument( + "The shape of Tensor in list must be [1]. 
" + "But received its shape " + "is [%s]", + tensor->dims())); if (tensor->type() == framework::proto::VarType::INT32) { if (platform::is_gpu_place(tensor->place())) { @@ -76,7 +79,10 @@ inline std::vector GetDataFromTensorList( vec_new_data.push_back(static_cast(*tensor->data())); } } else { - PADDLE_THROW("The dtype of Tensor in list must be int32 or int64."); + PADDLE_THROW(platform::errors::InvalidArgument( + "The dtype of Tensor in list must be int32 or int64, but received: " + "%s", + tensor->type())); } } return vec_new_data; diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index a5dd19d4363d6a8fa99cf48ef2969186de605127..4d9673e9646dedbd001eabcd70d4d34aecaa10b5 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -137,12 +137,12 @@ USE_CUDA_ATOMIC(Max, unsigned int); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT #else -CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { +CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { // NOLINT if (*address >= val) { return; } - unsigned long long int old = *address, assumed; + unsigned long long int old = *address, assumed; // NOLINT do { assumed = old; @@ -169,7 +169,7 @@ CUDA_ATOMIC_WRAPPER(Max, float) { return; } - int *const address_as_i = (int *)address; + int *const address_as_i = reinterpret_cast(address); int old = *address_as_i, assumed; do { @@ -187,9 +187,9 @@ CUDA_ATOMIC_WRAPPER(Max, double) { return; } - unsigned long long int *const address_as_ull = - (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; + unsigned long long int *const address_as_ull = // NOLINT + reinterpret_cast(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT do { assumed = old; @@ -209,12 +209,12 @@ USE_CUDA_ATOMIC(Min, unsigned int); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT #else -CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) { +CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) { // NOLINT if (*address <= val) { return; } - unsigned long long int old = *address, assumed; + unsigned long long int old = *address, assumed; // NOLINT do { assumed = old; @@ -241,7 +241,7 @@ CUDA_ATOMIC_WRAPPER(Min, float) { return; } - int *const address_as_i = (int *)address; + int *const address_as_i = reinterpret_cast(address); int old = *address_as_i, assumed; do { @@ -259,9 +259,9 @@ CUDA_ATOMIC_WRAPPER(Min, double) { return; } - unsigned long long int *const address_as_ull = - (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; + unsigned long long int *const address_as_ull = // NOLINT + reinterpret_cast(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT do { assumed = old; diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc index f14fbdd74f95bfbed53ff787af861ce4656159c0..f1832206a1abbbfccb5e79b39b1c67307aca6769 100644 --- a/paddle/fluid/platform/init_test.cc +++ b/paddle/fluid/platform/init_test.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/xpu_info.h" TEST(InitDevices, CPU) { using paddle::framework::InitDevices; diff --git a/paddle/fluid/platform/xpu_header.h b/paddle/fluid/platform/xpu_header.h index d8c5f85f9cfe4b9d6ac07069fff89d37c695af5b..95e4979951d7689b233166170060916033311011 100644 --- a/paddle/fluid/platform/xpu_header.h +++ b/paddle/fluid/platform/xpu_header.h @@ -15,9 +15,36 @@ #pragma once #ifdef PADDLE_WITH_XPU +#include +#include + +#include "paddle/fluid/platform/errors.h" #include "xpu/api.h" #include "xpu/runtime.h" #include "xpu/runtime_ex.h" namespace xpu = baidu::xpu::api; + +class XPUActHelper { + public: + // Convert string to activation type in xpu + static xpu::Activation_t ConvertToXpuActType( + const std::string& act_type_str) { + static std::unordered_map str2act = { + {"linear", xpu::Activation_t::LINEAR}, + {"relu", xpu::Activation_t::RELU}, + {"sigmoid", xpu::Activation_t::SIGMOID}, + {"tanh", xpu::Activation_t::TANH}, + {"gelu", xpu::Activation_t::GELU}, + {"leaky_relu", xpu::Activation_t::LEAKY_RELU}, + {"sqrt", xpu::Activation_t::SQRT}, + {"square", xpu::Activation_t::SQUARE}}; + + auto res = str2act.find(act_type_str); + PADDLE_ENFORCE_NE(res, str2act.end(), + paddle::platform::errors::InvalidArgument( + "Invalid activation type(%s) in XPU", act_type_str)); + return res->second; + } +}; #endif diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 5ee15073267b6eac8978022a70ead5d0f439c62f..142ab2bb9d790175a843d1b81b74dc762a3213fd 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -41,6 +41,7 @@ namespace detail { // import numpy as np // print np.dtype(np.float16).num # 23 constexpr int NPY_FLOAT16_ = 23; +constexpr int NPY_UINT16_ = 4; // Note: Since float16 is not a builtin type in C++, we register // paddle::platform::float16 as numpy.float16. @@ -60,6 +61,23 @@ struct npy_format_descriptor { static PYBIND11_DESCR name() { return _("float16"); } }; +// Note: Since bfloat16 is not a builtin type in C++ and in numpy, +// we register paddle::platform::bfloat16 as numpy.uint16. +template <> +struct npy_format_descriptor { + static py::dtype dtype() { + handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_UINT16_); + return reinterpret_borrow(ptr); + } + static std::string format() { + // Note: "H" represents UINT16. + // Details at: + // https://docs.python.org/3/library/struct.html#format-characters. + return "H"; + } + static PYBIND11_DESCR name() { return _("bfloat16"); } +}; + } // namespace detail } // namespace pybind11 diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index ac6531a2cc55dbe47049797b96349d659adab496..69303013d2a41a049276c0d1b03b9d902b555d23 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -988,11 +988,6 @@ set +x fi read testcase <<< $(echo "$line"|grep -oEi "\w+$") - if python $PADDLE_ROOT/tools/is_ut_disabled.py $testcase; then - echo $testcase" is disabled." - continue - fi - if [[ "$is_nightly" != "" ]] && [ ${NIGHTLY_MODE:-OFF} == "OFF" ]; then echo $testcase" will only run at night." 
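
Note: XPUActHelper::ConvertToXpuActType above is a plain string-to-enum lookup with a hard failure on unknown names. The same shape in Python (the numeric values here are placeholders, not the real xpu::Activation_t constants):

def convert_to_xpu_act_type(act_type_str):
    str2act = {"linear": 0, "relu": 1, "sigmoid": 2, "tanh": 3,
               "gelu": 4, "leaky_relu": 5, "sqrt": 6, "square": 7}
    if act_type_str not in str2act:
        raise ValueError("Invalid activation type(%s) in XPU" % act_type_str)
    return str2act[act_type_str]

assert convert_to_xpu_act_type("relu") == 1
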
continue diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index e749cf88b6a49846b678c1c4258d2b3c2a8c01a4..e707de8e068640e28d3a06d539e33f767d7ab2b3 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -230,7 +230,6 @@ from .framework import CPUPlace #DEFINE_ALIAS from .framework import CUDAPlace #DEFINE_ALIAS from .framework import CUDAPinnedPlace #DEFINE_ALIAS -from .framework import to_variable #DEFINE_ALIAS from .framework import grad #DEFINE_ALIAS from .framework import no_grad #DEFINE_ALIAS from .framework import save #DEFINE_ALIAS @@ -258,6 +257,8 @@ from .tensor.stat import numel #DEFINE_ALIAS from .device import get_cudnn_version from .device import set_device from .device import get_device +from .device import is_compiled_with_xpu +from .device import XPUPlace # from .tensor.tensor import Tensor #DEFINE_ALIAS # from .tensor.tensor import LoDTensor #DEFINE_ALIAS # from .tensor.tensor import LoDTensorArray #DEFINE_ALIAS diff --git a/python/paddle/device.py b/python/paddle/device.py index de24fd875130e84d6532d033761f68a5c77a68c2..46d0ff7bedcecfefc7d054b0ccbcbf100c2fa0f6 100644 --- a/python/paddle/device.py +++ b/python/paddle/device.py @@ -22,7 +22,9 @@ from paddle.fluid.dygraph.parallel import ParallelEnv __all__ = [ 'get_cudnn_version', 'set_device', - 'get_device' + 'get_device', + 'XPUPlace', + 'is_compiled_with_xpu' # 'cpu_places', # 'CPUPlace', # 'cuda_pinned_places', @@ -35,6 +37,37 @@ __all__ = [ _cudnn_version = None +def is_compiled_with_xpu(): + """ + Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun + + Returns (bool): whether paddle was built with WITH_XPU=ON + + Examples: + .. code-block:: python + + import paddle + support_xpu = paddle.device.is_compiled_with_xpu() + """ + return core.is_compiled_with_xpu() + + +def XPUPlace(dev_id): + """ + Return a Baidu Kunlun Place + + Parameters: + dev_id(int): Baidu Kunlun device id + + Examples: + .. code-block:: python + + import paddle + place = paddle.device.XPUPlace(0) + """ + return core.XPUPlace(dev_id) + + def get_cudnn_version(): """ This function returns the version of cudnn. 
The return value is an int which represents the diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index f66f013e4dbaadd534d6859b7ba6530779c82a3b..36da7264efe2e489aadbffde56b4260418f91fb2 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -495,7 +495,7 @@ class RoleMakerBase(object): Returns: string: all heter_trainers'endpoints """ - assert self._heter_trainer_endpoints != [] + assert self._heter_trainer_endpoints != [], "Heter Worker Endpoints Not initialized" return self._heter_trainer_endpoints def _get_heter_worker_endpoint(self): @@ -505,10 +505,10 @@ class RoleMakerBase(object): e.g: if we have 4 cpu-trainer(default), 2 gpu-trainer(heter) then No.0 and No.2 cpu-trainer will work with No.0 gpu-trainer - and No.1 and No.3 cpu-trainer will work with No.1 gpu-trainerr + and No.1 and No.3 cpu-trainer will work with No.1 gpu-trainer """ - assert self._heter_trainer_endpoints != [] - return self._heter_trainer_endpoints[(self._current_id + 1) % + assert self._heter_trainer_endpoints != [], "Heter Worker Endpoints Not initialized" + return self._heter_trainer_endpoints[(self._current_id) % self._heter_worker_num()] def _get_heter_worker_device(self): diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py index 6dd4661f00062f55bb834bbee50daf1924a0c87a..42be7e869d9a7c6394152167ac2cbce9b0986de0 100644 --- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py +++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py @@ -23,6 +23,7 @@ from paddle.fluid.executor import Executor from paddle.fluid.parallel_executor import ParallelExecutor from .runtime_base import RuntimeBase +from ..base.private_helper_function import wait_server_ready class ParameterServerRuntime(RuntimeBase): @@ -94,8 +95,8 @@ class ParameterServerRuntime(RuntimeBase): return False if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ - var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ - var.desc.type() == core.VarDesc.VarType.READER: + var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == core.VarDesc.VarType.READER: return False return var.persistable @@ -161,6 +162,17 @@ class ParameterServerRuntime(RuntimeBase): trainer_config = self.async_strategy.get_trainer_runtime_config() + dist_strategy = self.context["valid_strategy"] + launch_barrier = dist_strategy.a_sync_configs["launch_barrier"] + if launch_barrier: + # for trainer wait server ready + wait_server_ready(self.role_maker._get_pserver_endpoints()) + + # for ps-heter mode, wait heter worker ready + if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker( + ): + wait_server_ready(self.role_maker._get_heter_worker_endpoints()) + lrs = _has_global_step(_get_lr_ops(self.origin_main_program)) if lrs: @@ -312,7 +324,7 @@ class ParameterServerRuntime(RuntimeBase): opts = _get_optimize_ops(self.origin_main_program) for op in opts: if "Param" in op.input_names and \ - "LearningRate" in op.input_names and op.input("Param")[0] == param_name: + "LearningRate" in op.input_names and op.input("Param")[0] == param_name: return op def _save_dense_params(self, executor, dirname, context, main_program): diff --git a/python/paddle/fluid/data.py b/python/paddle/fluid/data.py index dc57e9f71ed3d0de1a374bdf719b32a083198b31..05ea66f54451ba08032bff4e7bc805bbffa15e73
100644 --- a/python/paddle/fluid/data.py +++ b/python/paddle/fluid/data.py @@ -19,10 +19,12 @@ from paddle.fluid import core from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.data_feeder import check_dtype, check_type from ..utils import deprecated +from paddle.fluid.framework import static_only __all__ = ['data'] +@static_only @deprecated(since="2.0.0", update_to="paddle.static.data") def data(name, shape, dtype='float32', lod_level=0): """ diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 01c2f0fed496081400d363d9464360c69d924be8..69fb23383e5fc06da46d1791a056b6d8f4da8c52 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -363,7 +363,7 @@ def guard(place=None): with framework.program_guard(train, startup): with framework.unique_name.guard(): with framework._dygraph_guard(tracer): - with framework._dygraph_place_guard(place): + with framework._dygraph_place_guard(expected_place): yield diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index a14c3a81c121758ed90450cd5eb5990f3f7739e1..05269028acc4004b8af000dabd946f9a3d8ce40f 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3230,14 +3230,11 @@ class Flatten(layers.Layer): .. code-block:: python import paddle - from paddle import to_variable import numpy as np + paddle.disable_static() inp_np = np.ones([5, 2, 3, 4]).astype('float32') - - paddle.disable_static() - - inp_np = to_variable(inp_np) + inp_np = paddle.to_tensor(inp_np) flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2) flatten_res = flatten(inp_np) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 2e3f34f41648a9343b4bccd1044bcd3f7b3d8189..3dc30767e5aa42d6a0a9f673e093f40045cbed87 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1355,7 +1355,7 @@ class Executor(object): if not program._fleet_opt is None: if program._fleet_opt.get("worker_class", "") == "HeterCpuWorker": is_heter = 1 - if program._fleet_opt("trainer", "") == "HeterXpuTrainer": + if program._fleet_opt.get("trainer", "") == "HeterXpuTrainer": is_heter = 1 if scope is None: scope = global_scope() diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 797b32f5d4768af59fa4e6aceb75e4b6d9029d91..b4cea6761dcd84e047f98929644a1e264976503d 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -217,7 +217,16 @@ def _dygraph_not_support_(func): def _dygraph_only_(func): def __impl__(*args, **kwargs): assert in_dygraph_mode( - ), "We Only support %s in dynamic mode, please call 'paddle.disable_static()' to enter dynamic mode." % func.__name__ + ), "We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode." % func.__name__ + return func(*args, **kwargs) + + return __impl__ + + +def _static_only_(func): + def __impl__(*args, **kwargs): + assert not in_dygraph_mode( + ), "We only support '%s()' in static graph mode, please call 'paddle.enable_static()' to enter static graph mode." 
% func.__name__ return func(*args, **kwargs) return __impl__ @@ -260,6 +269,7 @@ def deprecate_stat_dict(func): dygraph_not_support = wrap_decorator(_dygraph_not_support_) dygraph_only = wrap_decorator(_dygraph_only_) +static_only = wrap_decorator(_static_only_) fake_interface_only = wrap_decorator(_fake_interface_only_) @@ -603,7 +613,9 @@ def convert_np_dtype_to_dtype_(np_dtype): elif dtype == np.bool: return core.VarDesc.VarType.BOOL elif dtype == np.uint16: - return core.VarDesc.VarType.INT16 + # since there is still no support for bfloat16 in NumPy, + # uint16 is used for casting bfloat16 + return core.VarDesc.VarType.BF16 elif dtype == np.uint8: return core.VarDesc.VarType.UINT8 elif dtype == np.int8: diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index d513d44acfff230eb229e161e689fbc60a73c602..6b98dea42903e1392febd14b739b49cec7bc8c14 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -31,6 +31,7 @@ from ..unique_name import generate as unique_name import logging from ..data_feeder import check_dtype, check_type +from paddle.fluid.framework import static_only __all__ = [ 'data', 'read_file', 'double_buffer', 'py_reader', @@ -38,6 +39,7 @@ __all__ = [ ] +@static_only def data(name, shape, append_batch_size=True, diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 2f8952a443107e32d3b6c96dc27c51a8aafe67a1..97a3ebc2135a0649fff88e1a1c14d02dfb7850b1 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -335,9 +335,6 @@ list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op) # disable this unittest temporarily list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception) list(REMOVE_ITEM TEST_OPS test_sampling_id_op) -list(REMOVE_ITEM TEST_OPS test_paddle_save_load) - - if (APPLE OR WIN32) list(REMOVE_ITEM TEST_OPS test_dataset) @@ -534,15 +531,15 @@ if(NOT WIN32) endif() if(NOT APPLE AND NOT WIN32) - bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 140) - bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 140) - bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 140) - bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 140) - bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 140) - bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 140) - bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 140) - bash_test_modules(test_hdfs2 START_BASH dist_test.sh TIMEOUT 140) - bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 140) + bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") + bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") + bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") + bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") + bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") + bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") + bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") + 
bash_test_modules(test_hdfs2 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") + bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") endif() add_subdirectory(sequence) diff --git a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py index 15e98481c26b20de4e9fa493fa022380ba1fcd63..92d84b8b3f381e8607d0168e3891c2399037e456 100644 --- a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py +++ b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py @@ -153,7 +153,7 @@ def gen_fake_line(dnn_data_num=7, return line -def prepare_fake_data(file_nums=9, file_lines=1000): +def prepare_fake_data(file_nums=6, file_lines=1000): """ Create fake data with same type as avazu_ctr_data """ diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py index f62ad66e462862f4c3f04bacc58ca7aac583ef1e..fefaecd3b8979b47cf7c0c4f7aa058e9ffcaae42 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py @@ -206,13 +206,6 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase): debug=int(os.getenv("Debug", "0"))) pass_time = time.time() - pass_start print("do_dataset_training done. using time {}".format(pass_time)) - if os.getenv("SAVE_MODEL") == "1": - model_dir = tempfile.mkdtemp() - fleet.save_inference_model(exe, model_dir, - [feed.name for feed in self.feeds], - self.avg_cost) - self.check_model_right(model_dir) - shutil.rmtree(model_dir) fleet.stop_worker() print("do_dataset_training stop worker.") diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..0ac33383fb26b2a35362e8e39e5994d82d6fe497 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py @@ -0,0 +1,208 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
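
Editor's note: the new file below tests conv2d with bfloat16 ("bf16") inputs. NumPy has no bf16 dtype, so throughout these tests bf16 values are carried as uint16 holding the upper half of a float32 bit pattern. A self-contained sketch of that encoding and its inverse (the helper names here are illustrative; the test itself uses convert_float_to_uint16 from op_test.py):

    import struct

    def float_to_bf16_bits(f):
        # keep the top 16 bits of the IEEE-754 float32 encoding
        return struct.unpack('<I', struct.pack('<f', f))[0] >> 16

    def bf16_bits_to_float(bits):
        # pad the discarded mantissa bits with zeros to recover a float32
        return struct.unpack('<f', struct.pack('<I', bits << 16))[0]

    bits = float_to_bf16_bits(1.2345)   # e.g. 0x3f9e
    print(bf16_bits_to_float(bits))     # 1.234375 -- bf16 keeps ~3 digits
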
+ +from __future__ import print_function + +import unittest +import numpy as np +import struct + +import paddle.fluid.core as core +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 +from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp + + +def conv2d_forward_refer(input, filter, group, conv_param): + out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group, + conv_param) + return out + + +def conv2d_residual_naive(out, residual): + assert out.shape == residual.shape + out = np.add(out, residual) + return out + + +class TestConv2dBf16Op(TestConv2dOp): + def setUp(self): + self.op_type = "conv2d" + self.use_cudnn = False + self.exhaustive_search = False + self.use_cuda = False + self.use_mkldnn = True + self.weight_type = np.float32 + self.input_type = np.float32 + self.use_mkldnn = True + self.mkldnn_data_type = "bfloat16" + self.force_fp32_output = False + self.init_group() + self.init_dilation() + self.init_test_case() + self.init_fuse_relu() + self.init_fuse_residual() + self.init_data_type() + self.init_force_fp32_output() + + conv2d_param = { + 'stride': self.stride, + 'pad': self.pad, + 'dilation': self.dilations + } + self.input = np.random.random(self.input_size).astype(np.float32) + self.filter = np.random.random(self.filter_size).astype(np.float32) + conv_out, _, _, _, _ = conv2d_forward_naive(self.input, self.filter, + self.groups, conv2d_param) + self.conv_output_float = conv_out + + if self.fuse_residual: + self.input_residual = np.random.random( + self.input_residual_size).astype(np.float32) + self.conv_output_float = conv2d_residual_naive( + self.conv_output_float, self.input_residual) + self.conv_output = convert_float_to_uint16(self.conv_output_float) + self.outputs = {'Output': self.conv_output} + elif self.force_fp32_output: + self.outputs = {'Output': self.conv_output_float.astype(np.float32)} + + if self.input_type is not np.float32: + self.input = convert_float_to_uint16(self.input) + + self.inputs = { + 'Input': self.input.view(self.input_type), + 'Filter': OpTest.np_dtype_to_fluid_dtype( + self.filter.astype(self.weight_type)) + } + + if self.fuse_residual: + self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype( + convert_float_to_uint16(self.input_residual)) + + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'mkldnn_data_type': self.mkldnn_data_type, + 'force_fp32_output': self.force_fp32_output, + 'fuse_residual_connection': self.fuse_residual + } + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad(self): + pass + + def test_check_grad_no_filter(self): + pass + + def test_check_grad_no_input(self): + pass + + def init_test_case(self): + TestConv2dOp.init_test_case(self) + self.input_size = [1, 1, 5, 5] # NCHW + f_c = self.input_size[1] // self.groups + self.input_residual_size = [1, 2, 3, 3] + self.filter_size = [2, f_c, 3, 3] + + def init_data_type(self): + self.weight_type = np.float32 + self.input_type = np.float32 + + def init_force_fp32_output(self): + self.force_fp32_output = False + + def init_fuse_relu(self): + self.fuse_activation = "relu" + + def init_fuse_residual(self): + self.fuse_residual = True + + +class TestConv2d(TestConv2dBf16Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + 
self.input_residual_size = [2, 6, 3, 3] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_data_type(self): + self.input_type = np.uint16 + + +class TestWithPad(TestConv2d): + def init_test_case(self): + TestConv2d.init_test_case(self) + self.pad = [1, 1] + self.input_residual_size = [2, 6, 5, 5] + + +class TestWithGroup(TestConv2d): + def init_group(self): + self.groups = 3 + + +class TestWithStride(TestConv2dBf16Op): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 6, 6] + self.input_residual_size = [2, 6, 3, 3] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_data_type(self): + self.input_type = np.uint16 + + +class TestWith1x1ForceFP32Output(TestConv2dBf16Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [1, 3, 5, 5] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + + def init_force_fp32_output(self): + self.force_fp32_output = True + + def init_fuse_residual(self): + self.fuse_residual = False + + +class TestWithInput1x1Filter1x1(TestConv2dBf16Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 1, 1] + self.input_residual_size = [2, 6, 1, 1] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + + def init_group(self): + self.groups = 3 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py index 7a494e3c2c3040356641d05772c883e15e4579e3..9731efced69d4b53bbb5b57b4d252d9a7a0c4f5a 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py @@ -36,6 +36,7 @@ class TestConv2dInt8Op(TestConv2dOp): self.use_cuda = False self.use_mkldnn = False self.data_format = "NCHW" + self.mkldnn_data_type = "int8" self.weighttype = np.float32 self.use_mkldnn = True self.init_group() @@ -141,7 +142,8 @@ class TestConv2dInt8Op(TestConv2dOp): 'Scale_weights': self.scale_weights, 'Scale_in_eltwise': self.scale_in_eltwise, 'fuse_activation': self.fuse_activation, - 'fuse_residual_connection': self.fuse_residual + 'fuse_residual_connection': self.fuse_residual, + 'mkldnn_data_type': self.mkldnn_data_type } self.outputs = {'Output': output} diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py index 35419462909df1700219fbbe3841e4dbd094e719..70c76f1fb7186fcc983c0378af657d4aae2d2b32 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 class TestDeQuantizeOp(OpTest): @@ -32,6 +32,9 @@ class TestDeQuantizeOp(OpTest): input = (np.random.randint(0, 100, self.input_size) - 50 ).astype(self.data_type) output = (input * (1 / 
self.scale)).astype('float')
+        elif self.data_type == 'uint16':
+            output = np.random.random(self.input_size).astype(np.float32)
+            input = convert_float_to_uint16(output)
         else:
             input = (np.random.randint(0, 100,
                                        self.input_size)).astype(self.data_type)
@@ -70,5 +73,13 @@ class TestDeQuantizeOp2(TestDeQuantizeOp):
         self.data_type = 'uint8'
 
 
+class TestDeQuantizeOpBf16(TestDeQuantizeOp):
+    def set_scale(self):
+        self.scale = 1.0
+
+    def set_data_type(self):
+        self.data_type = 'uint16'
+
+
 if __name__ == '__main__':
     unittest.main()
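
Editor's note: the uint16 branch added above runs the dequantize setup in reverse relative to the int8/uint8 cases: it draws the expected fp32 output first and derives the operator input from it, since a bf16 tensor is most easily fabricated from an fp32 draw. A sketch of that construction (the shape is illustrative; convert_float_to_uint16 comes from the op_test.py hunk right below):

    import numpy as np
    from paddle.fluid.tests.unittests.op_test import convert_float_to_uint16

    output = np.random.random([2, 3]).astype(np.float32)   # expected result
    input = convert_float_to_uint16(output)                # bf16 bits as uint16
    # With scale = 1.0 the dequantize kernel widens bf16 back to float32,
    # matching `output` up to bf16 rounding (~3 significant digits).
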
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index a6a4b9574c50e254def870783adbc0a0dc3c3ed8..96efc36ed0a5022ef7e1eae623a0eec02c4dcfef 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -20,6 +20,7 @@ import warnings
 import numpy as np
 import random
 import six
+import struct
 import time
 import itertools
 import collections
@@ -167,6 +168,18 @@ def skip_check_grad_ci(reason=None):
     return wrapper
 
 
+def copy_bits_from_float_to_uint16(f):
+    return struct.unpack('<I', struct.pack('<f', f))[0] >> 16
+
+
+def convert_float_to_uint16(float_list):
+    new_output = []
+    for x in np.nditer(float_list):
+        new_output.append(np.uint16(copy_bits_from_float_to_uint16(x)))
+
+    return np.reshape(new_output, float_list.shape).view(np.uint16)
+
+
 class OpTest(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -204,6 +217,9 @@ class OpTest(unittest.TestCase):
                 return False
             return True
 
+        def is_xpu_op_test():
+            return hasattr(cls, "use_xpu") and cls.use_xpu == True
+
         def is_mkldnn_op_test():
             return hasattr(cls, "use_mkldnn") and cls.use_mkldnn == True
 
@@ -226,6 +242,7 @@ class OpTest(unittest.TestCase):
             if cls.dtype in [np.float32, np.float64] \
                 and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \
                 and not hasattr(cls, 'exist_fp64_check_grad') \
+                and not is_xpu_op_test() \
                 and not is_mkldnn_op_test():
                 raise AssertionError(
                     "This test of %s op needs check_grad with fp64 precision." %
@@ -242,6 +259,11 @@ class OpTest(unittest.TestCase):
         self.call_once = True
         self.dtype = data_type
 
+    def is_bfloat16_op(self):
+        return self.dtype == np.uint16 or (
+            hasattr(self, 'mkldnn_data_type') and
+            getattr(self, 'mkldnn_data_type') == "bfloat16")
+
     def infer_dtype_from_inputs_outputs(self, inputs, outputs):
         def is_np_data(input):
             return isinstance(input, (np.ndarray, np.generic))
@@ -276,8 +298,9 @@
         infer_dtype(inputs, dtype_set)
         dtype_list = [
             np.dtype(np.float64), np.dtype(np.float32), np.dtype(np.float16),
-            np.dtype(np.int64), np.dtype(np.int32), np.dtype(np.int16),
-            np.dtype(np.int8), np.dtype(np.uint8), np.dtype(np.bool)
+            np.dtype(np.int64), np.dtype(np.int32), np.dtype(np.uint16),
+            np.dtype(np.int16), np.dtype(np.int8), np.dtype(np.uint8),
+            np.dtype(np.bool)
         ]
         # check the dtype in dtype_list in order, select the first dtype that in dtype_set
         for dtype in dtype_list:
@@ -317,6 +340,11 @@
                 self.attrs["use_mkldnn"] == True):
             self.__class__.use_mkldnn = True
 
+        if (hasattr(self, "use_xpu") and self.use_xpu == True) or \
+                (hasattr(self, "attrs") and "use_xpu" in self.attrs and \
+                self.attrs["use_xpu"] == True):
+            self.__class__.use_xpu = True
+
         op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
         "infer datatype from inputs and outputs for this test case"
         self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
@@ -913,6 +941,8 @@ class OpTest(unittest.TestCase):
         need_run_ops = self._get_need_run_ops(op_desc)
 
         res = {}
+        if hasattr(self, 'attrs') and bool(self.attrs.get('use_xpu', False)):
+            return
         for op_desc, father_op_desc in reversed(need_run_ops):
             # The first one is the forward op
             has_infer_inplace = fluid.core.has_infer_inplace(op_desc.type())
@@ -957,6 +987,14 @@
                 self.op_type not in op_threshold_white_list.NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST:
             atol = 0
 
+        if self.is_bfloat16_op():
+            check_dygraph = False
+            if hasattr(self, 'force_fp32_output') and getattr(
+                    self, 'force_fp32_output'):
+                atol = 1e-2
+            else:
+                atol = 2
+
         if no_check_set is not None:
             if self.op_type not in no_check_set_white_list.no_check_set_white_list:
                 raise AssertionError(
@@ -1176,6 +1214,11 @@
                 self.attrs["use_mkldnn"] == True):
             self.__class__.use_mkldnn = True
 
+        if (hasattr(self, "use_xpu") and self.use_xpu == True) or \
+                (hasattr(self, "attrs") and "use_xpu" in self.attrs and \
+                self.attrs["use_xpu"] == True):
+            self.__class__.use_xpu = True
+
         places = self._get_places()
         for place in places:
             res = self.check_output_with_place(place, atol, no_check_set,
@@ -1286,8 +1329,9 @@
             no_grad_set = set()
         else:
             if (self.op_type not in no_grad_set_white_list.NEED_TO_FIX_OP_LIST
-                ) and (self.op_type not in
-                       no_grad_set_white_list.NOT_CHECK_OP_LIST):
+                ) and (
+                    self.op_type not in no_grad_set_white_list.NOT_CHECK_OP_LIST
+                ) and (not self.is_bfloat16_op()):
                 raise AssertionError("no_grad_set must be None, op_type is " +
                                      self.op_type + " Op.")
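
Editor's note: the bf16 tolerance block above deserves a remark. `atol = 2` looks enormous next to the usual fp32/fp64 tolerances, but it is consistent with bf16's 8-bit mantissa once an output accumulates to a few hundred; `is_bfloat16_op()` also disables the dygraph cross-check. A rough error budget (the output magnitude is an illustrative assumption, not a number taken from the tests):

    # One bf16 value carries ~8 mantissa bits -> relative error ~2**-8.
    rel_err = 2.0**-8                 # ~0.0039

    # Assumed magnitude of an accumulated conv/matmul output:
    out_magnitude = 500.0
    print(out_magnitude * rel_err)    # ~1.95, in line with atol = 2

    # With force_fp32_output the accumulation stays in fp32, so only the
    # bf16-rounded inputs contribute error and atol tightens to 1e-2.
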
diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
index db9e8d2c6bda011bef7c23e7fb51e246137a3906..6c4834b84f91f68f51b65bfc831775966732b36c 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
@@ -78,15 +78,17 @@ class TestELUDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 6, 6]
         eps = 1e-6
         alpha = 1.1
         dtype = np.float64
+        SEED = 0
 
         x = layers.data('x', shape, False, dtype)
         x.persistable = True
         y = layers.elu(x, alpha=alpha)
+        np.random.seed(SEED)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
@@ -147,5 +149,53 @@ class TestSquareDoubleGradCheck(unittest.TestCase):
         self.func(p)
 
 
+class TestAbsDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        # the shape of the input variable should be specified explicitly, not include -1.
+        shape = [2, 3, 7, 9]
+        eps = 1e-6
+        dtype = np.float64
+
+        x = layers.data('x', shape, False, dtype)
+        x.persistable = True
+        y = layers.abs(x)
+        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
+
+        gradient_checker.double_grad_check(
+            [x], y, x_init=x_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestLogDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        shape = [2, 3, 7, 9]
+        eps = 1e-6
+        dtype = np.float64
+
+        x = layers.data('x', shape, False, dtype)
+        x.persistable = True
+        y = layers.log(x)
+
+        x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)
+
+        gradient_checker.double_grad_check(
+            [x], y, x_init=x_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index 791f1ee2dfa534437deb903fc60e2904a8b396a1..ad7539e76e41beaf7afa590f8a0af43a4b3c8b10 100755
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -228,7 +228,7 @@ class TestTanhAPI(unittest.TestCase):
     def test_dygraph_api(self):
         paddle.disable_static(self.place)
-        x = paddle.to_variable(self.x_np)
+        x = paddle.to_tensor(self.x_np)
         out1 = F.tanh(x)
         out2 = paddle.tanh(x)
         th = paddle.nn.Tanh()
@@ -573,7 +573,7 @@ class TestHardShrinkAPI(unittest.TestCase):
     def test_dygraph_api(self):
         paddle.disable_static(self.place)
-        x = paddle.to_variable(self.x_np)
+        x = paddle.to_tensor(self.x_np)
         out1 = F.hardshrink(x)
         hd = paddle.nn.Hardshrink()
         out2 = hd(x)
@@ -639,7 +639,7 @@ class TestHardtanhAPI(unittest.TestCase):
     def test_dygraph_api(self):
         paddle.disable_static(self.place)
-        x = paddle.to_variable(self.x_np)
+        x = paddle.to_tensor(self.x_np)
         out1 = F.hardtanh(x)
         m = paddle.nn.Hardtanh()
         out2 = m(x)
@@ -1063,7 +1063,7 @@ class TestLeakyReluAPI(unittest.TestCase):
     def test_dygraph_api(self):
         paddle.disable_static(self.place)
-        x = paddle.to_variable(self.x_np)
+        x = paddle.to_tensor(self.x_np)
         out1 = F.leaky_relu(x)
         m = paddle.nn.LeakyReLU()
         out2 = m(x)
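
Editor's note: most of the test hunks in this patch are a mechanical rename: `paddle.to_variable` is deprecated in favor of `paddle.to_tensor`, which accepts the same numpy input. The migration in miniature, assuming dygraph mode is active:

    import numpy as np
    import paddle

    paddle.disable_static()
    x_np = np.random.random((2, 3)).astype('float32')
    # before: x = paddle.to_variable(x_np)
    x = paddle.to_tensor(x_np)    # same Tensor, current API
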
diff --git a/python/paddle/fluid/tests/unittests/test_adamax_api.py b/python/paddle/fluid/tests/unittests/test_adamax_api.py
index 5a33e11d2862c037639b1643a2e44ff81a757053..6d2ec0eefbb1c5157fdbcb5a2e04e97e918a95c9 100644
--- a/python/paddle/fluid/tests/unittests/test_adamax_api.py
+++ b/python/paddle/fluid/tests/unittests/test_adamax_api.py
@@ -25,7 +25,7 @@ class TestAdamaxAPI(unittest.TestCase):
     def test_adamax_api_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
-        a = paddle.to_variable(value)
+        a = paddle.to_tensor(value)
         linear = paddle.nn.Linear(13, 5)
         adam = paddle.optimizer.Adamax(
             learning_rate=0.01,
diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py
index cce24b57d2ca50e96e3ae0cf6d8912a8aea79a31..b799508f6b8d57ed59251f95beeafdb720a7299f 100644
--- a/python/paddle/fluid/tests/unittests/test_adamw_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py
@@ -22,7 +22,7 @@ class TestAdamWOp(unittest.TestCase):
     def test_adamw_op_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
-        a = paddle.to_variable(value)
+        a = paddle.to_tensor(value)
         linear = paddle.nn.Linear(13, 5)
         adam = paddle.optimizer.AdamW(
             learning_rate=0.01,
@@ -37,7 +37,7 @@ class TestAdamWOp(unittest.TestCase):
     def test_adamw_op_coverage(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
-        a = paddle.to_variable(value)
+        a = paddle.to_tensor(value)
         linear = paddle.nn.Linear(13, 5)
         adam = paddle.optimizer.AdamW(
             learning_rate=0.0,
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
index e3c70884ebcf116feb4f5b0aa808c71e4b7f8c4e..b8c5bd2949124d622bc61fdae9dc43c34ab717af 100644
--- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
@@ -147,7 +147,7 @@ class TestAdaptiveAvgPool2dAPI(unittest.TestCase):
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.disable_static(place=place)
-            x = paddle.to_variable(self.x_np)
+            x = paddle.to_tensor(self.x_np)
 
             out_1 = paddle.nn.functional.adaptive_avg_pool2d(
                 x=x, output_size=[3, 3])
@@ -245,7 +245,7 @@ class TestAdaptiveAvgPool2dClassAPI(unittest.TestCase):
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.disable_static(place=place)
-            x = paddle.to_variable(self.x_np)
+            x = paddle.to_tensor(self.x_np)
 
             adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[3, 3])
             out_1 = adaptive_avg_pool(x=x)
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
index a3c9dd91a69ea83b08c3f817403620460333b5e9..bb36aaebf08421d1bf97775bc73fd30288e95eeb 100755
--- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
@@ -162,7 +162,7 @@ class TestAdaptiveAvgPool3dAPI(unittest.TestCase):
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.disable_static(place=place)
-            x = paddle.to_variable(self.x_np)
+            x = paddle.to_tensor(self.x_np)
 
             out_1 = paddle.nn.functional.adaptive_avg_pool3d(
                 x=x, output_size=[3, 3, 3])
@@ -262,7 +262,7 @@ class TestAdaptiveAvgPool3dClassAPI(unittest.TestCase):
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.disable_static(place=place)
-            x = paddle.to_variable(self.x_np)
+            x = paddle.to_tensor(self.x_np)
 
             adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
                 output_size=[3, 3, 3])
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py
b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py index d78788eb1e7c63be485210780db25e1de6fd84b4..dfa6f3226c8ce06e2f22ee6690fbf1df12b649d0 100644 --- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py @@ -147,7 +147,7 @@ class TestAdaptiveMaxPool2dAPI(unittest.TestCase): if core.is_compiled_with_cuda() else [False]): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.disable_static(place=place) - x = paddle.to_variable(self.x_np) + x = paddle.to_tensor(self.x_np) out_1 = paddle.nn.functional.adaptive_max_pool2d( x=x, return_indices=False, output_size=[3, 3]) @@ -240,7 +240,7 @@ class TestAdaptiveMaxPool2dClassAPI(unittest.TestCase): if core.is_compiled_with_cuda() else [False]): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.disable_static(place=place) - x = paddle.to_variable(self.x_np) + x = paddle.to_tensor(self.x_np) adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3]) out_1 = adaptive_max_pool(x=x) diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py index a7de0a5c6a7017617124b893313e0f9830cc09f9..1fa703688cdd932f7121ae870fc26f16fda5d815 100755 --- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py @@ -162,7 +162,7 @@ class TestAdaptiveMaxPool3dAPI(unittest.TestCase): if core.is_compiled_with_cuda() else [False]): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.disable_static(place=place) - x = paddle.to_variable(self.x_np) + x = paddle.to_tensor(self.x_np) out_1 = paddle.nn.functional.adaptive_max_pool3d( x=x, output_size=[3, 3, 3]) @@ -257,7 +257,7 @@ class TestAdaptiveMaxPool3dClassAPI(unittest.TestCase): if core.is_compiled_with_cuda() else [False]): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.disable_static(place=place) - x = paddle.to_variable(self.x_np) + x = paddle.to_tensor(self.x_np) adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( output_size=[3, 3, 3]) diff --git a/python/paddle/fluid/tests/unittests/test_addmm_op.py b/python/paddle/fluid/tests/unittests/test_addmm_op.py index 6e66c0c0029accdcdf81ae67dff1a49e3e8867d4..6238d7dd4a1f4574fa1fabf5d531db6d4a64df09 100644 --- a/python/paddle/fluid/tests/unittests/test_addmm_op.py +++ b/python/paddle/fluid/tests/unittests/test_addmm_op.py @@ -244,9 +244,9 @@ class TestAddMMAPI(unittest.TestCase): def test_error1(): data_x_wrong = np.ones((2, 3)).astype(np.float32) - x = paddle.to_variable(data_x_wrong) - y = paddle.to_variable(data_y) - input = paddle.to_variable(data_input) + x = paddle.to_tensor(data_x_wrong) + y = paddle.to_tensor(data_y) + input = paddle.to_tensor(data_input) out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 ) self.assertRaises(ValueError, test_error1) ''' diff --git a/python/paddle/fluid/tests/unittests/test_arange.py b/python/paddle/fluid/tests/unittests/test_arange.py index 29003d28e441c02e040a8d6cb9888e376521bc72..d62c08b072b10b025e885ada1af0eb97817cab80 100644 --- a/python/paddle/fluid/tests/unittests/test_arange.py +++ b/python/paddle/fluid/tests/unittests/test_arange.py @@ -98,9 +98,9 @@ class TestArangeImperative(unittest.TestCase): x2 = paddle.tensor.arange(5) x3 = paddle.tensor.creation.arange(5) - start = paddle.to_variable(np.array([0], 'float32')) - end = paddle.to_variable(np.array([5], 'float32')) - 
step = paddle.to_variable(np.array([1], 'float32')) + start = paddle.to_tensor(np.array([0], 'float32')) + end = paddle.to_tensor(np.array([5], 'float32')) + step = paddle.to_tensor(np.array([1], 'float32')) x4 = paddle.arange(start, end, step, 'int64') paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_cholesky_op.py b/python/paddle/fluid/tests/unittests/test_cholesky_op.py index ab08a0aacbf08768ffff43974ee9a7c7dd4a7288..2fcec657c1404d86354e8a43ea8ac81cfc6947ac 100644 --- a/python/paddle/fluid/tests/unittests/test_cholesky_op.py +++ b/python/paddle/fluid/tests/unittests/test_cholesky_op.py @@ -96,7 +96,7 @@ class TestDygraph(unittest.TestCase): a = np.random.rand(3, 3) a_t = np.transpose(a, [1, 0]) x_data = np.matmul(a, a_t) + 1e-03 - x = paddle.to_variable(x_data) + x = paddle.to_tensor(x_data) out = paddle.cholesky(x, upper=False) diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py index b56d9f6668e8bcbd37443fb88b1f5f4dd40a2511..2946798a82f78fc55adb6ff0bb3c7f9721a6d60f 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_op.py @@ -168,9 +168,9 @@ class TestClipAPI(unittest.TestCase): paddle.disable_static(place) data_shape = [1, 9, 9, 4] data = np.random.random(data_shape).astype('float32') - images = paddle.to_variable(data, dtype='float32') - v_min = paddle.to_variable(np.array([0.2], dtype=np.float32)) - v_max = paddle.to_variable(np.array([0.8], dtype=np.float32)) + images = paddle.to_tensor(data, dtype='float32') + v_min = paddle.to_tensor(np.array([0.2], dtype=np.float32)) + v_max = paddle.to_tensor(np.array([0.8], dtype=np.float32)) out_1 = paddle.clip(images, min=0.2, max=0.8) out_2 = paddle.clip(images, min=0.2, max=0.9) diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py index 5916000fba79fc0da2ef545beac634a3edfe01df..f625e1de4a3e0564037d71e2393f5914415917d9 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py @@ -113,6 +113,7 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True strategy.a_sync_configs = {"k_steps": 100} + strategy.a_sync_configs = {"launch_barrier": False} if training_role == "TRAINER": self.run_trainer(role, strategy) diff --git a/python/paddle/fluid/tests/unittests/test_communicator_sync.py b/python/paddle/fluid/tests/unittests/test_communicator_sync.py index 95b209b14602676a089a667b0a720056bbe1562b..78e2050d3b48edc4a2bd0195a371a6b34afb7b1e 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_sync.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_sync.py @@ -51,6 +51,7 @@ class TestCommunicator(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = False + strategy.a_sync_configs = {"launch_barrier": False} optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py index b4dbba7eead397c46c37a8df013dabb00177f030..14c10e7aa2022e2963b0dbb973cefb1793be842f 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_op.py @@ -285,9 +285,9 @@ class 
TestConcatAPI(unittest.TestCase): in2 = np.array([[11, 12, 13], [14, 15, 16]]) in3 = np.array([[21, 22], [23, 24]]) paddle.disable_static() - x1 = paddle.to_variable(in1) - x2 = paddle.to_variable(in2) - x3 = paddle.to_variable(in3) + x1 = paddle.to_tensor(in1) + x2 = paddle.to_tensor(in2) + x3 = paddle.to_tensor(in3) out1 = fluid.layers.concat(input=[x1, x2, x3], axis=-1) out2 = paddle.concat(x=[x1, x2], axis=0) np_out1 = np.concatenate([in1, in2, in3], axis=-1) diff --git a/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py b/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py index 1e25613fa63da440f71f23841095f153e61735e9..a8899d9f022c0961e5bce9c25e0b23cd7658f256 100644 --- a/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py +++ b/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py @@ -75,8 +75,8 @@ class TestCosineSimilarityAPI(unittest.TestCase): np_x2 = np.random.rand(*shape).astype(np.float32) np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps) - tesnor_x1 = paddle.to_variable(np_x1) - tesnor_x2 = paddle.to_variable(np_x2) + tesnor_x1 = paddle.to_tensor(np_x1) + tesnor_x2 = paddle.to_tensor(np_x2) y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps) self.assertTrue(np.allclose(y.numpy(), np_out)) @@ -92,8 +92,8 @@ class TestCosineSimilarityAPI(unittest.TestCase): np_x2 = np.random.rand(*shape).astype(np.float32) np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps) - tesnor_x1 = paddle.to_variable(np_x1) - tesnor_x2 = paddle.to_variable(np_x2) + tesnor_x1 = paddle.to_tensor(np_x1) + tesnor_x2 = paddle.to_tensor(np_x2) y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps) self.assertTrue(np.allclose(y.numpy(), np_out)) @@ -110,8 +110,8 @@ class TestCosineSimilarityAPI(unittest.TestCase): np_x2 = np.random.rand(*shape2).astype(np.float32) np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps) - tesnor_x1 = paddle.to_variable(np_x1) - tesnor_x2 = paddle.to_variable(np_x2) + tesnor_x1 = paddle.to_tensor(np_x1) + tesnor_x2 = paddle.to_tensor(np_x2) y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps) self.assertTrue(np.allclose(y.numpy(), np_out)) @@ -129,8 +129,8 @@ class TestCosineSimilarityAPI(unittest.TestCase): np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps) cos_sim_func = nn.CosineSimilarity(axis=axis, eps=eps) - tesnor_x1 = paddle.to_variable(np_x1) - tesnor_x2 = paddle.to_variable(np_x2) + tesnor_x1 = paddle.to_tensor(np_x1) + tesnor_x2 = paddle.to_tensor(np_x2) y = cos_sim_func(tesnor_x1, tesnor_x2) self.assertTrue(np.allclose(y.numpy(), np_out)) diff --git a/python/paddle/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py index ad121fac8cc045e67cf116d2cf9cedd6ac9bef99..818e15bb319b10a1ffa6850874a26f412422b574 100644 --- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py +++ b/python/paddle/fluid/tests/unittests/test_cumsum_op.py @@ -21,13 +21,12 @@ import paddle import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard -from paddle import to_variable class TestCumsumOp(unittest.TestCase): def run_cases(self): data_np = np.arange(12).reshape(3, 4) - data = to_variable(data_np) + data = paddle.to_tensor(data_np) y = paddle.cumsum(data) z = np.cumsum(data_np) diff --git a/python/paddle/fluid/tests/unittests/test_data.py b/python/paddle/fluid/tests/unittests/test_data.py index 
8070148f8b36dd7dab7711abaf25994acebc7e6f..98739f6e1631e5ebd5fc8da45647118be8c05f6f 100644
--- a/python/paddle/fluid/tests/unittests/test_data.py
+++ b/python/paddle/fluid/tests/unittests/test_data.py
@@ -99,5 +99,17 @@ class TestApiStaticDataError(unittest.TestCase):
         self.assertRaises(TypeError, test_shape_type)
 
 
+class TestApiErrorWithDynamicMode(unittest.TestCase):
+    def test_error(self):
+        with program_guard(Program(), Program()):
+            paddle.disable_static()
+            self.assertRaises(AssertionError, fluid.data, 'a', [2, 25])
+            self.assertRaises(
+                AssertionError, fluid.layers.data, 'b', shape=[2, 25])
+            self.assertRaises(
+                AssertionError, paddle.static.data, 'c', shape=[2, 25])
+            paddle.enable_static()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_default_dtype.py b/python/paddle/fluid/tests/unittests/test_default_dtype.py
index 057933fc7a735c2732cd651e83e99ddfa747b8a8..29ca9a93985977936d1c99d1587d615c67524136 100644
--- a/python/paddle/fluid/tests/unittests/test_default_dtype.py
+++ b/python/paddle/fluid/tests/unittests/test_default_dtype.py
@@ -20,7 +20,6 @@ import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import Linear
 import paddle.fluid.core as core
-from paddle import to_variable
 
 
 class TestDefaultType(unittest.TestCase):
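
Editor's note: the new TestApiErrorWithDynamicMode above pins down the user-facing effect of the @static_only decorator introduced earlier in this patch: calling any of the data entry points while dygraph mode is active now fails fast. A minimal reproduction, mirroring the test rather than adding new behavior:

    import paddle
    import paddle.fluid as fluid

    paddle.disable_static()
    try:
        fluid.data('a', [2, 25])    # guarded by @static_only
    except AssertionError as e:
        print(e)    # "We only support 'data()' in static graph mode, ..."
    paddle.enable_static()
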
""" + paddle.enable_static() # Initialization x = fluid.data(name='x', shape=[3, 2, 1], dtype='float32') @@ -80,6 +81,7 @@ class TestDeprecatedDocorator(unittest.TestCase): # captured captured = get_warning_index(fluid.data) + paddle.disable_static() # testting self.assertGreater(expected, captured) diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index 529fff158c55fc30248b9f5a88c8c615a8b55c79..2f35b45aa670c1d8e2caa7fe2ecf4e06b9884899 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -36,7 +36,7 @@ class TestDirectory(unittest.TestCase): def test_new_directory(self): new_directory = [ 'paddle.enable_static', 'paddle.disable_static', - 'paddle.in_dynamic_mode', 'paddle.to_variable', 'paddle.grad', + 'paddle.in_dynamic_mode', 'paddle.to_tensor', 'paddle.grad', 'paddle.no_grad', 'paddle.save', 'paddle.load', 'paddle.static.save', 'paddle.static.load', 'paddle.distributed.ParallelEnv', diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py index 7f55e956a94aee79dda07762e953e71807899bff..845be6eda6e0d972ed98535fc987f459e4e8d702 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py @@ -52,6 +52,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True + strategy.a_sync_configs = {"launch_barrier": False} optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -92,6 +93,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True + strategy.a_sync_configs = {"launch_barrier": False} optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py index db3f2afb3668bc1831286f8d13b274895e7632fd..668b4ad872f43c65c47f75a8af016a6c6af58513 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py @@ -44,6 +44,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = False + strategy.a_sync_configs = {"launch_barrier": False} optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py index c46d1dc5b0f87262aee8efd4722418be433c98ea..195b3f8de0a40011e3da5ddfa6d633daaa90fc1c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py @@ -312,9 +312,6 @@ class TestFleetBase(unittest.TestCase): "========================Error tr1_err 
end===========================" ) - self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check") - self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check") - # close trainer file tr0_pipe.close() tr1_pipe.close() @@ -325,6 +322,8 @@ class TestFleetBase(unittest.TestCase): ps1.terminate() shutil.rmtree(gloo_path) + self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check") + self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check") return 0, 0 def check_with_place(self, diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py index ba97c5079bde429b0b7145208926b570d04725bc..6c5a1d6e36c2549d5e3549f81f26d5ffcca3a247 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py @@ -81,7 +81,7 @@ class FleetDistHeterRunnerBase(object): def build_strategy(self, args): self.strategy = paddle.distributed.fleet.DistributedStrategy() self.strategy.a_sync = True - + self.strategy.a_sync_configs = {"launch_barrier": True} return self.strategy def build_optimizer(self, avg_cost, strategy): @@ -237,7 +237,10 @@ class TestFleetHeterBase(unittest.TestCase): return heter0_proc, heter1_proc, heter0_pipe, heter1_pipe def _run_cluster(self, model, envs): - env = {'GRAD_CLIP': str(self._grad_clip_mode)} + env = { + 'GRAD_CLIP': str(self._grad_clip_mode), + 'FLAGS_eager_delete_tensor_gb': str(-1) + } python_path = self._python_interp gloo_path = tempfile.mkdtemp() @@ -286,27 +289,6 @@ class TestFleetHeterBase(unittest.TestCase): tr0_ret = tr0.returncode tr1_ret = tr0.returncode - print("tr get returncode: {}".format(tr0_ret)) - if tr0_ret != 0: - print( - "========================Error tr0_err begin===========================" - ) - os.system("cat {}".format(tempfile.gettempdir() + "/tr0_err.log")) - print( - "========================Error tr0_err end===========================" - ) - - if tr1_ret != 0: - print( - "========================Error tr1_err begin===========================" - ) - os.system("cat {}".format(tempfile.gettempdir() + "/tr1_err.log")) - print( - "========================Error tr1_err end===========================" - ) - - self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check") - self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check") # close trainer file tr0_pipe.close() @@ -320,7 +302,8 @@ class TestFleetHeterBase(unittest.TestCase): ps1.terminate() heter0.terminate() heter1.terminate() - + self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check") + self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check") shutil.rmtree(gloo_path) return 0, 0 diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py index b3e38a421287611c43bb82d93b4df166e23f6484..5f7d7b21d7ff8da8699c2f55adcde954c1c0156d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py @@ -23,38 +23,6 @@ import paddle paddle.enable_static() -class TestDistHeterDatasetAsync2x2(TestFleetHeterBase): - def _setup_config(self): - self._mode = "async" - self._reader = "dataset" - - def check_with_place(self, - model_file, - delta=1e-3, - check_error_log=False, - need_envs={}): - required_envs = { - "PATH": os.getenv("PATH", ""), - "PYTHONPATH": os.getenv("PYTHONPATH", ""), - "LD_LIBRARY_PATH": 
os.getenv("LD_LIBRARY_PATH", ""), - "FLAGS_rpc_deadline": "5000", # 5sec to fail fast - "http_proxy": "", - "CPU_NUM": "3" - } - - required_envs.update(need_envs) - - if check_error_log: - required_envs["GLOG_v"] = "3" - required_envs["GLOG_logtostderr"] = "1" - - tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) - - def test_dist_train(self): - self.check_with_place( - "dist_fleet_heter_ctr.py", delta=1e-5, check_error_log=True) - - class TestDistHeterPyreaderAsync2x2(TestFleetHeterBase): def _setup_config(self): self._mode = "async" diff --git a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py index 642044bb4b1152b0c6d2b5a8a64e22410f9bd151..e0e487eff11e700b6b2c0531cab9e102d302248e 100644 --- a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py +++ b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py @@ -195,7 +195,7 @@ class TestFlattenPython(unittest.TestCase): def test_Negative(): paddle.disable_static() - img = paddle.to_variable(x) + img = paddle.to_tensor(x) out = paddle.flatten(img, start_axis=-2, stop_axis=-1) return out.numpy().shape diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 22f16287c33f96a43361b5fe4ed5d0fe3edbb1bc..7378975aa3795bb42e72f5e892bf08a2910e320a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -211,7 +211,7 @@ class TestImperative(unittest.TestCase): paddle.disable_static() self.assertTrue(paddle.in_dynamic_mode()) np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - var_inp = paddle.to_variable(np_inp) + var_inp = paddle.to_tensor(np_inp) mlp = MLP(input_size=2) out = mlp(var_inp) dy_out1 = out.numpy() @@ -221,7 +221,7 @@ class TestImperative(unittest.TestCase): self.assertFalse(paddle.in_dynamic_mode()) paddle.disable_static() self.assertTrue(paddle.in_dynamic_mode()) - var_inp = paddle.to_variable(np_inp) + var_inp = paddle.to_tensor(np_inp) mlp = MLP(input_size=2) out = mlp(var_inp) dy_out2 = out.numpy() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py index 59ddb365e539603c1eba06ca8828fc244b6e542d..97f7162e9979c504de0e29206a7b6d03884e3e19 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py @@ -54,7 +54,7 @@ class TestSimpleNet(unittest.TestCase): # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0) input_word = np.array([[1, 2], [2, 1]]).astype('int64') - input = paddle.to_variable(input_word) + input = paddle.to_tensor(input_word) simplenet = SimpleNet(20, 32, dtype) adam = SGDOptimizer( diff --git a/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py index 8a868e751f0567e6387b0e9471f0382c9456bcb6..281dc7caded1f5d685350e88da776a614a3b8175 100644 --- a/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py @@ -41,7 +41,7 @@ def run_dygraph(x_np, op_str, use_gpu=True): if use_gpu and fluid.core.is_compiled_with_cuda(): place = paddle.CUDAPlace(0) paddle.disable_static(place) - x = paddle.to_variable(x_np) + x = paddle.to_tensor(x_np) dygraph_result = 
getattr(paddle.tensor, op_str)(x) return dygraph_result diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index 7e6ca8076de5186def1229b58bd23df73021430e..99404246185040e35c59ab5cf374e1948870b2bb 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -543,9 +543,9 @@ class TestJitSaveMultiCases(unittest.TestCase): loaded_layer = paddle.jit.load(model_path) loaded_layer.eval() # inference & compare - x = paddle.to_variable(np.random.random((1, 784)).astype('float32')) + x = paddle.to_tensor(np.random.random((1, 784)).astype('float32')) if with_label: - y = paddle.to_variable(np.random.random((1, 1)).astype('int64')) + y = paddle.to_tensor(np.random.random((1, 1)).astype('int64')) pred, _ = layer(x, y) pred = pred.numpy() else: @@ -677,7 +677,7 @@ class TestJitSaveMultiCases(unittest.TestCase): model_path = "test_not_prune_output_spec_name_warning" configs = paddle.SaveLoadConfig() - out = paddle.to_variable(np.random.random((1, 1)).astype('float')) + out = paddle.to_tensor(np.random.random((1, 1)).astype('float')) configs.output_spec = [out] paddle.jit.save(layer, model_path, configs=configs) @@ -709,7 +709,7 @@ class TestJitSaveMultiCases(unittest.TestCase): model_path = "test_prune_to_static_after_train" configs = paddle.SaveLoadConfig() - out = paddle.to_variable(np.random.random((1, 1)).astype('float')) + out = paddle.to_tensor(np.random.random((1, 1)).astype('float')) configs.output_spec = [out] with self.assertRaises(ValueError): paddle.jit.save( @@ -730,7 +730,7 @@ class TestJitSaveLoadEmptyLayer(unittest.TestCase): def test_save_load_empty_layer(self): layer = EmptyLayer() - x = paddle.to_variable(np.random.random((10)).astype('float32')) + x = paddle.to_tensor(np.random.random((10)).astype('float32')) out = layer(x) paddle.jit.save(layer, self.model_path) load_layer = paddle.jit.load(self.model_path) @@ -746,8 +746,8 @@ class TestJitSaveLoadNoParamLayer(unittest.TestCase): def test_save_load_no_param_layer(self): layer = NoParamLayer() - x = paddle.to_variable(np.random.random((5)).astype('float32')) - y = paddle.to_variable(np.random.random((5)).astype('float32')) + x = paddle.to_tensor(np.random.random((5)).astype('float32')) + y = paddle.to_tensor(np.random.random((5)).astype('float32')) out = layer(x, y) paddle.jit.save(layer, self.model_path) load_layer = paddle.jit.load(self.model_path) diff --git a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py index 041fe4e9043d60852fcaab42bc233b63b39609ce..3a3b7071e04dced7fc33c4ac56a9018bb82afbf4 100644 --- a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py @@ -90,7 +90,7 @@ class TestKLDivLossDygraph(unittest.TestCase): with paddle.fluid.dygraph.guard(): kldiv_criterion = paddle.nn.KLDivLoss(reduction) pred_loss = kldiv_criterion( - paddle.to_variable(x), paddle.to_variable(target)) + paddle.to_tensor(x), paddle.to_tensor(target)) self.assertTrue(np.allclose(pred_loss.numpy(), gt_loss)) def test_kl_loss_batchmean(self): diff --git a/python/paddle/fluid/tests/unittests/test_l1_loss.py b/python/paddle/fluid/tests/unittests/test_l1_loss.py index 6a15fe494779f93c2f36a301594aaccf55283902..3c37397cae1b586f77249136d6abd2e0dce1d8b3 100644 --- a/python/paddle/fluid/tests/unittests/test_l1_loss.py +++ 
b/python/paddle/fluid/tests/unittests/test_l1_loss.py @@ -26,8 +26,8 @@ class TestFunctionalL1Loss(unittest.TestCase): self.label_np = np.random.random(size=(10, 10, 5)).astype(np.float32) def run_imperative(self): - input = paddle.to_variable(self.input_np) - label = paddle.to_variable(self.label_np) + input = paddle.to_tensor(self.input_np) + label = paddle.to_tensor(self.label_np) dy_result = paddle.nn.functional.l1_loss(input, label) expected = np.mean(np.abs(self.input_np - self.label_np)) self.assertTrue(np.allclose(dy_result.numpy(), expected)) @@ -106,8 +106,8 @@ class TestClassL1Loss(unittest.TestCase): self.label_np = np.random.random(size=(10, 10, 5)).astype(np.float32) def run_imperative(self): - input = paddle.to_variable(self.input_np) - label = paddle.to_variable(self.label_np) + input = paddle.to_tensor(self.input_np) + label = paddle.to_tensor(self.label_np) l1_loss = paddle.nn.loss.L1Loss() dy_result = l1_loss(input, label) expected = np.mean(np.abs(self.input_np - self.label_np)) diff --git a/python/paddle/fluid/tests/unittests/test_log_softmax.py b/python/paddle/fluid/tests/unittests/test_log_softmax.py index e3d7003ecedb60f9b4f9a542ed08ca88d894d24a..9ac4895f499f812906fe22cc31712459cb43976c 100644 --- a/python/paddle/fluid/tests/unittests/test_log_softmax.py +++ b/python/paddle/fluid/tests/unittests/test_log_softmax.py @@ -96,7 +96,7 @@ class TestNNLogSoftmaxAPI(unittest.TestCase): # test dygrapg api paddle.disable_static() - x = paddle.to_variable(self.x) + x = paddle.to_tensor(self.x) y = logsoftmax(x) self.assertTrue(np.allclose(y.numpy(), ref_out)) paddle.enable_static() @@ -127,7 +127,7 @@ class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase): self.assertTrue(np.allclose(out[0], ref_out)) paddle.disable_static() - x = paddle.to_variable(self.x) + x = paddle.to_tensor(self.x) y = F.log_softmax(x, axis, dtype) self.assertTrue(np.allclose(y.numpy(), ref_out), True) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_logsumexp.py b/python/paddle/fluid/tests/unittests/test_logsumexp.py index cf9203dffcbaa5da641b3f7cb8925ac9efcbe115..9032293070a9697aada6034d0b6028eeccc310dd 100644 --- a/python/paddle/fluid/tests/unittests/test_logsumexp.py +++ b/python/paddle/fluid/tests/unittests/test_logsumexp.py @@ -111,7 +111,7 @@ class TestLogsumexpAPI(unittest.TestCase): self.assertTrue(np.allclose(res[0], out_ref)) paddle.disable_static(self.place) - x = paddle.to_variable(self.x) + x = paddle.to_tensor(self.x) out = paddle.logsumexp(x, axis, keepdim) self.assertTrue(np.allclose(out.numpy(), out_ref)) paddle.enable_static() @@ -126,7 +126,7 @@ class TestLogsumexpAPI(unittest.TestCase): def test_alias(self): paddle.disable_static(self.place) - x = paddle.to_variable(self.x) + x = paddle.to_tensor(self.x) out1 = paddle.logsumexp(x) out2 = paddle.tensor.logsumexp(x) out3 = paddle.tensor.math.logsumexp(x) diff --git a/python/paddle/fluid/tests/unittests/test_matmul_op.py b/python/paddle/fluid/tests/unittests/test_matmul_op.py index 3eb822bfed89b80bccc08fe0d96b6c4b2f9b4ec4..2d5f098a7fe86439d8b829a3b11cf23dfe072b77 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_op.py @@ -14,6 +14,7 @@ from __future__ import print_function +import paddle.fluid.core as core import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index 
884139a23d51c95c79439b91d501dc935baeae36..640771df23b726bd0a8a36b168bc5428fd953c45 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -65,15 +65,21 @@ class TestMatMulV2Op(OpTest): self.y_shape = (100, ) self.trans_x = False self.trans_y = False + + def init_kernel_type(self): self.dtype = "float64" def setUp(self): + self.init_kernel_type() self.config() self.op_type = "matmul_v2" x = np.random.random(self.x_shape).astype(self.dtype) y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + x = -0.1 + 0.2 * x + y = -0.1 + 0.2 * y result = reference_matmul(x, y, self.trans_x, self.trans_y) - + result = result.astype(self.dtype) self.inputs = { 'X': x, 'Y': y, @@ -98,7 +104,6 @@ class TestMatMuklOp2(TestMatMulV2Op): self.y_shape = (1, 3, 2, 100) self.trans_x = False self.trans_y = True - self.dtype = "float64" class TestMatMuklOp3(TestMatMulV2Op): @@ -111,7 +116,6 @@ class TestMatMuklOp3(TestMatMulV2Op): self.y_shape = (1, 1, 100, 2) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp4(TestMatMulV2Op): @@ -124,7 +128,6 @@ class TestMatMuklOp4(TestMatMulV2Op): self.y_shape = (1, 2, 100, 2) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp5(TestMatMulV2Op): @@ -133,11 +136,10 @@ class TestMatMuklOp5(TestMatMulV2Op): """ def config(self): - self.x_shape = (1, 1, 100, 2) + self.x_shape = (1, 1, 100, 1) self.y_shape = (100, ) self.trans_x = True self.trans_y = False - self.dtype = "float64" class TestMatMuklOp6(TestMatMulV2Op): @@ -150,7 +152,6 @@ class TestMatMuklOp6(TestMatMulV2Op): self.y_shape = (100, ) self.trans_x = True self.trans_y = False - self.dtype = "float64" class TestMatMuklOp7(TestMatMulV2Op): @@ -163,7 +164,6 @@ class TestMatMuklOp7(TestMatMulV2Op): self.y_shape = (100, ) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp8(TestMatMulV2Op): @@ -176,7 +176,6 @@ class TestMatMuklOp8(TestMatMulV2Op): self.y_shape = (1, 1, 100, 2) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp9(TestMatMulV2Op): @@ -189,7 +188,6 @@ class TestMatMuklOp9(TestMatMulV2Op): self.y_shape = (2, 1, 2, 100) self.trans_x = False self.trans_y = True - self.dtype = "float64" class TestMatMuklOp10(TestMatMulV2Op): @@ -198,11 +196,10 @@ class TestMatMuklOp10(TestMatMulV2Op): """ def config(self): - self.x_shape = (1, 1, 2, 100) - self.y_shape = (1, 2, 100, 2) + self.x_shape = (1, 1, 25, 4) + self.y_shape = (1, 2, 4, 25) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp11(TestMatMulV2Op): @@ -215,7 +212,6 @@ class TestMatMuklOp11(TestMatMulV2Op): self.y_shape = (1, 1, 100, 2) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp12(TestMatMulV2Op): @@ -224,11 +220,10 @@ class TestMatMuklOp12(TestMatMulV2Op): """ def config(self): - self.x_shape = (2, 1, 100, 2) - self.y_shape = (1, 1, 100, 2) + self.x_shape = (2, 1, 4, 25) + self.y_shape = (1, 1, 4, 25) self.trans_x = True self.trans_y = False - self.dtype = "float64" class TestMatMuklOp13(TestMatMulV2Op): @@ -237,11 +232,10 @@ class TestMatMuklOp13(TestMatMulV2Op): """ def config(self): - self.x_shape = (2, 2, 100, 2) - self.y_shape = (2, 2, 100, 2) + self.x_shape = (2, 2, 2, 50) + self.y_shape = (2, 2, 2, 50) self.trans_x = True self.trans_y = False - self.dtype = "float64" class TestMatMuklOp14(TestMatMulV2Op): @@ -254,7 +248,6 @@ class 
TestMatMuklOp14(TestMatMulV2Op): self.y_shape = (1, 2, 2, 100, 2) self.trans_x = True self.trans_y = False - self.dtype = "float64" class TestMatMuklOp15(TestMatMulV2Op): @@ -267,7 +260,6 @@ class TestMatMuklOp15(TestMatMulV2Op): self.y_shape = (1, 2, 2, 100, 1) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp16(TestMatMulV2Op): @@ -277,10 +269,9 @@ class TestMatMuklOp16(TestMatMulV2Op): def config(self): self.x_shape = (100) - self.y_shape = (1, 2, 2, 100, 1) + self.y_shape = (1, 2, 2, 100, 2) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp17(TestMatMulV2Op): @@ -293,7 +284,54 @@ class TestMatMuklOp17(TestMatMulV2Op): self.y_shape = (100) self.trans_x = False self.trans_y = False - self.dtype = "float64" + + +#--------------------test matmul fp16-------------------- + + +def create_test_fp16_class(parent, atol=0.001, max_relative_error=1.0): + @unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + class TestMatMulOpFp16Case(parent): + def init_kernel_type(self): + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=atol) + + def test_check_grad(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_grad_with_place( + place, ['X', 'Y'], + 'Out', + max_relative_error=max_relative_error) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestMatMulOpFp16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpFp16Case + + +create_test_fp16_class(TestMatMulV2Op) +create_test_fp16_class(TestMatMuklOp2) +create_test_fp16_class(TestMatMuklOp3) +create_test_fp16_class(TestMatMuklOp4) +create_test_fp16_class(TestMatMuklOp5) +create_test_fp16_class(TestMatMuklOp6) +create_test_fp16_class(TestMatMuklOp7) +create_test_fp16_class(TestMatMuklOp8) +create_test_fp16_class(TestMatMuklOp9) +create_test_fp16_class(TestMatMuklOp10) +create_test_fp16_class(TestMatMuklOp11) +create_test_fp16_class(TestMatMuklOp12) +create_test_fp16_class(TestMatMuklOp13) +create_test_fp16_class(TestMatMuklOp14) +create_test_fp16_class(TestMatMuklOp15) +create_test_fp16_class(TestMatMuklOp16) +create_test_fp16_class(TestMatMuklOp17) class TestMatMulV2API(unittest.TestCase): @@ -331,6 +369,17 @@ class TestMatMulV2API(unittest.TestCase): y = paddle.to_tensor(input_y) result = paddle.matmul(x, y) + def test_dygraph_fp16(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + with fluid.dygraph.guard(place): + input_x = np.random.random([4, 3]).astype("float16") + input_y = np.random.random([3, 4]).astype("float16") + x = paddle.to_tensor(input_x) + y = paddle.to_tensor(input_y) + result = paddle.matmul(x, y) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_max_op.py b/python/paddle/fluid/tests/unittests/test_max_op.py index c9afc4bec66f2927a674ac15e807fe01f724c64f..4786d790b148177a9c687103260667309d6ec3d8 100644 --- a/python/paddle/fluid/tests/unittests/test_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_max_op.py @@ -80,7 +80,7 @@ class ApiMaxTest(unittest.TestCase): def test_imperative_api(self): paddle.disable_static() np_x = np.array([10, 10]).astype('float64') - x = paddle.to_variable(np_x) + x = paddle.to_tensor(np_x) z = paddle.max(x, axis=0) np_z = z.numpy() z_expected = np.array(np.max(np_x, axis=0)) diff 
--git a/python/paddle/fluid/tests/unittests/test_maximum_op.py b/python/paddle/fluid/tests/unittests/test_maximum_op.py index 5645597007a00cac9c75ec1ae90bc00a5bc75f22..54657d7900e3d43a54adf755e9b213727766db7d 100644 --- a/python/paddle/fluid/tests/unittests/test_maximum_op.py +++ b/python/paddle/fluid/tests/unittests/test_maximum_op.py @@ -61,8 +61,8 @@ class ApiMaximumTest(unittest.TestCase): def test_dynamic_api(self): paddle.disable_static() np_x = np.array([10, 10]).astype('float64') - x = paddle.to_variable(self.input_x) - y = paddle.to_variable(self.input_y) + x = paddle.to_tensor(self.input_x) + y = paddle.to_tensor(self.input_y) z = paddle.maximum(x, y) np_z = z.numpy() z_expected = np.array(np.maximum(self.input_x, self.input_y)) @@ -73,8 +73,8 @@ class ApiMaximumTest(unittest.TestCase): np_x = np.random.rand(5, 4, 3, 2).astype("float64") np_y = np.random.rand(4, 3).astype("float64") - x = paddle.to_variable(self.input_x) - y = paddle.to_variable(self.input_y) + x = paddle.to_tensor(self.input_x) + y = paddle.to_tensor(self.input_y) result_1 = paddle.maximum(x, y, axis=1) result_2 = paddle.maximum(x, y, axis=-2) self.assertEqual((result_1.numpy() == result_2.numpy()).all(), True) diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py index 29e79b096cf790858e8e07aedc5c6b76881e8f82..f0094e703cd0d2a8113f70ea99dc8e71cef5b86f 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_op.py +++ b/python/paddle/fluid/tests/unittests/test_mean_op.py @@ -204,7 +204,7 @@ class TestMeanAPI(unittest.TestCase): paddle.disable_static(self.place) def test_case(x, axis=None, keepdim=False): - x_tensor = paddle.to_variable(x) + x_tensor = paddle.to_tensor(x) out = paddle.mean(x_tensor, axis, keepdim) if isinstance(axis, list): axis = tuple(axis) diff --git a/python/paddle/fluid/tests/unittests/test_min_op.py b/python/paddle/fluid/tests/unittests/test_min_op.py index b9eff05c5ea9fb585421b6f99bf55b3bb95bf9ff..9c15d7216352c20031b92263a0344ca6eac76c89 100644 --- a/python/paddle/fluid/tests/unittests/test_min_op.py +++ b/python/paddle/fluid/tests/unittests/test_min_op.py @@ -80,7 +80,7 @@ class ApiMinTest(unittest.TestCase): def test_imperative_api(self): paddle.disable_static() np_x = np.array([10, 10]).astype('float64') - x = paddle.to_variable(np_x) + x = paddle.to_tensor(np_x) z = paddle.min(x, axis=0) np_z = z.numpy() z_expected = np.array(np.min(np_x, axis=0)) diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py index 5f223de1954f7b401ac031265cca8c2e661c7392..927383c1223d54753dd59b3a6c40ea0b9594c378 100644 --- a/python/paddle/fluid/tests/unittests/test_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_mul_op.py @@ -18,6 +18,8 @@ import unittest import numpy as np import paddle import paddle.fluid.core as core +import sys +sys.path.append("..") from op_test import OpTest import paddle.fluid as fluid from paddle.fluid import Program, program_guard @@ -175,57 +177,5 @@ class TestFP16MulOp2(TestMulOp2): no_grad_set=set('Y')) -@unittest.skipIf(not core.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPUMulOp1(TestMulOp): - def init_dtype_type(self): - self.dtype = np.float32 - - def test_check_output(self): - place = core.XPUPlace(0) - self.check_output_with_place(place, atol=1e-1) - - def test_check_grad_normal(self): - place = core.XPUPlace(0) - self.check_grad_with_place( - place, ['X', 'Y'], 'Out', max_relative_error=0.5) - - def 
test_check_grad_ingore_x(self): - place = core.XPUPlace(0) - self.check_grad_with_place( - place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) - - def test_check_grad_ingore_y(self): - place = core.XPUPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) - - -@unittest.skipIf(not core.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPUMulOp2(TestMulOp2): - def init_dtype_type(self): - self.dtype = np.float32 - - def test_check_output(self): - place = core.XPUPlace(0) - self.check_output_with_place(place, atol=2e-1) - - def test_check_grad_normal(self): - place = core.XPUPlace(0) - self.check_grad_with_place( - place, ['X', 'Y'], 'Out', max_relative_error=0.9) - - def test_check_grad_ingore_x(self): - place = core.XPUPlace(0) - self.check_grad_with_place( - place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) - - def test_check_grad_ingore_y(self): - place = core.XPUPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y')) - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py index a89b9fde7f92de0d493ad87a2f0950548ba8ff98..cb4bd16ce219f8a649716d8efff07eb82d5fffc4 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py @@ -130,5 +130,41 @@ class TestBatchNormDoubleGradCheckCase4(TestBatchNormDoubleGradCheck): self.shape = [2, 2, 3, 4, 5] +class TestBatchNormDoubleGradCheckCase5(TestBatchNormDoubleGradCheck): + @prog_scope() + def func(self, place): + prog = fluid.Program() + with fluid.program_guard(prog): + np.random.seed() + dtype = "float32" + eps = 0.005 + atol = 2e-4 + chn = self.shape[1] if self.data_layout == 'NCHW' else self.shape[ + -1] + x = layers.create_parameter(dtype=dtype, shape=self.shape, name='x') + z = fluid.layers.batch_norm( + input=x, + data_layout=self.data_layout, + use_global_stats=self.use_global_stats) + x_arr = np.random.uniform(-1, 1, self.shape).astype(dtype) + w, b = prog.global_block().all_parameters()[1:3] + w_arr = np.ones(chn).astype(dtype) + b_arr = np.zeros(chn).astype(dtype) + gradient_checker.double_grad_check( + [x, w, b], + z, + x_init=[x_arr, w_arr, b_arr], + atol=atol, + place=place, + eps=eps) + + +class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5): + def init_test(self): + self.data_layout = 'NCHW' + self.use_global_stats = True + self.shape = [2, 3, 4, 5] + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py index 74d44d0f8b66739f7883173081f1da1250335b17..fee3494558604fb00f767261c06d4b3612e62ad0 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py @@ -29,19 +29,23 @@ IMAGE_SIZE = 784 CLASS_NUM = 10 -# define a random dataset -class RandomDataset(paddle.io.Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): +def random_batch_reader(): + def _get_random_inputs_and_labels(): np.random.seed(SEED) - image = np.random.random([IMAGE_SIZE]).astype('float32') - label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + image = np.random.random([BATCH_SIZE, IMAGE_SIZE]).astype('float32') + label = 
np.random.randint(0, CLASS_NUM - 1, ( + BATCH_SIZE, + 1, )).astype('int64') return image, label - def __len__(self): - return self.num_samples + def __reader__(): + for _ in range(BATCH_NUM): + batch_image, batch_label = _get_random_inputs_and_labels() + batch_image = paddle.to_tensor(batch_image) + batch_label = paddle.to_tensor(batch_label) + yield batch_image, batch_label + + return __reader__ class LinearNet(nn.Layer): @@ -66,8 +70,7 @@ def train(layer, loader, loss_fn, opt): class TestSaveLoad(unittest.TestCase): def setUp(self): # enable dygraph mode - self.place = paddle.CPUPlace() - paddle.disable_static(self.place) + paddle.disable_static() # config seed paddle.manual_seed(SEED) @@ -81,14 +84,8 @@ class TestSaveLoad(unittest.TestCase): adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters()) # create data loader - dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) - loader = paddle.io.DataLoader( - dataset, - places=self.place, - batch_size=BATCH_SIZE, - shuffle=True, - drop_last=True, - num_workers=2) + # TODO: the new DataLoader causes an unknown timeout on Windows, so replace it with a simple reader for now + loader = random_batch_reader() # train train(layer, loader, loss_fn, adam) @@ -103,8 +100,8 @@ class TestSaveLoad(unittest.TestCase): layer, opt = self.build_and_train_model() # save - layer_save_path = "linear.pdparams" - opt_save_path = "linear.pdopt" + layer_save_path = "test_paddle_save_load.linear.pdparams" + opt_save_path = "test_paddle_save_load.linear.pdopt" layer_state_dict = layer.state_dict() opt_state_dict = opt.state_dict() @@ -120,7 +117,7 @@ class TestSaveLoad(unittest.TestCase): # test save load in static mode paddle.enable_static() - static_save_path = "static_mode_test/linear.pdparams" + static_save_path = "static_mode_test/test_paddle_save_load.linear.pdparams" paddle.save(layer_state_dict, static_save_path) load_static_state_dict = paddle.load(static_save_path) self.check_load_state_dict(layer_state_dict, load_static_state_dict) @@ -133,15 +130,15 @@ class TestSaveLoad(unittest.TestCase): # 2. test save path format error with self.assertRaises(ValueError): - paddle.save(layer_state_dict, "linear.model/") + paddle.save(layer_state_dict, "test_paddle_save_load.linear.model/") # 3. test load path not exist error with self.assertRaises(ValueError): - paddle.load("linear.params") + paddle.load("test_paddle_save_load.linear.params") # 4. 
test load old save path error with self.assertRaises(ValueError): - paddle.load("linear") + paddle.load("test_paddle_save_load.linear") if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py index 25216175d59935535a352b02afc3c8f371cedd63..c1169dfc5210ac80a709afa06d3bf9a470a785b0 100644 --- a/python/paddle/fluid/tests/unittests/test_pool1d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py @@ -195,6 +195,23 @@ class TestPool1d_API(unittest.TestCase): result = max_pool1d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_dygraph_return_index_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result, index = F.max_pool1d( + input, kernel_size=2, stride=2, padding=0, return_indices=True) + + result_np = max_pool1D_forward_naive( + input_np, ksize=[2], strides=[2], paddings=[0]) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool1d_dg = paddle.nn.layer.MaxPool1d( + kernel_size=2, stride=None, padding=0) + result = max_pool1d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_dygraph_padding_same(self, place): with fluid.dygraph.guard(place): input_np = np.random.random([2, 3, 32]).astype("float32") @@ -228,6 +245,7 @@ class TestPool1d_API(unittest.TestCase): self.check_avg_static_results(place) self.check_max_dygraph_padding_same(place) self.check_avg_dygraph_padding_same(place) + self.check_max_dygraph_return_index_results(place) class TestPool2dError_API(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_randn_op.py b/python/paddle/fluid/tests/unittests/test_randn_op.py index 9d2c03f3bba914d8f6b06b54ce0e19c168edb9e3..4ddd98a8a73424d3dee5ff640a73f5b49e4453c5 100644 --- a/python/paddle/fluid/tests/unittests/test_randn_op.py +++ b/python/paddle/fluid/tests/unittests/test_randn_op.py @@ -63,7 +63,7 @@ class TestRandnOpForDygraph(unittest.TestCase): dim_2 = paddle.fill_constant([1], "int32", 50) x3 = paddle.randn(shape=[dim_1, dim_2, 784]) - var_shape = paddle.to_variable(np.array(shape)) + var_shape = paddle.to_tensor(np.array(shape)) x4 = paddle.randn(var_shape) for out in [x1, x2, x3, x4]: diff --git a/python/paddle/fluid/tests/unittests/test_retain_graph.py b/python/paddle/fluid/tests/unittests/test_retain_graph.py index 9abbee173852baf9db998aad3b71edabdb3e11ed..98c7e3800c20c900b9cdcb01eacd7ab20a612780 100644 --- a/python/paddle/fluid/tests/unittests/test_retain_graph.py +++ b/python/paddle/fluid/tests/unittests/test_retain_graph.py @@ -105,8 +105,8 @@ class TestRetainGraph(unittest.TestCase): A = np.random.rand(2, 3, 32, 32).astype('float32') B = np.random.rand(2, 3, 32, 32).astype('float32') - realA = paddle.to_variable(A) - realB = paddle.to_variable(B) + realA = paddle.to_tensor(A) + realB = paddle.to_tensor(B) fakeB = g(realA) optim_d.clear_gradients() diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py index 7c7a71a3be1b508c850048c3945f29ef7424654c..067d1ea5f73bf7d7af8a3511fa18dbc38b148656 100644 --- a/python/paddle/fluid/tests/unittests/test_transformer_api.py +++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py @@ -487,24 +487,24 @@ class TestTransformer(unittest.TestCase): dropout=dropout, weight_attr=[None], bias_attr=[False]) - src = paddle.to_variable( + src 
= paddle.to_tensor( np.random.rand(batch_size, source_length, d_model).astype( "float32")) - tgt = paddle.to_variable( + tgt = paddle.to_tensor( np.random.rand(batch_size, target_length, d_model).astype( "float32")) src_mask = np.zeros((batch_size, n_head, source_length, source_length)).astype("float32") src_mask[0][0][0][0] = -np.inf - src_mask = paddle.to_variable(src_mask) + src_mask = paddle.to_tensor(src_mask) tgt_mask = np.zeros((batch_size, n_head, target_length, target_length)).astype("float32") tgt_mask[0][0][0][0] = -1e9 memory_mask = np.zeros((batch_size, n_head, target_length, source_length)).astype("float32") memory_mask[0][0][0][0] = -1e9 - tgt_mask, memory_mask = paddle.to_variable( - tgt_mask), paddle.to_variable(memory_mask) + tgt_mask, memory_mask = paddle.to_tensor( + tgt_mask), paddle.to_tensor(memory_mask) trans_output = transformer(src, tgt, src_mask, tgt_mask, memory_mask) @@ -521,24 +521,24 @@ class TestTransformer(unittest.TestCase): dropout=dropout, weight_attr=[None, None], bias_attr=[False, False]) - src = paddle.to_variable( + src = paddle.to_tensor( np.random.rand(batch_size, source_length, d_model).astype( "float32")) - tgt = paddle.to_variable( + tgt = paddle.to_tensor( np.random.rand(batch_size, target_length, d_model).astype( "float32")) src_mask = np.zeros((batch_size, n_head, source_length, source_length)).astype("float32") src_mask[0][0][0][0] = -np.inf - src_mask = paddle.to_variable(src_mask) + src_mask = paddle.to_tensor(src_mask) tgt_mask = np.zeros((batch_size, n_head, target_length, target_length)).astype("float32") tgt_mask[0][0][0][0] = -1e9 memory_mask = np.zeros((batch_size, n_head, target_length, source_length)).astype("float32") memory_mask[0][0][0][0] = -1e9 - tgt_mask, memory_mask = paddle.to_variable( - tgt_mask), paddle.to_variable(memory_mask) + tgt_mask, memory_mask = paddle.to_tensor( + tgt_mask), paddle.to_tensor(memory_mask) trans_output = transformer(src, tgt, src_mask, tgt_mask, memory_mask) @@ -555,24 +555,24 @@ class TestTransformer(unittest.TestCase): dropout=dropout, weight_attr=[None, None, None], bias_attr=[False, False, True]) - src = paddle.to_variable( + src = paddle.to_tensor( np.random.rand(batch_size, source_length, d_model).astype( "float32")) - tgt = paddle.to_variable( + tgt = paddle.to_tensor( np.random.rand(batch_size, target_length, d_model).astype( "float32")) src_mask = np.zeros((batch_size, n_head, source_length, source_length)).astype("float32") src_mask[0][0][0][0] = -np.inf - src_mask = paddle.to_variable(src_mask) + src_mask = paddle.to_tensor(src_mask) tgt_mask = np.zeros((batch_size, n_head, target_length, target_length)).astype("float32") tgt_mask[0][0][0][0] = -1e9 memory_mask = np.zeros((batch_size, n_head, target_length, source_length)).astype("float32") memory_mask[0][0][0][0] = -1e9 - tgt_mask, memory_mask = paddle.to_variable( - tgt_mask), paddle.to_variable(memory_mask) + tgt_mask, memory_mask = paddle.to_tensor( + tgt_mask), paddle.to_tensor(memory_mask) trans_output = transformer(src, tgt, src_mask, tgt_mask, memory_mask) @@ -588,24 +588,24 @@ class TestTransformer(unittest.TestCase): dim_feedforward=dim_feedforward, dropout=dropout, bias_attr=False) - src = paddle.to_variable( + src = paddle.to_tensor( np.random.rand(batch_size, source_length, d_model).astype( "float32")) - tgt = paddle.to_variable( + tgt = paddle.to_tensor( np.random.rand(batch_size, target_length, d_model).astype( "float32")) src_mask = np.zeros((batch_size, n_head, source_length, source_length)).astype("float32") 
src_mask[0][0][0][0] = -np.inf - src_mask = paddle.to_variable(src_mask) + src_mask = paddle.to_tensor(src_mask) tgt_mask = np.zeros((batch_size, n_head, target_length, target_length)).astype("float32") tgt_mask[0][0][0][0] = -1e9 memory_mask = np.zeros((batch_size, n_head, target_length, source_length)).astype("float32") memory_mask[0][0][0][0] = -1e9 - tgt_mask, memory_mask = paddle.to_variable( - tgt_mask), paddle.to_variable(memory_mask) + tgt_mask, memory_mask = paddle.to_tensor( + tgt_mask), paddle.to_tensor(memory_mask) trans_output = transformer(src, tgt, src_mask, tgt_mask, memory_mask) diff --git a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py index 21e618a46201659fe0c4e5c67d1d9a8bafd70f1b..2cea3072809ec316abf55844b75c775bcd72042c 100644 --- a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py @@ -63,7 +63,7 @@ class TestZerosLikeImpeartive(unittest.TestCase): place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( ) else fluid.CPUPlace() paddle.disable_static(place) - x = paddle.to_variable(np.ones(shape)) + x = paddle.to_tensor(np.ones(shape)) for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]: out = zeros_like(x, dtype) self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(), diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op.py new file mode 100755 index 0000000000000000000000000000000000000000..788c110a592c0e18734e2b361a7edcdbc691230a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op.py @@ -0,0 +1,215 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
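+ +# Each case in this file follows the same OpTest recipe: setUp() builds the +# op's inputs and a NumPy reference output, and the shared base class compares +# the XPU kernel against that reference, roughly (an illustrative sketch of +# the skeleton below, not an extra test): +# +# if paddle.is_compiled_with_xpu(): +# place = paddle.XPUPlace(0) # first XPU device +# self.check_output_with_place(place, atol=1e-3) +# +# Most subclasses only swap in a different op_type and reference computation; +# TestXPUSigmoid additionally adds a gradient check.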
+ +from __future__ import print_function + +import sys +sys.path.append("..") +import unittest +import numpy as np +import paddle.fluid.core as core +from op_test import OpTest +from scipy.special import expit, erf +import paddle +import paddle.fluid as fluid +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.fluid import compiler, Program, program_guard + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUActivation(OpTest): + def setUp(self): + self.op_type = "exp" + self.init_dtype() + self.init_kernel_type() + + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.exp(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, atol=1e-3) + + def init_kernel_type(self): + pass + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSigmoid(TestXPUActivation): + def setUp(self): + self.op_type = "sigmoid" + self.init_dtype() + + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + out = 1 / (1 + np.exp(-x)) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', max_relative_error=0.01) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUTanh(TestXPUActivation): + def setUp(self): + self.op_type = "tanh" + self.init_dtype() + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.tanh(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSqrt(TestXPUActivation): + def setUp(self): + self.op_type = "sqrt" + self.init_dtype() + + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.sqrt(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUAbs(TestXPUActivation): + def setUp(self): + self.op_type = "abs" + self.init_dtype() + + x = np.random.uniform(-1, 1, [4, 25]).astype(self.dtype) + # Because we set delta = 0.005 in calculating numeric gradient, + # if x is too small, such as 0.002, x_neg will be -0.003 + # and x_pos will be 0.007, so the numeric gradient is inaccurate. 
+ # We should avoid this. + x[np.abs(x) < 0.005] = 0.02 + out = np.abs(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPURelu(TestXPUActivation): + def setUp(self): + self.op_type = "relu" + self.init_dtype() + + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + # The same reason as in TestXPUAbs + x[np.abs(x) < 0.005] = 0.02 + out = np.maximum(x, 0) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': x} + self.outputs = {'Out': out} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUGelu(TestXPUActivation): + def setUp(self): + self.op_type = "gelu" + self.init_dtype() + approximate = False + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + out = gelu(x, approximate) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {"approximate": approximate, 'use_xpu': True} + + +def gelu(x, approximate): + if approximate: + y_ref = 0.5 * x * (1.0 + np.tanh( + np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3)))) + else: + y_ref = 0.5 * x * (1 + erf(x / np.sqrt(2))) + return y_ref.astype(x.dtype) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPULog(TestXPUActivation): + def setUp(self): + self.op_type = "log" + self.init_dtype() + + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.log(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSquare(TestXPUActivation): + def setUp(self): + self.op_type = "square" + self.init_dtype() + + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.square(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUPow(TestXPUActivation): + def setUp(self): + self.op_type = "pow" + self.init_dtype() + + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np.power(x, 3) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {'factor': 3.0, 'use_xpu': True} + self.outputs = {'Out': out} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op.py new file mode 100644 index 0000000000000000000000000000000000000000..9c6e7d21c1a1918e1d24f580f1e6787632b41dbc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op.py @@ -0,0 +1,346 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
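+ +# The cases in this file cover same-shape addition plus NumPy-style +# broadcasting. When Y has fewer dimensions than X, the 'axis' attribute says +# where Y's shape lines up with X's; for example, with X of shape (2, 100, 3), +# Y of shape (100,) and axis=1 (the values used by +# TestElementwiseAddOp_broadcast_1 below), the expected output is simply: +# +# out = x + y.reshape(1, 100, 1) # broadcast Y along axis 1 of X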
+ +from __future__ import print_function +import sys +sys.path.append("..") +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from op_test import OpTest, skip_check_grad_ci +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard + + +class TestElementwiseAddOp(OpTest): + def init_kernel_type(self): + self.use_mkldnn = False + + def setUp(self): + self.op_type = "elementwise_add" + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': self.out} + + def test_check_output(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + self.check_output(check_dygraph=(self.use_mkldnn == False)) + + def test_check_grad_normal(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + if self.dtype == np.float16: + return + self.check_grad( + ['X', 'Y'], 'Out', check_dygraph=(self.use_mkldnn == False)) + + def test_check_grad_ignore_x(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + if self.dtype == np.float16: + return + self.check_grad( + ['Y'], + 'Out', + no_grad_set=set("X"), + check_dygraph=(self.use_mkldnn == False)) + + def test_check_grad_ignore_y(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + if self.dtype == np.float16: + return + self.check_grad( + ['X'], + 'Out', + no_grad_set=set('Y'), + check_dygraph=(self.use_mkldnn == False)) + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.add(self.x, self.y) + + def init_dtype(self): + self.dtype = np.float64 + + def init_axis(self): + self.axis = -1 + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUElementwiseAddOp(OpTest): + def setUp(self): + self.op_type = "elementwise_add" + self.init_dtype() + self.init_input_output() + self.init_axis() + + self.inputs = {'X': self.x, 'Y': self.y} + self.attrs = {'axis': self.axis, 'use_mkldnn': False, 'use_xpu': True} + self.outputs = {'Out': self.out} + + def test_check_output(self): + if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad_normal(self): + if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') + + def test_check_grad_ignore_x(self): + if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['Y'], 'Out') + + def test_check_grad_ignore_y(self): + if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.add(self.x, self.y) + + def init_dtype(self): + self.dtype = np.float32 + + def init_axis(self): + self.axis = -1 + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseAddOp_scalar(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 
4).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + self.out = self.x + self.y + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1,1) to test broadcast.") +class TestElementwiseAddOp_scalar2(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1, 1).astype(self.dtype) + self.out = self.x + self.y + + +class TestElementwiseAddOp_Vector(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.random((100, )).astype(self.dtype) + self.y = np.random.random((100, )).astype(self.dtype) + self.out = np.add(self.x, self.y) + + +class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(100, 1, 1) + + def init_axis(self): + self.axis = 0 + + +class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 100, 3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 100, 1) + + def init_axis(self): + self.axis = 1 + + +class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 100).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 1, 100) + + +class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype) + self.y = np.random.rand(10, 12).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 10, 12, 1) + + def init_axis(self): + self.axis = 1 + + +class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 3, 4).astype(self.dtype) + self.y = np.random.rand(100, 1).astype(self.dtype) + self.out = self.x + self.y.reshape(100, 1, 1, 1) + + def init_axis(self): + self.axis = 0 + + +class TestElementwiseAddOp_broadcast_5(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(10, 3, 12).astype(self.dtype) + self.y = np.random.rand(10, 1, 12).astype(self.dtype) + self.out = self.x + self.y + + +class TestElementwiseAddOp_broadcast_6(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype) + self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype) + self.out = self.x + self.y + + +class TestElementwiseAddOp_broadcast_7(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(1, 1, 20, 5).astype(self.dtype) + self.y = np.random.rand(20, 5, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + +class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 10, 12).astype(self.dtype) + self.y = np.random.rand(10, 12).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 10, 12) + + def init_axis(self): + self.axis = 1 + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 1).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 1) + + def init_axis(self): + self.axis = 1 + + +class 
TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 3).astype(self.dtype) + self.y = np.random.rand(100, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 100).astype(self.dtype) + self.y = np.random.rand(1, 1, 100).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(10, 3, 1, 4).astype(self.dtype) + self.y = np.random.rand(10, 1, 12, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(10, 12).astype(self.dtype) + self.y = np.random.rand(2, 3, 10, 12).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = 2 + + +class TestElementwiseAddOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # the input of elementwise_add must be Variable. + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + y1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + self.assertRaises(TypeError, fluid.layers.elementwise_add, x1, y1) + + # the input dtype of elementwise_add must be float16 or float32 or float64 or int32 or int64 + # float16 only can be set on GPU place + x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="uint8") + y2 = fluid.layers.data(name='y2', shape=[3, 4, 5, 6], dtype="uint8") + self.assertRaises(TypeError, fluid.layers.elementwise_add, x2, y2) + + +class TestAddOp(unittest.TestCase): + def test_name(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name="x", shape=[2, 3], dtype="float32") + y = fluid.data(name='y', shape=[2, 3], dtype='float32') + + y_1 = paddle.add(x, y, name='add_res') + self.assertEqual(('add_res' in y_1.name), True) + + def test_declarative(self): + with fluid.program_guard(fluid.Program()): + + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = paddle.add(x, y) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), fetch_list=[z.name]) + z_expected = np.array([3., 8., 6.]) + self.assertEqual((z_value == z_expected).all(), True) + + def test_dygraph(self): + with fluid.dygraph.guard(): + np_x = np.array([2, 3, 4]).astype('float64') + np_y = np.array([1, 5, 2]).astype('float64') + x = fluid.dygraph.to_variable(np_x) + y = fluid.dygraph.to_variable(np_y) + z = paddle.add(x, y) + np_z = z.numpy() + z_expected = np.array([3., 8., 6.]) + self.assertEqual((np_z == z_expected).all(), True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op.py new file mode 100644 index 0000000000000000000000000000000000000000..ac32d224910a9254efa9f95135652af6ef8adafe --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op.py @@ -0,0 +1,355 @@ +# Copyright (c) 
2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +sys.path.append("..") +import paddle.fluid.core as core +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y): + BATCH_SIZE = 2 + M = 3 + N = 4 + K = 5 + if (dim_X == 1 and transpose_X) or (dim_Y == 1 and transpose_Y): + K = 1 + if dim_X == 1: + if transpose_X: + shape_X = [M] + else: + shape_X = [K] + if dim_Y == 1: + if transpose_Y: + shape_Y = [N] + else: + shape_Y = [K] + if dim_X >= 2: + if transpose_X: + shape_X = [K, M] + else: + shape_X = [M, K] + if dim_X == 3: + shape_X = [BATCH_SIZE] + shape_X + if dim_Y >= 2: + if transpose_Y: + shape_Y = [N, K] + else: + shape_Y = [K, N] + if dim_Y == 3: + shape_Y = [BATCH_SIZE] + shape_Y + return shape_X, shape_Y + + +def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): + """Reference forward implementation using np.matmul.""" + # np.matmul does not support the transpose flags, so we manually + # transpose X and Y appropriately. + if transpose_X: + if X.ndim == 1: + X = X.reshape((X.size, 1)) + elif X.ndim == 2: + X = X.T + else: + dim = [i for i in range(len(X.shape))] + dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] + X = np.transpose(X, tuple(dim)) + if transpose_Y: + if Y.ndim == 1: + Y = Y.reshape((1, Y.size)) + else: + dim = [i for i in range(len(Y.shape))] + dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] + Y = np.transpose(Y, tuple(dim)) + + Out = np.matmul(X, Y) + if not Out.shape: + # We do not support 0-dimensional Tensors (scalars). So where + # np.matmul outputs a scalar, we must convert to a Tensor of + # shape (1, ) instead. + # Everywhere else, we are compatible with np.matmul. 
+ Out = np.array([Out], dtype="float32") + return Out + + +class Generator(object): + def setUp(self): + self.op_type = "matmul" + X = np.random.random(self.shape_X).astype("float32") + Y = np.random.random(self.shape_Y).astype("float32") + Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y) + self.inputs = {'X': X, 'Y': Y} + self.attrs = { + 'transpose_X': self.transpose_X, + 'transpose_Y': self.transpose_Y + } + self.outputs = {'Out': Out} + + def test_check_output(self): + self.check_output() + if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len( + self.inputs['Y'].shape) and self.inputs['X'].shape[ + 0] == self.inputs['Y'].shape[0]: + place = paddle.XPUPlace(0) + self.check_output_with_place(place, atol=1e-3) + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-3) + if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len( + self.inputs['Y'].shape) and self.inputs['X'].shape[ + 0] == self.inputs['Y'].shape[0]: + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=5e-2) + + def test_check_grad_ignore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=1e-3, no_grad_set=set("X")) + if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len( + self.inputs['Y'].shape) and self.inputs['X'].shape[ + 0] == self.inputs['Y'].shape[0]: + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['Y'], + 'Out', + max_relative_error=5e-2, + no_grad_set=set("X")) + + def test_check_grad_ignore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=1e-3, no_grad_set=set('Y')) + if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len( + self.inputs['Y'].shape) and self.inputs['X'].shape[ + 0] == self.inputs['Y'].shape[0]: + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X'], + 'Out', + max_relative_error=5e-2, + no_grad_set=set('Y'))
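+ + +# The concrete matmul cases are generated near the bottom of this file with +# type(test_name, (Generator, OpTest), {...}); Generator only supplies setUp() +# and the check_* methods. Each generated class is roughly what one would write +# by hand, e.g. for dim_X=2, dim_Y=2 and no transposes (shapes follow from +# generate_compatible_shapes above with M=3, N=4, K=5): +# +# class TestMatMulOp_dimX_2_dim_Y_2_transX_False_transY_False(Generator, OpTest): +# shape_X = [3, 5] +# shape_Y = [5, 4] +# transpose_X = False +# transpose_Y = False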
+ + +class TestMatmulOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # The input type of matmul_op must be Variable. + input1 = 12 + self.assertRaises(TypeError, fluid.layers.matmul, input1, input1) + # The input dtype of matmul_op must be float32 or float64. + input2 = fluid.layers.data( + name='input2', shape=[10, 10], dtype="int32") + self.assertRaises(TypeError, fluid.layers.matmul, input2, input2) + input3 = fluid.layers.data( + name='input3', shape=[2, 2], dtype="float16") + fluid.layers.matmul(input3, input3) + + +# Negative dimension generation +def generate_negative_dims(in_shape): + from itertools import combinations + size = len(in_shape) + indices = list() + shapes = list() + for i in range(size): + indices.extend(list(combinations([j for j in range(size)], i + 1))) + for idx in indices: + shapes.append( + [in_shape[i] if i not in idx else -1 for i in range(size)]) + return shapes + + +# Build programs with input sizes that contain negative numbers +def test_negative_dims_program(obj): + for shape_x in generate_negative_dims(obj.shape_X): + for shape_y in generate_negative_dims(obj.shape_Y): + X = np.random.random(obj.shape_X).astype("float32") + Y = np.random.random(obj.shape_Y).astype("float32") + Ref = reference_matmul(X, Y, obj.transpose_X, obj.transpose_Y) + with program_guard(Program(), Program()): + x = fluid.data(name='x', shape=shape_x, dtype='float32') + y = fluid.data(name='y', shape=shape_y, dtype='float32') + output = fluid.layers.matmul(x, y, obj.transpose_X, + obj.transpose_Y) + obj.assertEqual(len(Ref.shape), len(output.shape)) + for idx in range(len(Ref.shape)): + if output.shape[idx] != -1: + obj.assertEqual(Ref.shape[idx], output.shape[idx]) + exe = fluid.Executor(fluid.CPUPlace()) + res, = exe.run(fluid.default_main_program(), + feed={'x': X, + 'y': Y}, + fetch_list=[output]) + obj.assertTrue(np.allclose(res, Ref, atol=1e-5)) + + +# Generate API test cases for all negative-dimension possibilities +def api_test(dim_x, dim_y, trans_x, trans_y): + test_name = ('TestMatMulAPI_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format( + dim_x, dim_y, trans_x, trans_y)) + shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x, + trans_y) + globals()[test_name] = type(test_name, (unittest.TestCase, ), { + 'shape_X': shape_x, + 'shape_Y': shape_y, + 'transpose_X': trans_x, + 'transpose_Y': trans_y, + 'test_program': test_negative_dims_program, + }) + + +# Generate operator cases for all possibilities +def inject_test(dim_x, dim_y, trans_x, trans_y): + test_name = ('TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format( + dim_x, dim_y, trans_x, trans_y)) + shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x, + trans_y) + globals()[test_name] = type(test_name, (Generator, OpTest), { + 'shape_X': shape_x, + 'shape_Y': shape_y, + 'transpose_X': trans_x, + 'transpose_Y': trans_y, + }) + + +for dim_X in (1, 2, 3): + for dim_Y in (1, 2, 3): + for transpose_x in (False, True): + for transpose_y in (False, True): + inject_test(dim_X, dim_Y, transpose_x, transpose_y) + api_test(dim_X, dim_Y, transpose_x, transpose_y) + + +# Test case n-dim +def generate_compatible_shapes(dim, transpose_X, transpose_Y): + M = 2 + N = 4 + K = 3 + shape_X = [2 for _ in range(dim - 2)] + shape_Y = [2 for _ in range(dim - 2)] + + if transpose_X: + shape_X += [K, M] + else: + shape_X += [M, K] + + if transpose_Y: + shape_Y += [N, K] + else: + shape_Y += [K, N] + + return shape_X, shape_Y + + +# Test case n-dim +for dim in [4]: + for transpose_X in [False, True]: + for transpose_Y in [False, True]: + test_name = ( + 'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format( + dim, dim, transpose_X, transpose_Y)) + shape_X, shape_Y = generate_compatible_shapes(dim, transpose_X, + transpose_Y) + globals()[test_name] = type(test_name, (Generator, OpTest), { + 
'shape_X': shape_X, + 'shape_Y': shape_Y, + 'transpose_X': transpose_X, + 'transpose_Y': transpose_Y, + }) + + +class API_TestMm(unittest.TestCase): + def test_out(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name="x", shape=[2], dtype="float64") + y = fluid.data(name='y', shape=[2], dtype='float64') + res = fluid.data(name="output", shape=[1], dtype="float64") + result = paddle.mm(x, y) + exe = fluid.Executor(fluid.CPUPlace()) + data1 = np.random.rand(2) + data2 = np.random.rand(2) + np_res = exe.run(feed={'x': data1, 'y': data2}, fetch_list=[result]) + expected_result = np.matmul( + data1.reshape(1, 2), data2.reshape(2, 1)) + + self.assertTrue( + np.allclose( + np_res, expected_result, atol=1e-5), + "two value is\ + {}\n{}, check diff!".format(np_res, expected_result)) + + def test_dygraph_without_out(self): + device = fluid.CPUPlace() + with fluid.dygraph.guard(device): + input_array1 = np.random.rand(3, 4).astype("float64") + input_array2 = np.random.rand(4, 3).astype("float64") + data1 = fluid.dygraph.to_variable(input_array1) + data2 = fluid.dygraph.to_variable(input_array2) + out = paddle.mm(data1, data2) + expected_result = np.matmul(input_array1, input_array2) + self.assertTrue(np.allclose(expected_result, out.numpy())) + + +class Test_API_Matmul(unittest.TestCase): + def test_dygraph_without_out(self): + device = fluid.CPUPlace() + with fluid.dygraph.guard(device): + input_array1 = np.random.rand(3, 4).astype("float64") + input_array2 = np.random.rand(4, 3).astype("float64") + data1 = fluid.dygraph.to_variable(input_array1) + data2 = fluid.dygraph.to_variable(input_array2) + out = paddle.matmul(data1, data2) + expected_result = np.matmul(input_array1, input_array2) + self.assertTrue(np.allclose(expected_result, out.numpy())) + + +class API_TestMmError(unittest.TestCase): + def test_errors(self): + def test_error1(): + with fluid.program_guard(fluid.Program(), fluid.Program()): + data1 = fluid.data(name="data1", shape=[10, 2], dtype="float32") + data2 = fluid.data(name="data2", shape=[3, 10], dtype="float32") + paddle.mm(data1, data2) + + self.assertRaises(ValueError, test_error1) + + def test_error2(): + with fluid.program_guard(fluid.Program(), fluid.Program()): + data1 = fluid.data( + name="data1", shape=[-1, 10, 2], dtype="float32") + data2 = fluid.data( + name="data2", shape=[-1, 2, 10], dtype="float32") + paddle.mm(data1, data2) + + test_error2() + + def test_error3(): + with fluid.program_guard(fluid.Program(), fluid.Program()): + data1 = fluid.data( + name="data1", shape=[10, 10, 2], dtype="float32") + data2 = fluid.data( + name="data2", shape=[3, 2, 10], dtype="float32") + paddle.mm(data1, data2) + + self.assertRaises(ValueError, test_error3) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mul_op.py b/python/paddle/fluid/tests/unittests/xpu/test_mul_op.py new file mode 100644 index 0000000000000000000000000000000000000000..94ab5b71e4fbf05fd7ccb61b44dea0cf2a9e0b34 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_mul_op.py @@ -0,0 +1,161 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +import sys +sys.path.append("..") +from op_test import OpTest +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +class TestMulOp(OpTest): + def setUp(self): + self.op_type = "mul" + self.dtype = np.float64 + self.init_dtype_type() + self.inputs = { + 'X': np.random.random((20, 5)).astype(self.dtype), + 'Y': np.random.random((5, 21)).astype(self.dtype) + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + + def init_dtype_type(self): + pass + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + def test_check_grad_ignore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ignore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) + + +class TestMulOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # The input type of mul_op must be Variable. + x1 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], fluid.CPUPlace()) + x2 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], fluid.CPUPlace()) + self.assertRaises(TypeError, fluid.layers.mul, x1, x2) + # The input dtype of mul_op must be float32 or float64. + x3 = fluid.layers.data(name='x3', shape=[4], dtype="int32") + x4 = fluid.layers.data(name='x4', shape=[4], dtype="int32") + self.assertRaises(TypeError, fluid.layers.mul, x3, x4)
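+ + +# mul flattens each input to a 2-D matrix before multiplying: with +# x_num_col_dims=2 and y_num_col_dims=2, an X of shape (3, 4, 2, 9) acts as a +# (3*4) x (2*9) matrix and a Y of shape (3, 6, 1, 2, 3) as a (3*6) x (1*2*3) +# matrix, which is exactly how TestMulOp2 below builds its reference output: +# +# out = np.dot(X.reshape(3 * 4, 2 * 9), Y.reshape(3 * 6, 1 * 2 * 3)) +# out = out.reshape(3, 4, 1, 2, 3)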
+ + +class TestMulOp2(OpTest): + def setUp(self): + self.op_type = "mul" + self.dtype = np.float64 + self.init_dtype_type() + self.inputs = { + 'X': np.random.random((3, 4, 2, 9)).astype(self.dtype), + 'Y': np.random.random((3, 6, 1, 2, 3)).astype(self.dtype) + } + self.attrs = { + 'x_num_col_dims': 2, + 'y_num_col_dims': 2, + } + result = np.dot(self.inputs['X'].reshape(3 * 4, 2 * 9), + self.inputs['Y'].reshape(3 * 6, 1 * 2 * 3)) + result = result.reshape(3, 4, 1, 2, 3) + self.outputs = {'Out': result} + + def init_dtype_type(self): + pass + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + def test_check_grad_ignore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set('X')) + + def test_check_grad_ignore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUMulOp1(TestMulOp): + def init_dtype_type(self): + self.dtype = np.float32 + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, atol=1e-1) + + def test_check_grad_normal(self): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=0.5) + + def test_check_grad_ignore_x(self): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ignore_y(self): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUMulOp2(TestMulOp2): + def init_dtype_type(self): + self.dtype = np.float32 + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, atol=2e-1) + + def test_check_grad_normal(self): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=0.9) + + def test_check_grad_ignore_x(self): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ignore_y(self): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y')) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/hapi/callbacks.py b/python/paddle/hapi/callbacks.py index 7ed571fa9c6a4a962b20397c999368dad0734ff0..69b7fedd72eed52cfd06715025f5cd88983e2e2a 100644 --- a/python/paddle/hapi/callbacks.py +++ b/python/paddle/hapi/callbacks.py @@ -301,10 +301,11 @@ class ProgBarLogger(Callback): train_dataset = paddle.vision.datasets.MNIST(mode='train') - model = paddle.Model(paddle.vision.LeNet(classifier_activation=None), + lenet = paddle.vision.LeNet() + model = paddle.Model(lenet, inputs, labels) - optim = paddle.optimizer.Adam(0.001) + optim = paddle.optimizer.Adam(0.001, parameters=lenet.parameters()) model.prepare(optimizer=optim, loss=paddle.nn.CrossEntropyLoss(), metrics=paddle.metric.Accuracy()) @@ -436,10 +437,11 @@ class ModelCheckpoint(Callback): train_dataset = paddle.vision.datasets.MNIST(mode='train') - model = 
paddle.Model(paddle.vision.LeNet(classifier_activation=None), + lenet = paddle.vision.LeNet() + model = paddle.Model(lenet, inputs, labels) - optim = paddle.optimizer.Adam(0.001) + optim = paddle.optimizer.Adam(0.001, parameters=lenet.parameters()) model.prepare(optimizer=optim, loss=paddle.nn.CrossEntropyLoss(), metrics=paddle.metric.Accuracy()) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index d00cdf1044bdcd41165777a63feb2950d51e1eb7..fd6161f1cf8ec43ccc391442ebdb7c12769de4cb 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -820,10 +820,9 @@ class Model(object): from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - # if use static graph, do not set - paddle.disable_static(device) net = nn.Sequential( + nn.Flatten(1), nn.Linear(784, 200), nn.Tanh(), nn.Linear(200, 10)) @@ -839,7 +838,7 @@ class Model(object): paddle.nn.CrossEntropyLoss(), paddle.metric.Accuracy()) - data = paddle.vision.datasets.MNIST(mode='train', chw_format=False) + data = paddle.vision.datasets.MNIST(mode='train') model.fit(data, epochs=2, batch_size=32, verbose=1) """ @@ -896,7 +903,6 @@ class Model(object): from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - paddle.disable_static(device) net = nn.Sequential( nn.Linear(784, 200), @@ -946,7 +952,6 @@ class Model(object): from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - paddle.disable_static(device) net = nn.Sequential( nn.Linear(784, 200), @@ -991,9 +996,12 @@ class Model(object): import numpy as np import paddle import paddle.nn as nn + from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - paddle.disable_static(device) + + input = InputSpec([None, 784], 'float32', 'x') + label = InputSpec([None, 1], 'int64', 'label') net = nn.Sequential( nn.Linear(784, 200), @@ -1001,7 +1009,7 @@ class Model(object): nn.Linear(200, 10), nn.Softmax()) - model = paddle.Model(net) + model = paddle.Model(net, input, label) model.prepare() data = np.random.random(size=(4,784)).astype(np.float32) out = model.test_batch([data]) @@ -1052,6 +1060,7 @@ class Model(object): def __init__(self): super(Mnist, self).__init__() self.net = nn.Sequential( + nn.Flatten(1), nn.Linear(784, 200), nn.Tanh(), nn.Linear(200, 10), @@ -1071,7 +1080,7 @@ class Model(object): optim = paddle.optimizer.SGD(learning_rate=1e-3, parameters=model.parameters()) model.prepare(optim, paddle.nn.CrossEntropyLoss()) - data = paddle.vision.datasets.MNIST(mode='train', chw_format=False) + data = paddle.vision.datasets.MNIST(mode='train') model.fit(data, epochs=1, batch_size=32, verbose=0) model.save('checkpoint/test') # save for training model.save('inference_model', False) # save for inference @@ -1118,15 +1127,18 @@ class Model(object): import paddle import paddle.nn as nn - + from paddle.static import InputSpec 
+ device = paddle.set_device('cpu') - paddle.disable_static(device) + + input = InputSpec([None, 784], 'float32', 'x') model = paddle.Model(nn.Sequential( nn.Linear(784, 200), nn.Tanh(), nn.Linear(200, 10), - nn.Softmax())) + nn.Softmax()), input) + model.save('checkpoint/test') model.load('checkpoint/test') """ @@ -1191,13 +1203,15 @@ class Model(object): import paddle import paddle.nn as nn + from paddle.static import InputSpec - paddle.disable_static() - + input = InputSpec([None, 784], 'float32', 'x') + model = paddle.Model(nn.Sequential( nn.Linear(784, 200), nn.Tanh(), - nn.Linear(200, 10))) + nn.Linear(200, 10)), input) + params = model.parameters() """ return self._adapter.parameters() @@ -1339,7 +1353,7 @@ class Model(object): label = InputSpec([None, 1], 'int64', 'label') model = paddle.Model( - paddle.vision.models.LeNet(classifier_activation=None), + paddle.vision.models.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) @@ -1376,7 +1390,7 @@ class Model(object): label = InputSpec([None, 1], 'int64', 'label') model = paddle.Model( - paddle.vision.models.LeNet(classifier_activation=None), input, label) + paddle.vision.models.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) model.prepare( @@ -1509,7 +1523,7 @@ class Model(object): # imperative mode paddle.disable_static() - model = paddle.Model(paddle.vision.models.LeNet()) + model = paddle.Model(paddle.vision.models.LeNet(), input, label) model.prepare(metrics=paddle.metric.Accuracy()) result = model.evaluate(val_dataset, batch_size=64) print(result) @@ -1606,19 +1620,20 @@ class Model(object): test_dataset = MnistDataset(mode='test', return_label=False) - # declarative mode + # imperative mode input = InputSpec([-1, 1, 28, 28], 'float32', 'image') model = paddle.Model(paddle.vision.models.LeNet(), input) model.prepare() - result = model.predict(test_dataset, batch_size=64) print(len(result[0]), result[0][0].shape) - # imperative mode + # declarative mode device = paddle.set_device('cpu') - paddle.disable_static(device) - model = paddle.Model(paddle.vision.models.LeNet()) + paddle.enable_static() + input = InputSpec([-1, 1, 28, 28], 'float32', 'image') + model = paddle.Model(paddle.vision.models.LeNet(), input) model.prepare() + result = model.predict(test_dataset, batch_size=64) print(len(result[0]), result[0][0].shape) """ @@ -1875,15 +1890,11 @@ class Model(object): import paddle from paddle.static import InputSpec - - dynamic = True - device = paddle.set_device('cpu') - paddle.disable_static(device) if dynamic else None input = InputSpec([None, 1, 28, 28], 'float32', 'image') label = InputSpec([None, 1], 'int64', 'label') - model = paddle.Model(paddle.vision.LeNet(classifier_activation=None), + model = paddle.Model(paddle.vision.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index 1cd65171ff034e8b834c38184e4452796da985ca..f4a9b8c01d02a109f91aa717342ba47321f5f47e 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -182,7 +182,6 @@ class Accuracy(Metric): import numpy as np import paddle - paddle.disable_static() x = paddle.to_tensor(np.array([ [0.1, 0.2, 0.3, 0.4], [0.1, 0.4, 0.3, 0.2], @@ -202,11 +201,13 @@ class Accuracy(Metric): .. 
code-block:: python import paddle - - paddle.disable_static() + from paddle.static import InputSpec + + input = InputSpec([None, 1, 28, 28], 'float32', 'image') + label = InputSpec([None, 1], 'int64', 'label') train_dataset = paddle.vision.datasets.MNIST(mode='train') - model = paddle.Model(paddle.vision.LeNet(classifier_activation=None)) + model = paddle.Model(paddle.vision.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) model.prepare( diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 1eb9167d0352f36bfcb87db79ba23dce14bac507..bed5df8fa78c753565c8391ba414135e63d335aa 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -571,15 +571,26 @@ def max_pool1d(x, padding = _expand_low_nd_padding(padding) if in_dygraph_mode(): - pool_out = core.ops.max_pool2d_with_index( - x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, - 'paddings', padding, 'padding_algorithm', padding_algorithm, - 'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False, - 'exclusive', True, 'data_format', data_format) - return (squeeze(pool_out[0], [2]), squeeze( - pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2]) + if return_indices: + pool_out = core.ops.max_pool2d_with_index( + x, 'ksize', kernel_size, 'global_pooling', False, 'strides', + stride, 'paddings', padding, 'padding_algorithm', + padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, + 'use_mkldnn', False, 'exclusive', True, 'data_format', + data_format) + return (squeeze(pool_out[0], [2]), squeeze( + pool_out[1], + [2])) if return_indices else squeeze(pool_out[0], [2]) + else: + pool_out = core.ops.pool2d( + x, 'pooling_type', 'max', 'ksize', kernel_size, + 'global_pooling', False, 'padding_algorithm', padding_algorithm, + 'strides', stride, 'paddings', padding, 'use_cudnn', True, + 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, + 'data_format', data_format) + return squeeze(pool_out, [2]) - op_type = 'max_pool2d_with_index' + op_type = 'max_pool2d_with_index' if return_indices else "pool2d" helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) @@ -696,7 +707,7 @@ def max_pool2d(x, ) if in_dygraph_mode(): - if data_format == "NCHW": + if return_indices: output = core.ops.max_pool2d_with_index( x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, 'paddings', padding, 'padding_algorithm', @@ -704,7 +715,7 @@ def max_pool2d(x, 'use_mkldnn', False, 'exclusive', True, 'data_format', data_format) return output if return_indices else output[0] - elif data_format == "NHWC" and not return_indices: + else: output = core.ops.pool2d( x, 'pooling_type', 'max', 'ksize', kernel_size, 'global_pooling', False, 'padding_algorithm', padding_algorithm, @@ -713,7 +724,7 @@ def max_pool2d(x, 'data_format', data_format) return output - op_type = 'max_pool2d_with_index' if data_format == "NCHW" else "pool2d" + op_type = 'max_pool2d_with_index' if return_indices else "pool2d" helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) @@ -822,7 +833,7 @@ def max_pool3d(x, ) if in_dygraph_mode(): - if data_format == "NCDHW": + if return_indices: output = core.ops.max_pool3d_with_index( x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides', stride, 'paddings', padding, 'global_pooling', False, @@ -830,7 
+841,7 @@ def max_pool3d(x, 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, 'data_format', data_format) return output if return_indices else output[0] - elif data_format == "NDHWC" and not return_indices: + else: output = core.ops.pool3d( x, 'pooling_type', 'max', 'ksize', kernel_size, 'global_pooling', False, 'padding_algorithm', padding_algorithm, @@ -839,7 +850,7 @@ def max_pool3d(x, 'data_format', data_format) return output - op_type = "max_pool3d_with_index" if data_format == "NCDHW" else "pool3d" + op_type = "max_pool3d_with_index" if return_indices else "pool3d" helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py index eb70320ea7551de6e1117900e3769f000fdf23dd..d7a3cfcdb92debe0447cb4054478729e92dbab32 100644 --- a/python/paddle/static/input.py +++ b/python/paddle/static/input.py @@ -19,10 +19,12 @@ from paddle.fluid import core, Variable from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.data_feeder import check_type from paddle.fluid.framework import convert_np_dtype_to_dtype_ +from paddle.fluid.framework import static_only __all__ = ['data', 'InputSpec'] +@static_only def data(name, shape, dtype=None, lod_level=0): """ **Data Layer** diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index f27cfba487d78f284408815eaba933b18f303df9..15580b6618e6dc61d5e74216776417a02846a16a 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -156,8 +156,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): def __check_input(x, y): var_names = {'x': x, 'y': y} for name, val in var_names.items(): - check_variable_and_dtype(val, name, ['float32', 'float64'], - 'matmul') + check_variable_and_dtype( + val, name, ['float16', 'float32', 'float64'], 'matmul') __check_input(x, y) @@ -707,20 +707,14 @@ def cross(x, y, axis=None, name=None): Examples: .. code-block:: python import paddle - from paddle import to_variable - import numpy as np - paddle.disable_static() - data_x = np.array([[1.0, 1.0, 1.0], - [2.0, 2.0, 2.0], - [3.0, 3.0, 3.0]]) - data_y = np.array([[1.0, 1.0, 1.0], - [1.0, 1.0, 1.0], - [1.0, 1.0, 1.0]]) - x = to_variable(data_x) - y = to_variable(data_y) - + x = paddle.to_tensor([[1.0, 1.0, 1.0], + [2.0, 2.0, 2.0], + [3.0, 3.0, 3.0]]) + y = paddle.to_tensor([[1.0, 1.0, 1.0], + [1.0, 1.0, 1.0], + [1.0, 1.0, 1.0]]) z1 = paddle.cross(x, y) print(z1.numpy()) # [[-1. -1. -1.] diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 966544c7abb54ae7de163aa322890a55ee94d3d8..ce32fb76f5cd4da30f95baa3b8928d1c879477ca 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1650,12 +1650,11 @@ def cumsum(x, axis=None, dtype=None, name=None): .. 
code-block:: python import paddle - from paddle import to_variable import numpy as np paddle.disable_static() data_np = np.arange(12).reshape(3, 4) - data = to_variable(data_np) + data = paddle.to_tensor(data_np) y = paddle.cumsum(data) print(y.numpy()) diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt index 6fb73b08c11b417332b064df7408e78ed390cc2f..e1bc65a5d15c2883e14d20c5e06c2ee3cd726ea5 100644 --- a/python/paddle/tests/CMakeLists.txt +++ b/python/paddle/tests/CMakeLists.txt @@ -8,10 +8,6 @@ foreach(TEST_OP ${DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) endforeach() -# disable test_pretrained_model and test_vision_models -list(REMOVE_ITEM TEST_OPS test_pretrained_model) -list(REMOVE_ITEM TEST_OPS test_vision_models) - foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() diff --git a/python/paddle/tests/dist_hapi_mnist_dynamic.py b/python/paddle/tests/dist_hapi_mnist_dynamic.py index 13d966bf38f2aaed35e120aa4d25705cfc36c230..46d02789402b22263cfbd8cbdfeb6d66a5de900d 100644 --- a/python/paddle/tests/dist_hapi_mnist_dynamic.py +++ b/python/paddle/tests/dist_hapi_mnist_dynamic.py @@ -68,7 +68,7 @@ class TestDistTraning(unittest.TestCase): inputs = [Input(im_shape, 'float32', 'image')] labels = [Input([None, 1], 'int64', 'label')] - model = Model(LeNet(classifier_activation=None), inputs, labels) + model = Model(LeNet(), inputs, labels) optim = fluid.optimizer.Momentum( learning_rate=0.001, momentum=.9, parameter_list=model.parameters()) model.prepare(optim, CrossEntropyLoss(), Accuracy()) diff --git a/python/paddle/tests/dist_hapi_mnist_static.py b/python/paddle/tests/dist_hapi_mnist_static.py index 9d8e5f3652c9810579a0b66035a64d1d3b915bff..eab34a6dafbc354a24aa51e93a9fec9efc3b3cee 100644 --- a/python/paddle/tests/dist_hapi_mnist_static.py +++ b/python/paddle/tests/dist_hapi_mnist_static.py @@ -67,7 +67,7 @@ class TestDistTraning(unittest.TestCase): inputs = [Input(im_shape, 'float32', 'image')] labels = [Input([None, 1], 'int64', 'label')] - model = Model(LeNet(classifier_activation=None), inputs, labels) + model = Model(LeNet(), inputs, labels) optim = fluid.optimizer.Momentum( learning_rate=0.001, momentum=.9, parameter_list=model.parameters()) model.prepare(optim, CrossEntropyLoss(), Accuracy()) diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index 2de39c709d899e5ee56a0ec43c52233de132c920..34e7065cd2391f74dc9d3c62dd1a7eab7bea4afb 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -40,7 +40,7 @@ from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTra class LeNetDygraph(paddle.nn.Layer): - def __init__(self, num_classes=10, classifier_activation=None): + def __init__(self, num_classes=10): super(LeNetDygraph, self).__init__() self.num_classes = num_classes self.features = Sequential( @@ -55,8 +55,7 @@ class LeNetDygraph(paddle.nn.Layer): if num_classes > 0: self.fc = Sequential( - Linear(400, 120), Linear(120, 84), Linear(84, 10), - Softmax()) #Todo: accept any activation + Linear(400, 120), Linear(120, 84), Linear(84, 10)) def forward(self, inputs): x = self.features(inputs) @@ -67,6 +66,34 @@ class LeNetDygraph(paddle.nn.Layer): return x +class LeNetDeclarative(fluid.dygraph.Layer): + def __init__(self, num_classes=10): + super(LeNetDeclarative, self).__init__() + self.num_classes = num_classes + self.features = Sequential( + Conv2d( + 1, 6, 3, stride=1, padding=1), + ReLU(), + Pool2D(2, 'max', 2), + Conv2d( + 6, 16, 5, stride=1, 
padding=0), + ReLU(), + Pool2D(2, 'max', 2)) + + if num_classes > 0: + self.fc = Sequential( + Linear(400, 120), Linear(120, 84), Linear(84, 10)) + + @declarative + def forward(self, inputs): + x = self.features(inputs) + + if self.num_classes > 0: + x = fluid.layers.flatten(x, 1) + x = self.fc(x) + return x + + class MnistDataset(MNIST): def __init__(self, mode, return_label=True, sample_num=None): super(MnistDataset, self).__init__(mode=mode) @@ -198,7 +225,7 @@ class TestModel(unittest.TestCase): paddle.manual_seed(seed) paddle.framework.random._manual_program_seed(seed) - net = LeNet(classifier_activation=None) + net = LeNet() optim_new = fluid.optimizer.Adam( learning_rate=0.001, parameter_list=net.parameters()) model = Model(net, inputs=self.inputs, labels=self.labels) @@ -287,14 +314,12 @@ class TestModel(unittest.TestCase): class MyModel(paddle.nn.Layer): - def __init__(self, classifier_activation='softmax'): + def __init__(self): super(MyModel, self).__init__() self._fc = Linear(20, 10) - self._act = Softmax() #Todo: accept any activation def forward(self, x): y = self._fc(x) - y = self._act(y) return y @@ -311,7 +336,7 @@ class TestModelFunction(unittest.TestCase): def get_expect(): fluid.enable_dygraph(fluid.CPUPlace()) self.set_seed() - m = MyModel(classifier_activation=None) + m = MyModel() optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=m.parameters()) m.train() @@ -330,7 +355,7 @@ class TestModelFunction(unittest.TestCase): fluid.enable_dygraph(device) if dynamic else None self.set_seed() - net = MyModel(classifier_activation=None) + net = MyModel() optim2 = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=net.parameters()) @@ -374,7 +399,7 @@ class TestModelFunction(unittest.TestCase): for dynamic in [True, False]: device = paddle.set_device('cpu') fluid.enable_dygraph(device) if dynamic else None - net = MyModel(classifier_activation=None) + net = MyModel() inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] optim = fluid.optimizer.SGD(learning_rate=0.001, @@ -415,7 +440,7 @@ class TestModelFunction(unittest.TestCase): # dynamic saving device = paddle.set_device('cpu') fluid.enable_dygraph(device) - model = Model(MyModel(classifier_activation=None)) + model = Model(MyModel()) optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=model.parameters()) model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) @@ -424,7 +449,7 @@ class TestModelFunction(unittest.TestCase): inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] - model = Model(MyModel(classifier_activation=None), inputs, labels) + model = Model(MyModel(), inputs, labels) optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=model.parameters()) model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) @@ -434,7 +459,7 @@ class TestModelFunction(unittest.TestCase): def test_static_save_dynamic_load(self): path = tempfile.mkdtemp() - net = MyModel(classifier_activation=None) + net = MyModel() inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] optim = fluid.optimizer.SGD(learning_rate=0.001, @@ -446,7 +471,7 @@ class TestModelFunction(unittest.TestCase): device = paddle.set_device('cpu') fluid.enable_dygraph(device) #if dynamic else None - net = MyModel(classifier_activation=None) + net = MyModel() inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] optim = 
fluid.optimizer.SGD(learning_rate=0.001, @@ -582,13 +607,10 @@ class TestModelFunction(unittest.TestCase): class TestRaiseError(unittest.TestCase): def test_input_without_name(self): - net = MyModel(classifier_activation=None) + net = MyModel() inputs = [InputSpec([None, 10], 'float32')] labels = [InputSpec([None, 1], 'int64', 'label')] with self.assertRaises(ValueError): model = Model(net, inputs, labels) - def test_export_deploy_model_without_inputs_and_run_in_dygraph(self): - paddle.disable_static() - net = MyModel(classifier_activation=None) @@ -599,6 +625,10 @@ class TestRaiseError(unittest.TestCase): - model = Model(net) - model.save(save_dir, training=False) - paddle.enable_static() + def test_input_without_input_spec(self): + for dynamic in [True, False]: + paddle.disable_static() if dynamic else None + net = MyModel() + with self.assertRaises(TypeError): + model = Model(net) + paddle.enable_static() if __name__ == '__main__': diff --git a/python/paddle/tests/test_pretrained_model.py b/python/paddle/tests/test_pretrained_model.py index 641147d39e94f7c2bbb426900ed484546bad49c6..bf9c2a2ae061179bd9d656fa3cb23c5ac93c6c53 100644 --- a/python/paddle/tests/test_pretrained_model.py +++ b/python/paddle/tests/test_pretrained_model.py @@ -13,6 +13,8 @@ # limitations under the License. import unittest +import tempfile +import shutil import numpy as np import paddle @@ -23,27 +25,36 @@ import paddle.vision.models as models # test the predicted results of static graph and dynamic graph are equal # when using a pretrained model class TestPretrainedModel(unittest.TestCase): - def infer(self, x, arch, dygraph=True): - if dygraph: - paddle.disable_static() - - net = models.__dict__[arch](pretrained=True, classifier_activation=None) - inputs = [InputSpec([None, 3, 224, 224], 'float32', 'image')] - model = paddle.Model(network=net, inputs=inputs) - model.prepare() - res = model.test_batch(x) - - if dygraph: - paddle.enable_static() - return res + def infer(self, arch): + path = tempfile.mkdtemp() + x = np.array(np.random.random((2, 3, 224, 224)), dtype=np.float32) + res = {} + for dygraph in [True, False]: + if not dygraph: + paddle.enable_static() + + net = models.__dict__[arch]() + inputs = [InputSpec([None, 3, 224, 224], 'float32', 'image')] + model = paddle.Model(network=net, inputs=inputs) + model.prepare() + + if dygraph: + model.save(path) + res['dygraph'] = model.test_batch(x) + else: + model.load(path) + res['static'] = model.test_batch(x) + + if not dygraph: + paddle.disable_static() + + shutil.rmtree(path) + np.testing.assert_allclose(res['dygraph'], res['static']) def test_models(self): arches = ['mobilenet_v1', 'mobilenet_v2', 'resnet18'] for arch in arches: - x = np.array(np.random.random((2, 3, 224, 224)), dtype=np.float32) - y_dygraph = self.infer(x, arch) - y_static = self.infer(x, arch, dygraph=False) - np.testing.assert_allclose(y_dygraph, y_static) + self.infer(arch) if __name__ == '__main__': diff --git a/python/paddle/tests/test_vision_models.py b/python/paddle/tests/test_vision_models.py index 44f9ab5390122f086af4168e225fe2b5a2d8a9b2..6489b02615bb94269f83c4ed780e555c487eacbe 100644 --- a/python/paddle/tests/test_vision_models.py +++ b/python/paddle/tests/test_vision_models.py @@ -36,7 +36,7 @@ class TestVisonModels(unittest.TestCase): model.test_batch(x) def test_mobilenetv2_pretrained(self): - self.models_infer('mobilenet_v2', pretrained=True) + self.models_infer('mobilenet_v2', pretrained=False) def
test_mobilenetv1(self): self.models_infer('mobilenet_v1') diff --git a/python/paddle/vision/models/lenet.py b/python/paddle/vision/models/lenet.py index c2d4be7cda10d580af44154e6a03e0871ec20706..b30d5992f9adf792f0bae90e19b9c00c4d47c0a2 100644 --- a/python/paddle/vision/models/lenet.py +++ b/python/paddle/vision/models/lenet.py @@ -12,20 +12,19 @@ #See the License for the specific language governing permissions and #limitations under the License. -import paddle.fluid as fluid -from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax +import paddle +import paddle.nn as nn __all__ = ['LeNet'] -class LeNet(fluid.dygraph.Layer): +class LeNet(nn.Layer): """LeNet model from `"LeCun Y, Bottou L, Bengio Y, et al. Gradient-based learning applied to document recognition[J]. Proceedings of the IEEE, 1998, 86(11): 2278-2324.`_ Args: num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer will not be defined. Default: 10. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. Examples: .. code-block:: python @@ -35,28 +34,27 @@ class LeNet(fluid.dygraph.Layer): model = LeNet() """ - def __init__(self, num_classes=10, classifier_activation='softmax'): + def __init__(self, num_classes=10): super(LeNet, self).__init__() self.num_classes = num_classes - self.features = Sequential( - Conv2d( + self.features = nn.Sequential( + nn.Conv2d( 1, 6, 3, stride=1, padding=1), - ReLU(), - Pool2D(2, 'max', 2), - Conv2d( + nn.ReLU(), + nn.MaxPool2d(2, 2), + nn.Conv2d( 6, 16, 5, stride=1, padding=0), - ReLU(), - Pool2D(2, 'max', 2)) + nn.ReLU(), + nn.MaxPool2d(2, 2)) if num_classes > 0: - self.fc = Sequential( - Linear(400, 120), Linear(120, 84), Linear(84, 10), - Softmax()) #Todo: accept any activation + self.fc = nn.Sequential( + nn.Linear(400, 120), nn.Linear(120, 84), nn.Linear(84, 10)) def forward(self, inputs): x = self.features(inputs) if self.num_classes > 0: - x = fluid.layers.flatten(x, 1) + x = paddle.flatten(x, 1) x = self.fc(x) return x diff --git a/python/paddle/vision/models/mobilenetv1.py b/python/paddle/vision/models/mobilenetv1.py index 10defbf593dca642386e73b65094612f93dce9dc..39654122e3b33e52f4693653dbdd14d6e513228e 100644 --- a/python/paddle/vision/models/mobilenetv1.py +++ b/python/paddle/vision/models/mobilenetv1.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle.fluid as fluid -from paddle.fluid.initializer import MSRA -from paddle.fluid.param_attr import ParamAttr -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear +import paddle +import paddle.nn as nn from paddle.utils.download import get_weights_path_from_url @@ -24,85 +22,66 @@ __all__ = ['MobileNetV1', 'mobilenet_v1'] model_urls = { 'mobilenetv1_1.0': ('https://paddle-hapi.bj.bcebos.com/models/mobilenet_v1_x1.0.pdparams', - 'bf0d25cb0bed1114d9dac9384ce2b4a6') + '42a154c2f26f86e7457d6daded114e8c') } -class ConvBNLayer(fluid.dygraph.Layer): +class ConvBNLayer(nn.Layer): def __init__(self, - num_channels, - filter_size, - num_filters, + in_channels, + out_channels, + kernel_size, stride, padding, - channels=None, - num_groups=1, - act='relu', - use_cudnn=True, - name=None): + num_groups=1): super(ConvBNLayer, self).__init__() - self._conv = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, + self._conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, stride=stride, padding=padding, groups=num_groups, - act=None, - use_cudnn=use_cudnn, - param_attr=ParamAttr( - initializer=MSRA(), name=self.full_name() + "_weights"), bias_attr=False) - self._batch_norm = BatchNorm( - num_filters, - act=act, - param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"), - bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"), - moving_mean_name=self.full_name() + "_bn" + '_mean', - moving_variance_name=self.full_name() + "_bn" + '_variance') + self._norm_layer = nn.BatchNorm2d(out_channels) + self._act = nn.ReLU() - def forward(self, inputs): - y = self._conv(inputs) - y = self._batch_norm(y) - return y + def forward(self, x): + x = self._conv(x) + x = self._norm_layer(x) + x = self._act(x) + return x -class DepthwiseSeparable(fluid.dygraph.Layer): - def __init__(self, - num_channels, - num_filters1, - num_filters2, - num_groups, - stride, - scale, - name=None): +class DepthwiseSeparable(nn.Layer): + def __init__(self, in_channels, out_channels1, out_channels2, num_groups, + stride, scale): super(DepthwiseSeparable, self).__init__() self._depthwise_conv = ConvBNLayer( - num_channels=num_channels, - num_filters=int(num_filters1 * scale), - filter_size=3, + in_channels, + int(out_channels1 * scale), + kernel_size=3, stride=stride, padding=1, - num_groups=int(num_groups * scale), - use_cudnn=False) + num_groups=int(num_groups * scale)) self._pointwise_conv = ConvBNLayer( - num_channels=int(num_filters1 * scale), - filter_size=1, - num_filters=int(num_filters2 * scale), + int(out_channels1 * scale), + int(out_channels2 * scale), + kernel_size=1, stride=1, padding=0) - def forward(self, inputs): - y = self._depthwise_conv(inputs) - y = self._pointwise_conv(y) - return y + def forward(self, x): + x = self._depthwise_conv(x) + x = self._pointwise_conv(x) + return x -class MobileNetV1(fluid.dygraph.Layer): +class MobileNetV1(nn.Layer): """MobileNetV1 model from `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" `_. @@ -111,7 +90,6 @@ class MobileNetV1(fluid.dygraph.Layer): num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer will not be defined. Default: 1000. with_pool (bool): use pool before the last fc layer or not. Default: True. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. Examples: .. 
code-block:: python @@ -121,11 +99,7 @@ class MobileNetV1(fluid.dygraph.Layer): model = MobileNetV1() """ - def __init__(self, - scale=1.0, - num_classes=1000, - with_pool=True, - classifier_activation='softmax'): + def __init__(self, scale=1.0, num_classes=1000, with_pool=True): super(MobileNetV1, self).__init__() self.scale = scale self.dwsl = [] @@ -133,18 +107,17 @@ class MobileNetV1(fluid.dygraph.Layer): self.with_pool = with_pool self.conv1 = ConvBNLayer( - num_channels=3, - filter_size=3, - channels=3, - num_filters=int(32 * scale), + in_channels=3, + out_channels=int(32 * scale), + kernel_size=3, stride=2, padding=1) dws21 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(32 * scale), - num_filters1=32, - num_filters2=64, + in_channels=int(32 * scale), + out_channels1=32, + out_channels2=64, num_groups=32, stride=1, scale=scale), @@ -153,9 +126,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws22 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(64 * scale), - num_filters1=64, - num_filters2=128, + in_channels=int(64 * scale), + out_channels1=64, + out_channels2=128, num_groups=64, stride=2, scale=scale), @@ -164,9 +137,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws31 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(128 * scale), - num_filters1=128, - num_filters2=128, + in_channels=int(128 * scale), + out_channels1=128, + out_channels2=128, num_groups=128, stride=1, scale=scale), @@ -175,9 +148,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws32 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(128 * scale), - num_filters1=128, - num_filters2=256, + in_channels=int(128 * scale), + out_channels1=128, + out_channels2=256, num_groups=128, stride=2, scale=scale), @@ -186,9 +159,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws41 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(256 * scale), - num_filters1=256, - num_filters2=256, + in_channels=int(256 * scale), + out_channels1=256, + out_channels2=256, num_groups=256, stride=1, scale=scale), @@ -197,9 +170,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws42 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(256 * scale), - num_filters1=256, - num_filters2=512, + in_channels=int(256 * scale), + out_channels1=256, + out_channels2=512, num_groups=256, stride=2, scale=scale), @@ -209,9 +182,9 @@ class MobileNetV1(fluid.dygraph.Layer): for i in range(5): tmp = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(512 * scale), - num_filters1=512, - num_filters2=512, + in_channels=int(512 * scale), + out_channels1=512, + out_channels2=512, num_groups=512, stride=1, scale=scale), @@ -220,9 +193,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws56 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(512 * scale), - num_filters1=512, - num_filters2=1024, + in_channels=int(512 * scale), + out_channels1=512, + out_channels2=1024, num_groups=512, stride=2, scale=scale), @@ -231,9 +204,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws6 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(1024 * scale), - num_filters1=1024, - num_filters2=1024, + in_channels=int(1024 * scale), + out_channels1=1024, + out_channels2=1024, num_groups=1024, stride=1, scale=scale), @@ -241,29 +214,23 @@ class MobileNetV1(fluid.dygraph.Layer): self.dwsl.append(dws6) if with_pool: - self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True) - - if num_classes > -1: - self.out = Linear( - int(1024 * scale), - 
num_classes, - act=classifier_activation, - param_attr=ParamAttr( - initializer=MSRA(), name=self.full_name() + "fc7_weights"), - bias_attr=ParamAttr(name="fc7_offset")) - - def forward(self, inputs): - y = self.conv1(inputs) + self.pool2d_avg = nn.AdaptiveAvgPool2d(1) + + if num_classes > 0: + self.fc = nn.Linear(int(1024 * scale), num_classes) + + def forward(self, x): + x = self.conv1(x) for dws in self.dwsl: - y = dws(y) + x = dws(x) if self.with_pool: - y = self.pool2d_avg(y) + x = self.pool2d_avg(x) if self.num_classes > 0: - y = fluid.layers.reshape(y, shape=[-1, 1024]) - y = self.out(y) - return y + x = paddle.flatten(x, 1) + x = self.fc(x) + return x def _mobilenet(arch, pretrained=False, **kwargs): @@ -275,7 +242,7 @@ def _mobilenet(arch, pretrained=False, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - param, _ = fluid.load_dygraph(weight_path) + param, _ = paddle.load(weight_path) model.load_dict(param) return model diff --git a/python/paddle/vision/models/mobilenetv2.py b/python/paddle/vision/models/mobilenetv2.py index c08fb88f8bdb234fec99ed139aa7eb6249965c79..bab8b7b2b1b93bb17612843bf0032ee278c3e93f 100644 --- a/python/paddle/vision/models/mobilenetv2.py +++ b/python/paddle/vision/models/mobilenetv2.py @@ -14,9 +14,9 @@ import numpy as np import paddle -import paddle.fluid as fluid -from paddle.fluid.param_attr import ParamAttr -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear + +import paddle.nn as nn +import paddle.nn.functional as F from paddle.utils.download import get_weights_path_from_url @@ -25,221 +25,166 @@ __all__ = ['MobileNetV2', 'mobilenet_v2'] model_urls = { 'mobilenetv2_1.0': ('https://paddle-hapi.bj.bcebos.com/models/mobilenet_v2_x1.0.pdparams', - '8ff74f291f72533f2a7956a4efff9d88') + '0340af0a901346c8d46f4529882fb63d') } -class ConvBNLayer(fluid.dygraph.Layer): - def __init__(self, - num_channels, - filter_size, - num_filters, - stride, - padding, - channels=None, - num_groups=1, - use_cudnn=True): - super(ConvBNLayer, self).__init__() - - tmp_param = ParamAttr(name=self.full_name() + "_weights") - self._conv = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=padding, - groups=num_groups, - act=None, - use_cudnn=use_cudnn, - param_attr=tmp_param, - bias_attr=False) - - self._batch_norm = BatchNorm( - num_filters, - param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"), - bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"), - moving_mean_name=self.full_name() + "_bn" + '_mean', - moving_variance_name=self.full_name() + "_bn" + '_variance') - - def forward(self, inputs, if_act=True): - y = self._conv(inputs) - y = self._batch_norm(y) - if if_act: - y = fluid.layers.relu6(y) - return y - - -class InvertedResidualUnit(fluid.dygraph.Layer): - def __init__( - self, - num_channels, - num_in_filter, - num_filters, - stride, - filter_size, - padding, - expansion_factor, ): - super(InvertedResidualUnit, self).__init__() - num_expfilter = int(round(num_in_filter * expansion_factor)) - self._expand_conv = ConvBNLayer( - num_channels=num_channels, - num_filters=num_expfilter, - filter_size=1, - stride=1, - padding=0, - num_groups=1) - - self._bottleneck_conv = ConvBNLayer( - num_channels=num_expfilter, - num_filters=num_expfilter, - filter_size=filter_size, - stride=stride, - padding=padding, - num_groups=num_expfilter, - use_cudnn=False) - - self._linear_conv = ConvBNLayer( - 
num_channels=num_expfilter, - num_filters=num_filters, - filter_size=1, - stride=1, - padding=0, - num_groups=1) - - def forward(self, inputs, ifshortcut): - y = self._expand_conv(inputs, if_act=True) - y = self._bottleneck_conv(y, if_act=True) - y = self._linear_conv(y, if_act=False) - if ifshortcut: - y = fluid.layers.elementwise_add(inputs, y) - return y - - -class InvresiBlocks(fluid.dygraph.Layer): - def __init__(self, in_c, t, c, n, s): - super(InvresiBlocks, self).__init__() - - self._first_block = InvertedResidualUnit( - num_channels=in_c, - num_in_filter=in_c, - num_filters=c, - stride=s, - filter_size=3, - padding=1, - expansion_factor=t) - - self._inv_blocks = [] - for i in range(1, n): - tmp = self.add_sublayer( - sublayer=InvertedResidualUnit( - num_channels=c, - num_in_filter=c, - num_filters=c, - stride=1, - filter_size=3, - padding=1, - expansion_factor=t), - name=self.full_name() + "_" + str(i + 1)) - self._inv_blocks.append(tmp) - - def forward(self, inputs): - y = self._first_block(inputs, ifshortcut=False) - for inv_block in self._inv_blocks: - y = inv_block(y, ifshortcut=True) - return y - - -class MobileNetV2(fluid.dygraph.Layer): - """MobileNetV2 model from - `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" `_. - - Args: - scale (float): scale of channels in each layer. Default: 1.0. - num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer - will not be defined. Default: 1000. - with_pool (bool): use pool before the last fc layer or not. Default: True. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. - - Examples: - .. code-block:: python +def _make_divisible(v, divisor, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - from paddle.vision.models import MobileNetV2 + if new_v < 0.9 * v: + new_v += divisor + return new_v - model = MobileNetV2() - """ +class ConvBNReLU(nn.Sequential): + def __init__(self, + in_planes, + out_planes, + kernel_size=3, + stride=1, + groups=1, + norm_layer=nn.BatchNorm2d): + padding = (kernel_size - 1) // 2 + + super(ConvBNReLU, self).__init__( + nn.Conv2d( + in_planes, + out_planes, + kernel_size, + stride, + padding, + groups=groups, + bias_attr=False), + norm_layer(out_planes), + nn.ReLU6()) + + +class InvertedResidual(nn.Layer): def __init__(self, - scale=1.0, - num_classes=1000, - with_pool=True, - classifier_activation='softmax'): + inp, + oup, + stride, + expand_ratio, + norm_layer=nn.BatchNorm2d): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2] + + hidden_dim = int(round(inp * expand_ratio)) + self.use_res_connect = self.stride == 1 and inp == oup + + layers = [] + if expand_ratio != 1: + layers.append( + ConvBNReLU( + inp, hidden_dim, kernel_size=1, norm_layer=norm_layer)) + layers.extend([ + ConvBNReLU( + hidden_dim, + hidden_dim, + stride=stride, + groups=hidden_dim, + norm_layer=norm_layer), + nn.Conv2d( + hidden_dim, oup, 1, 1, 0, bias_attr=False), + norm_layer(oup), + ]) + self.conv = nn.Sequential(*layers) + + def forward(self, x): + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + +class MobileNetV2(nn.Layer): + def __init__(self, scale=1.0, num_classes=1000, with_pool=True): + """MobileNetV2 model from + `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" `_. + + Args: + scale (float): scale of channels in each layer. Default: 1.0. + num_classes (int): output dim of last fc layer. 
If num_classes <=0, last fc layer + will not be defined. Default: 1000. + with_pool (bool): use pool before the last fc layer or not. Default: True. + + Examples: + .. code-block:: python + + from paddle.vision.models import MobileNetV2 + + model = MobileNetV2() + """ super(MobileNetV2, self).__init__() - self.scale = scale self.num_classes = num_classes self.with_pool = with_pool + input_channel = 32 + last_channel = 1280 + + block = InvertedResidual + round_nearest = 8 + norm_layer = nn.BatchNorm2d + inverted_residual_setting = [ + [1, 16, 1, 1], + [6, 24, 2, 2], + [6, 32, 3, 2], + [6, 64, 4, 2], + [6, 96, 3, 1], + [6, 160, 3, 2], + [6, 320, 1, 1], + ] - bottleneck_params_list = [ - (1, 16, 1, 1), - (6, 24, 2, 2), - (6, 32, 3, 2), - (6, 64, 4, 2), - (6, 96, 3, 1), - (6, 160, 3, 2), - (6, 320, 1, 1), + input_channel = _make_divisible(input_channel * scale, round_nearest) + self.last_channel = _make_divisible(last_channel * max(1.0, scale), + round_nearest) + features = [ + ConvBNReLU( + 3, input_channel, stride=2, norm_layer=norm_layer) ] - self._conv1 = ConvBNLayer( - num_channels=3, - num_filters=int(32 * scale), - filter_size=3, - stride=2, - padding=1) - - self._invl = [] - i = 1 - in_c = int(32 * scale) - for layer_setting in bottleneck_params_list: - t, c, n, s = layer_setting - i += 1 - tmp = self.add_sublayer( - sublayer=InvresiBlocks( - in_c=in_c, t=t, c=int(c * scale), n=n, s=s), - name='conv' + str(i)) - self._invl.append(tmp) - in_c = int(c * scale) - - self._out_c = int(1280 * scale) if scale > 1.0 else 1280 - self._conv9 = ConvBNLayer( - num_channels=in_c, - num_filters=self._out_c, - filter_size=1, - stride=1, - padding=0) + for t, c, n, s in inverted_residual_setting: + output_channel = _make_divisible(c * scale, round_nearest) + for i in range(n): + stride = s if i == 0 else 1 + features.append( + block( + input_channel, + output_channel, + stride, + expand_ratio=t, + norm_layer=norm_layer)) + input_channel = output_channel + + features.append( + ConvBNReLU( + input_channel, + self.last_channel, + kernel_size=1, + norm_layer=norm_layer)) + + self.features = nn.Sequential(*features) if with_pool: - self._pool2d_avg = Pool2D(pool_type='avg', global_pooling=True) - - if num_classes > 0: - tmp_param = ParamAttr(name=self.full_name() + "fc10_weights") - self._fc = Linear( - self._out_c, - num_classes, - act=classifier_activation, - param_attr=tmp_param, - bias_attr=ParamAttr(name="fc10_offset")) - - def forward(self, inputs): - y = self._conv1(inputs, if_act=True) - for inv in self._invl: - y = inv(y) - y = self._conv9(y, if_act=True) + self.pool2d_avg = nn.AdaptiveAvgPool2d(1) + + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Dropout(0.2), nn.Linear(self.last_channel, num_classes)) + + def forward(self, x): + x = self.features(x) if self.with_pool: - y = self._pool2d_avg(y) + x = self.pool2d_avg(x) + if self.num_classes > 0: - y = fluid.layers.reshape(y, shape=[-1, self._out_c]) - y = self._fc(y) - return y + x = paddle.flatten(x, 1) + x = self.classifier(x) + return x def _mobilenet(arch, pretrained=False, **kwargs): @@ -251,7 +196,7 @@ def _mobilenet(arch, pretrained=False, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - param, _ = fluid.load_dygraph(weight_path) + param, _ = paddle.load(weight_path) model.load_dict(param) return model diff --git a/python/paddle/vision/models/resnet.py b/python/paddle/vision/models/resnet.py index 
da0c3e9eb3f67f0aad67cdef3c5527cb2275e844..f9e00aefd6bb2b6d0b7bf75055cc735c6651a52d 100644 --- a/python/paddle/vision/models/resnet.py +++ b/python/paddle/vision/models/resnet.py @@ -15,11 +15,8 @@ from __future__ import division from __future__ import print_function -import math -import paddle.fluid as fluid - -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear -from paddle.fluid.dygraph.container import Sequential +import paddle +import paddle.nn as nn from paddle.utils.download import get_weights_path_from_url @@ -29,143 +26,129 @@ __all__ = [ model_urls = { 'resnet18': ('https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams', - '0ba53eea9bc970962d0ef96f7b94057e'), + 'cf548f46534aa3560945be4b95cd11c4'), 'resnet34': ('https://paddle-hapi.bj.bcebos.com/models/resnet34.pdparams', - '46bc9f7c3dd2e55b7866285bee91eff3'), + '8d2275cf8706028345f78ac0e1d31969'), 'resnet50': ('https://paddle-hapi.bj.bcebos.com/models/resnet50.pdparams', - '5ce890a9ad386df17cf7fe2313dca0a1'), + 'ca6f485ee1ab0492d38f323885b0ad80'), 'resnet101': ('https://paddle-hapi.bj.bcebos.com/models/resnet101.pdparams', - 'fb07a451df331e4b0bb861ed97c3a9b9'), + '02f35f034ca3858e1e54d4036443c92d'), 'resnet152': ('https://paddle-hapi.bj.bcebos.com/models/resnet152.pdparams', - 'f9c700f26d3644bb76ad2226ed5f5713'), + '7ad16a2f1e7333859ff986138630fd7a'), } -class ConvBNLayer(fluid.dygraph.Layer): +class BasicBlock(nn.Layer): + expansion = 1 + def __init__(self, - num_channels, - num_filters, - filter_size, + inplanes, + planes, stride=1, + downsample=None, groups=1, - act=None): - super(ConvBNLayer, self).__init__() - - self._conv = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=groups, - act=None, - bias_attr=False) - - self._batch_norm = BatchNorm(num_filters, act=act) - - def forward(self, inputs): - x = self._conv(inputs) - x = self._batch_norm(x) - - return x - - -class BasicBlock(fluid.dygraph.Layer): - """residual block of resnet18 and resnet34 - """ - expansion = 1 - - def __init__(self, num_channels, num_filters, stride, shortcut=True): + base_width=64, + dilation=1, + norm_layer=None): super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d - self.conv0 = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=3, - act='relu') - self.conv1 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters, - filter_size=3, - stride=stride, - act='relu') + if dilation > 1: + raise NotImplementedError( + "Dilation > 1 not supported in BasicBlock") - if not shortcut: - self.short = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=1, - stride=stride) + self.conv1 = nn.Conv2d( + inplanes, planes, 3, padding=1, stride=stride, bias_attr=False) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias_attr=False) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride - self.shortcut = shortcut + def forward(self, x): + identity = x - def forward(self, inputs): - y = self.conv0(inputs) - conv1 = self.conv1(y) + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) - if self.shortcut: - short = inputs - else: - short = self.short(inputs) + out = self.conv2(out) + out = self.bn2(out) - y = short + conv1 + if self.downsample is not None: + identity = self.downsample(x) - return fluid.layers.relu(y) + out += identity + 
out = self.relu(out) + return out -class BottleneckBlock(fluid.dygraph.Layer): - """residual block of resnet50, resnet101 amd resnet152 - """ + +class BottleneckBlock(nn.Layer): expansion = 4 - def __init__(self, num_channels, num_filters, stride, shortcut=True): + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None): super(BottleneckBlock, self).__init__() - - self.conv0 = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=1, - act='relu') - self.conv1 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters, - filter_size=3, + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.)) * groups + + self.conv1 = nn.Conv2d(inplanes, width, 1, bias_attr=False) + self.bn1 = norm_layer(width) + + self.conv2 = nn.Conv2d( + width, + width, + 3, + padding=dilation, stride=stride, - act='relu') - self.conv2 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters * self.expansion, - filter_size=1, - act=None) + groups=groups, + dilation=dilation, + bias_attr=False) + self.bn2 = norm_layer(width) - if not shortcut: - self.short = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters * self.expansion, - filter_size=1, - stride=stride) + self.conv3 = nn.Conv2d( + width, planes * self.expansion, 1, bias_attr=False) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride - self.shortcut = shortcut + def forward(self, x): + identity = x - self._num_channels_out = num_filters * self.expansion + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) - def forward(self, inputs): - x = self.conv0(inputs) - conv1 = self.conv1(x) - conv2 = self.conv2(conv1) + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) - if self.shortcut: - short = inputs - else: - short = self.short(inputs) + out = self.conv3(out) + out = self.bn3(out) - x = fluid.layers.elementwise_add(x=short, y=conv2) + if self.downsample is not None: + identity = self.downsample(x) - return fluid.layers.relu(x) + out += identity + out = self.relu(out) + return out -class ResNet(fluid.dygraph.Layer): + +class ResNet(nn.Layer): """ResNet model from `"Deep Residual Learning for Image Recognition" `_ @@ -175,7 +158,6 @@ class ResNet(fluid.dygraph.Layer): num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer will not be defined. Default: 1000. with_pool (bool): use pool before the last fc layer or not. Default: True. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. Examples: .. 
code-block:: python @@ -189,82 +171,87 @@ class ResNet(fluid.dygraph.Layer): """ - def __init__(self, - Block, - depth=50, - num_classes=1000, - with_pool=True, - classifier_activation='softmax'): + def __init__(self, block, depth, num_classes=1000, with_pool=True): super(ResNet, self).__init__() - - self.num_classes = num_classes - self.with_pool = with_pool - - layer_config = { + layer_cfg = { 18: [2, 2, 2, 2], 34: [3, 4, 6, 3], 50: [3, 4, 6, 3], 101: [3, 4, 23, 3], - 152: [3, 8, 36, 3], + 152: [3, 8, 36, 3] } - assert depth in layer_config.keys(), \ - "supported depth are {} but input layer is {}".format( - layer_config.keys(), depth) - - layers = layer_config[depth] - - in_channels = 64 - out_channels = [64, 128, 256, 512] - - self.conv = ConvBNLayer( - num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu') - self.pool = Pool2D( - pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') - - self.layers = [] - for idx, num_blocks in enumerate(layers): - blocks = [] - shortcut = False - for b in range(num_blocks): - if b == 1: - in_channels = out_channels[idx] * Block.expansion - block = Block( - num_channels=in_channels, - num_filters=out_channels[idx], - stride=2 if b == 0 and idx != 0 else 1, - shortcut=shortcut) - blocks.append(block) - shortcut = True - layer = self.add_sublayer("layer_{}".format(idx), - Sequential(*blocks)) - self.layers.append(layer) + layers = layer_cfg[depth] + self.num_classes = num_classes + self.with_pool = with_pool + self._norm_layer = nn.BatchNorm2d + + self.inplanes = 64 + self.dilation = 1 + self.conv1 = nn.Conv2d( + 3, + self.inplanes, + kernel_size=7, + stride=2, + padding=3, + bias_attr=False) + self.bn1 = self._norm_layer(self.inplanes) + self.relu = nn.ReLU() + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) if with_pool: - self.global_pool = Pool2D( - pool_size=7, pool_type='avg', global_pooling=True) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) if num_classes > 0: - stdv = 1.0 / math.sqrt(out_channels[-1] * Block.expansion * 1.0) - self.fc_input_dim = out_channels[-1] * Block.expansion * 1 * 1 - self.fc = Linear( - self.fc_input_dim, - num_classes, - act=classifier_activation, - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv))) - - def forward(self, inputs): - x = self.conv(inputs) - x = self.pool(x) - for layer in self.layers: - x = layer(x) - - if self.with_pool: - x = self.global_pool(x) - - if self.num_classes > -1: - x = fluid.layers.reshape(x, shape=[-1, self.fc_input_dim]) + self.fc = nn.Linear(512 * block.expansion, num_classes) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.inplanes, + planes * block.expansion, + 1, + stride=stride, + bias_attr=False), + norm_layer(planes * block.expansion), ) + + layers = [] + layers.append( + block(self.inplanes, planes, stride, downsample, 1, 64, + previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, 
norm_layer=norm_layer)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + if self.with_pool > 0: + x = self.avgpool(x) + + if self.num_classes > 0: + x = paddle.flatten(x, 1) x = self.fc(x) + return x @@ -277,7 +264,7 @@ def _resnet(arch, Block, depth, pretrained, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - param, _ = fluid.load_dygraph(weight_path) + param, _ = paddle.load(weight_path) model.set_dict(param) return model diff --git a/python/paddle/vision/models/vgg.py b/python/paddle/vision/models/vgg.py index 8bfacda2476d0e24e549513b379181bf47e40d45..d11845b6616267d1cdfff197cc2c4a25a62c7d9e 100644 --- a/python/paddle/vision/models/vgg.py +++ b/python/paddle/vision/models/vgg.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.fluid as fluid -from paddle.nn import Conv2d, Pool2D, BatchNorm, Linear, ReLU, Softmax -from paddle.fluid.dygraph.container import Sequential +import paddle +import paddle.nn as nn from paddle.utils.download import get_weights_path_from_url @@ -28,39 +27,18 @@ __all__ = [ model_urls = { 'vgg16': ('https://paddle-hapi.bj.bcebos.com/models/vgg16.pdparams', - 'c788f453a3b999063e8da043456281ee') + '89bbffc0f87d260be9b8cdc169c991c4') } -class Classifier(fluid.dygraph.Layer): - def __init__(self, num_classes, classifier_activation='softmax'): - super(Classifier, self).__init__() - self.linear1 = Linear(512 * 7 * 7, 4096) - self.linear2 = Linear(4096, 4096) - self.linear3 = Linear(4096, num_classes) - self.act = Softmax() #Todo: accept any activation - - def forward(self, x): - x = self.linear1(x) - x = fluid.layers.relu(x) - x = fluid.layers.dropout(x, 0.5) - x = self.linear2(x) - x = fluid.layers.relu(x) - x = fluid.layers.dropout(x, 0.5) - x = self.linear3(x) - out = self.act(x) - return out - - -class VGG(fluid.dygraph.Layer): +class VGG(nn.Layer): """VGG model from `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_ Args: - features (fluid.dygraph.Layer): vgg features create by function make_layers. + features (nn.Layer): vgg features create by function make_layers. num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer will not be defined. Default: 1000. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. Examples: .. 
code-block:: python @@ -76,44 +54,41 @@ class VGG(fluid.dygraph.Layer): """ - def __init__(self, - features, - num_classes=1000, - classifier_activation='softmax'): + def __init__(self, features, num_classes=1000): super(VGG, self).__init__() self.features = features - self.num_classes = num_classes - - if num_classes > 0: - classifier = Classifier(num_classes, classifier_activation) - self.classifier = self.add_sublayer("classifier", - Sequential(classifier)) + self.avgpool = nn.AdaptiveAvgPool2d((7, 7)) + self.classifier = nn.Sequential( + nn.Linear(512 * 7 * 7, 4096), + nn.ReLU(), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(), + nn.Dropout(), + nn.Linear(4096, num_classes), ) def forward(self, x): x = self.features(x) - - if self.num_classes > 0: - x = fluid.layers.flatten(x, 1) - x = self.classifier(x) + x = self.avgpool(x) + x = paddle.flatten(x, 1) + x = self.classifier(x) return x def make_layers(cfg, batch_norm=False): layers = [] in_channels = 3 - for v in cfg: if v == 'M': - layers += [Pool2D(pool_size=2, pool_stride=2)] + layers += [nn.MaxPool2d(kernel_size=2, stride=2)] else: + conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) if batch_norm: - conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1) - layers += [conv2d, BatchNorm(v), ReLU()] + layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU()] else: - conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1) - layers += [conv2d, ReLU()] + layers += [conv2d, nn.ReLU()] in_channels = v - return Sequential(*layers) + return nn.Sequential(*layers) cfgs = { @@ -144,7 +119,7 @@ def _vgg(arch, cfg, batch_norm, pretrained, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - param, _ = fluid.load_dygraph(weight_path) + param, _ = paddle.load(weight_path) model.load_dict(param) return model diff --git a/tools/is_ut_disabled.py b/tools/is_ut_disabled.py deleted file mode 100644 index a21fe39e71e516cf14d1eda970d2deba986ef8a3..0000000000000000000000000000000000000000 --- a/tools/is_ut_disabled.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Check whether ut is disabled. """ - -import os -import sys - - -def check_ut(): - """ Get disabled unit tests. 
""" - disable_ut_file = 'disable_ut' - cmd = 'wget -q --no-check-certificate https://sys-p0.bj.bcebos.com/prec/{}'.format( - disable_ut_file) - os.system(cmd) - with open(disable_ut_file) as utfile: - for u in utfile: - if u.rstrip('\r\n') == sys.argv[1]: - exit(0) - exit(1) - - -if __name__ == '__main__': - if len(sys.argv) != 2: - exit(1) - try: - check_ut() - except Exception as e: - print(e) - exit(1) diff --git a/tools/wlist.json b/tools/wlist.json index 0ed0b4e40698ce26fbddb7e5a421143749b3a3ef..3ca14cd1dd6f964f87031e31128bfb2cb1c733a1 100644 --- a/tools/wlist.json +++ b/tools/wlist.json @@ -251,9 +251,10 @@ "BilinearTensorProduct", "GroupNorm", "SpectralNorm", - "TreeConv", + "TreeConv" + ], + "wlist_temp":[ "prroi_pool", - "to_tensor", "ChunkEvaluator", "EditDistance", "ErrorClipByValue", @@ -406,7 +407,9 @@ "TransformerDecoder.prepare_incremental_cache", "LinearChainCRF.forward", "CRFDecoding.forward", - "SequenceTagging.forward" + "SequenceTagging.forward", + "XPUPlace", + "is_compiled_with_xpu" ], "gpu_not_white":[ "deformable_conv",