Commit 36cecf61 authored by: S Shang Zhizhou

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_test_activation_op_random_bug
......@@ -18,7 +18,7 @@ SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc.git)
set(WARPCTC_TAG fc7f226b93758216a03b1be9d24593a12819b984)
set(WARPCTC_TAG 95a461eddeabd51099ef059dcfada1117eb1bfb8)
SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
CACHE PATH "Warp-ctc Directory" FORCE)
......@@ -44,8 +44,9 @@ ExternalProject_Add(
"${WARPCTC_DOWNLOAD_CMD}"
PREFIX ${WARPCTC_PREFIX_DIR}
SOURCE_DIR ${WARPCTC_SOURCE_DIR}
UPDATE_COMMAND ""
#UPDATE_COMMAND ""
PATCH_COMMAND ""
BUILD_ALWAYS 1
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
......
......@@ -4,7 +4,7 @@ endif()
INCLUDE(ExternalProject)
SET(XPU_PROJECT "extern_xpu")
SET(XPU_URL "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE)
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu.tar.gz" CACHE STRING "" FORCE)
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
......
......@@ -62,9 +62,9 @@ function(op_library TARGET)
endif()
endif()
if(WITH_XPU)
string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc)
list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc)
string(REPLACE "_op" "_op_xpu" XPU_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU_FILE}.cc)
list(APPEND xpu_cc_srcs ${XPU_FILE}.cc)
endif()
endif()
else()
......@@ -83,7 +83,7 @@ function(op_library TARGET)
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$")
elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
list(APPEND xpu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
......
......@@ -76,7 +76,7 @@ void AllReduceOpHandle::AllReduceImpl(
platform::errors::InvalidArgument(
"The NoDummyInputSize should be equal "
"to the number of places, but got NoDummyInputSize is "
"%d and the number of place is %d.",
"%d and the number of places is %d.",
in_var_handles.size(), num_places));
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
......@@ -89,7 +89,7 @@ void AllReduceOpHandle::AllReduceImpl(
platform::errors::InvalidArgument(
"The number of local scopes should be equal "
"to the number of places, but got the number of local scopes is "
"%d and the number of place is %d.",
"%d and the number of places is %d.",
in_var_handles.size(), num_places));
std::vector<const void *> lod_tensor_data;
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -31,10 +32,15 @@ void BroadcastOpHandle::RunImpl() {
auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one.");
PADDLE_ENFORCE_EQ(
out_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
platform::errors::PreconditionNotMet(
"The number of inputs should be 1, but got %d.",
in_var_handles.size()));
PADDLE_ENFORCE_EQ(out_var_handles.size(), places_.size(),
platform::errors::PreconditionNotMet(
"The number of outputs and the number of places should "
"be equal, but got the number of outputs is %d and the "
"number of places is %d.",
out_var_handles.size(), places_.size()));
VarHandle *in_var_handle = in_var_handles[0];
......@@ -47,7 +53,9 @@ void BroadcastOpHandle::BroadcastOneVar(
const std::vector<Scope *> &var_scopes) {
auto *in_var =
var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name());
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scopes.",
in_var_handle.name()));
Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
if (UNLIKELY(!in_tensor.IsInitialized())) {
VLOG(3) << "in var " << in_var_handle.name() << "not inited, return!";
......@@ -103,7 +111,7 @@ void BroadcastOpHandle::BroadcastOneVar(
broadcast_calls.emplace_back(
[send_recv_buffer, numel, type, root_id, &nccl_ctx] {
PADDLE_ENFORCE(platform::dynload::ncclBcast(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
send_recv_buffer, numel, static_cast<ncclDataType_t>(type),
root_id, nccl_ctx.comm_, nccl_ctx.stream()));
});
......@@ -131,7 +139,8 @@ void BroadcastOpHandle::BroadcastOneVar(
nccl_ctxs_->DevCtx(p)->Wait();
}
#else
PADDLE_THROW("CUDA is not enabled.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif
}
}
......@@ -154,10 +163,13 @@ void BroadcastOpHandle::InitOutputValue(
auto t_out_p = out_var_handle->place();
auto *out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name());
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound(
"Variable %s is not found in scopes.",
out_var_handle->name()));
if (is_gpu_place(in_tensor.place())) {
PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
"Places of input and output must be all on GPU.");
PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
} else {
t_out_p = platform::CPUPlace();
}
......
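For reference, a minimal sketch of the error-reporting pattern this commit migrates to: each check pairs a comparison macro (PADDLE_ENFORCE_EQ, PADDLE_ENFORCE_NOT_NULL, ...) with a typed platform::errors::* payload and a formatted message carrying the offending values. The function and argument names below are illustrative only, not taken from the diff.

    #include "paddle/fluid/platform/enforce.h"

    // Illustrative check in the new style: typed error category plus a
    // message that reports both the expected and the actual values.
    void CheckHandleCounts(size_t num_inputs, size_t num_places, const void* var) {
      PADDLE_ENFORCE_EQ(num_inputs, num_places,
                        paddle::platform::errors::InvalidArgument(
                            "The number of inputs should be equal to the "
                            "number of places, but got %zu inputs and %zu "
                            "places.",
                            num_inputs, num_places));
      PADDLE_ENFORCE_NOT_NULL(
          var, paddle::platform::errors::NotFound(
                   "The variable is not found in the scope."));
    }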
......@@ -79,7 +79,8 @@ struct TestBroadcastOpHandle {
}
nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
#else
PADDLE_THROW("CUDA is not support.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif
} else {
int count = 8;
......@@ -113,7 +114,8 @@ struct TestBroadcastOpHandle {
op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
place_list_, nccl_ctxs_.get());
#else
PADDLE_THROW("CUDA is not support.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif
} else {
#if defined(PADDLE_WITH_NCCL)
......@@ -171,7 +173,9 @@ struct TestBroadcastOpHandle {
float val_scalar = 0.0) {
auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto lod_tensor = var->GetMutable<f::LoDTensor>();
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) {
......@@ -194,7 +198,9 @@ struct TestBroadcastOpHandle {
}
auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto selected_rows = var->GetMutable<f::SelectedRows>();
auto value = selected_rows->mutable_value();
value->mutable_data<float>(kDims, place_list_[input_scope_idx]);
......@@ -211,13 +217,24 @@ struct TestBroadcastOpHandle {
const std::vector<float>& send_vector,
const std::vector<int64_t>& rows, int height) {
auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto& selected_rows = var->Get<f::SelectedRows>();
auto rt = selected_rows.value();
PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal.");
PADDLE_ENFORCE_EQ(selected_rows.height(), height,
platform::errors::InvalidArgument(
"The height of SelectedRows is not equal to "
"the expected, expect %d, but got %ld.",
height, selected_rows.height()));
for (size_t k = 0; k < selected_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]);
PADDLE_ENFORCE_EQ(
selected_rows.rows()[k], rows[k],
platform::errors::InvalidArgument(
"The item at position %zu of rows of SelectedRows "
"is not equal to the expected, expect %ld, but got %ld.",
k, rows[k], selected_rows.rows()[k]));
}
p::CPUPlace cpu_place;
......@@ -235,9 +252,15 @@ struct TestBroadcastOpHandle {
framework::Scope* scope) {
p::CPUPlace cpu_place;
auto var = scope->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto tensor = var->Get<f::LoDTensor>();
PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal.");
PADDLE_ENFORCE_EQ(tensor.lod(), lod,
platform::errors::InvalidArgument(
"The LoD of tensor is not equal to "
"the expected, expect %s, but got %s.",
lod, tensor.lod()));
f::Tensor result_tensor;
f::TensorCopySync(tensor, cpu_place, &result_tensor);
float* ct = result_tensor.mutable_data<float>(cpu_place);
......
......@@ -235,7 +235,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("reduce_mode_multi_devices_pass").get();
break;
default:
PADDLE_THROW("Unknown reduce strategy.");
PADDLE_THROW(
platform::errors::Unimplemented("Unknown reduce strategy."));
}
}
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
......
......@@ -12,11 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include <memory>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
......@@ -47,15 +48,19 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
platform::CUDADeviceGuard guard(
BOOST_GET_CONST(platform::CUDAPlace, place).device);
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
PADDLE_ENFORCE_NOT_NULL(event_);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument(
"The cuda envet created is NULL."));
}
}
#endif
PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument(
"Variable names are empty."));
PADDLE_ENFORCE_NE(vars.empty(), true,
platform::errors::InvalidArgument(
"The variables to be deleted are empty."));
for (auto *var : var_infos_) {
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument(
"The memory optimization info is NULL."));
}
}
......@@ -64,7 +69,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
if (event_) {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace());
platform::CUDADeviceGuard guard(gpu_place.device);
PADDLE_ENFORCE(cudaEventDestroy(event_));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
}
#endif
}
......@@ -78,12 +83,17 @@ void EagerDeletionOpHandle::InitCUDA() {
}
void EagerDeletionOpHandle::CallOnce() {
PADDLE_ENFORCE(vars_.empty(), "vars_ must be initialized here");
PADDLE_ENFORCE_EQ(
vars_.empty(), true,
platform::errors::InvalidArgument(
"The variables to be deleted should be initialized here."));
Scope *exec_scope = local_exec_scopes_[0];
for (auto *var_info : var_infos_) {
auto *var = exec_scope->FindVar(var_info->Name());
PADDLE_ENFORCE_NOT_NULL(var, "Variable %s should not be nullptr",
var_info->Name());
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound(
"The variable(%s) to be inplaced is not found in scope.",
var_info->Name()));
vars_.emplace_back(var);
}
}
......@@ -119,8 +129,9 @@ void EagerDeletionOpHandle::RunImpl() {
garbages.emplace_back(t.MoveMemoryHolder());
}
} else {
PADDLE_THROW("Type %s of %s is not supported eager deletion",
framework::ToTypeName(var->Type()), var_info->Name());
PADDLE_THROW(platform::errors::Unimplemented(
"The variable(%s) of type %s is not supported in eager deletion.",
framework::ToTypeName(var->Type()), var_info->Name()));
}
}
......@@ -137,8 +148,9 @@ void EagerDeletionOpHandle::ClearGarbages(
auto callback_stream =
reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
auto callback_func = [=]() {
PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(callback_stream, event_, 0));
};
gc_->Add(std::move(*garbages), callback_func);
} else {
......
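The ClearGarbages change above keeps the same stream-ordering trick, only wrapped in PADDLE_ENFORCE_CUDA_SUCCESS: an event is recorded on the compute stream and the garbage-collection stream waits on it, so buffers are not freed while kernels may still be using them. A standalone sketch using only the CUDA runtime API (error checking omitted for brevity):

    #include <cuda_runtime.h>

    // Make callback_stream wait until all work currently queued on
    // compute_stream has finished before it runs the deallocation work.
    void OrderFreeAfterCompute(cudaStream_t compute_stream,
                               cudaStream_t callback_stream) {
      cudaEvent_t event;
      cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
      cudaEventRecord(event, compute_stream);
      cudaStreamWaitEvent(callback_stream, event, 0);
      // ... enqueue the actual frees on callback_stream here ...
      cudaEventDestroy(event);  // safe: destruction is deferred until complete
    }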
......@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
#include <algorithm>
#include <utility>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
......@@ -56,10 +58,20 @@ void FusedAllReduceOpHandle::RunImpl() {
size_t place_num = places_.size();
PADDLE_ENFORCE_EQ(
in_var_handles.size(), place_num * num_of_all_reduce_,
"The NoDummyInputSize should be equal to the number of places.");
platform::errors::PreconditionNotMet(
"The number of input variable handles should be equal to the number "
"of places plus the number of all reduce handles, "
"but got the number of input variable handles is %d, the "
"number of places is %d, and the number of all reduce handles "
"is %d.",
in_var_handles.size(), place_num, num_of_all_reduce_));
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
platform::errors::PreconditionNotMet(
"The number of input variable handles should be equal to the number "
"of output variable handles, but got the number of input variable "
"handles is %d, and the number of output variable handles is %d.",
in_var_handles.size(), out_var_handles.size()));
// Note: some gradient op doesn't have CUDAKernel, so the gradients of
// those op are in CPUPlace, in this case, the all reduce should not be fused.
......@@ -106,7 +118,13 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
dtype = ele_dtype;
}
PADDLE_ENFORCE_EQ(ele_dtype, dtype);
PADDLE_ENFORCE_EQ(
ele_dtype, dtype,
platform::errors::InvalidArgument(
"The DataType of grad tensors of fused_all_reduce_op_handle "
"must be consistent. The current dtype is %s, but the "
"previous dtype is %s.",
DataTypeToString(ele_dtype), DataTypeToString(dtype)));
// Check whether the address space is contiguous.
std::sort(
......@@ -130,16 +148,29 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
"input[%d] address: 0X%02x. The offset: %d",
k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k,
next_address, k, infer_next_address, offset);
PADDLE_ENFORCE_EQ(infer_next_address, next_address,
"The address is not consistent.");
PADDLE_ENFORCE_EQ(
infer_next_address, next_address,
platform::errors::InvalidArgument(
"The infered address of the next tensor should be equal to the "
"real address of the next tensor. But got infered address is %p "
"and real address is %p.",
infer_next_address, next_address));
}
}
if (!FLAGS_skip_fused_all_reduce_check) {
for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
for (size_t j = 1; j < num_of_all_reduce_; ++j) {
PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first,
grads_tensor.at(scope_idx).at(j).first);
PADDLE_ENFORCE_EQ(
grads_tensor.at(0).at(j).first,
grads_tensor.at(scope_idx).at(j).first,
platform::errors::InvalidArgument(
"The variable name of grad tensors of "
"fused_all_reduce_op_handle "
"must be consistent. The current name is %s, but the "
"previous name is %s.",
grads_tensor.at(0).at(j).first,
grads_tensor.at(scope_idx).at(j).first));
}
}
}
......@@ -167,7 +198,9 @@ bool FusedAllReduceOpHandle::InputIsInDifferentPlace(
for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
auto var_name = in_var_handles[j]->name();
auto var = local_scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound(
"The variable '%s' is not found in local scope.", var_name));
auto &lod_tensor = var->Get<LoDTensor>();
if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
return true;
......@@ -185,14 +218,24 @@ void FusedAllReduceOpHandle::GetGradLoDTensor(
size_t place_num = places_.size();
for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
auto var_name = in_var_handles[j]->name();
PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name());
PADDLE_ENFORCE_EQ(
var_name, out_var_handles[j]->name(),
platform::errors::InvalidArgument(
"The name of input variable should be equal "
"to the name of output variable. But got the name of input "
"variable is %s and the name of output variable is %s.",
var_name, out_var_handles[j]->name()));
auto var = local_scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound(
"The variable '%s' is not found in local scope.", var_name));
auto &lod_tensor = var->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(
platform::is_same_place(lod_tensor.place(), places_.at(scope_idx)),
true, "%s(%d) is not in the right place.", var_name, scope_idx);
true, platform::errors::InvalidArgument(
"The variable '%s' at scope %d is not in the right place.",
var_name, scope_idx));
grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
}
}
......@@ -204,16 +247,26 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
size_t size_of_dtype = 0;
for (size_t i = 0; i < grad_tensor.size(); ++i) {
// Get dtype
auto ele_type = grad_tensor.at(i).second->type();
auto ele_dtype = grad_tensor.at(i).second->type();
if (i == 0) {
*dtype = ele_type;
size_of_dtype = framework::SizeOfType(ele_type);
*dtype = ele_dtype;
size_of_dtype = framework::SizeOfType(ele_dtype);
}
PADDLE_ENFORCE_EQ(ele_type, *dtype);
PADDLE_ENFORCE_EQ(
ele_dtype, *dtype,
platform::errors::InvalidArgument(
"The DataType of grad tensors of fused_all_reduce_op_handle "
"must be consistent. The current dtype is %s, but the "
"previous dtype is %s.",
DataTypeToString(ele_dtype), DataTypeToString(*dtype)));
// Get element number
int64_t len = grad_tensor.at(i).second->numel();
PADDLE_ENFORCE_GT(len, 0);
PADDLE_ENFORCE_GT(
len, 0, platform::errors::InvalidArgument(
"The size of grad tensors of fused_all_reduce_op_handle "
"must be > 0, but got %d.",
len));
*numel +=
platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
}
......
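The contiguity check in FusedAllReduceFunc above expects each gradient tensor to start exactly where the previous one ends after alignment. A standalone sketch of that arithmetic, with a hypothetical AlignUp helper standing in for platform::Alignment:

    #include <cassert>
    #include <cstdint>

    // Hypothetical stand-in for platform::Alignment(): round bytes up to align.
    inline std::size_t AlignUp(std::size_t bytes, std::size_t align) {
      return (bytes + align - 1) / align * align;
    }

    // The inferred start of the next tensor is the start of the current one
    // plus its aligned size; it must equal the real address of the next tensor.
    inline void CheckContiguous(const void* cur, std::size_t cur_bytes,
                                const void* next, std::size_t align) {
      auto infer_next =
          reinterpret_cast<std::uintptr_t>(cur) + AlignUp(cur_bytes, align);
      assert(infer_next == reinterpret_cast<std::uintptr_t>(next));
    }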
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -32,7 +33,15 @@ void FusedBroadcastOpHandle::RunImpl() {
WaitInputVarGenerated();
size_t place_num = places_.size();
PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size());
PADDLE_ENFORCE_EQ(
in_var_handles.size() * place_num, out_var_handles.size(),
platform::errors::PreconditionNotMet(
"The number of input variable handles plus the number "
"of places should be equal to the number of output variable handles, "
"but got the number of input variable handles is %d, the "
"number of places is %d, and the number of output variable handles "
"is %d.",
in_var_handles.size(), place_num, out_var_handles.size()));
for (size_t i = 0; i < in_var_handles.size(); ++i) {
BroadcastOneVar(
......
......@@ -13,8 +13,10 @@
// limitations under the License.
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include <memory>
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
......@@ -58,7 +60,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
op_handle_ = new FusedBroadcastOpHandle(
nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
#else
PADDLE_THROW("CUDA is not supported.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
} else {
#if defined(PADDLE_WITH_NCCL)
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/gather_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
......@@ -32,13 +33,20 @@ void GatherOpHandle::RunImpl() {
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
platform::errors::InvalidArgument(
"The number of input variables should be equal "
"to the number of places, but got the number of input variables is "
"%d and the number of places is %d.",
in_var_handles.size(), places_.size()));
VarHandle *out_var_handle;
{
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
"The number of output should be one.");
PADDLE_ENFORCE_EQ(
out_var_handles.size(), 1,
platform::errors::InvalidArgument(
"The number of output variables should be 1, but got %d.",
out_var_handles.size()));
out_var_handle = out_var_handles.front();
}
......@@ -47,10 +55,14 @@ void GatherOpHandle::RunImpl() {
auto in_0_handle = in_var_handles[0];
auto pre_in_var =
var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
PADDLE_ENFORCE_NOT_NULL(pre_in_var);
PADDLE_ENFORCE_NOT_NULL(
pre_in_var,
platform::errors::NotFound("The variable '%s' is not found in the scope.",
in_0_handle->name()));
PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
"Currently, gather_op only can gather SelectedRows.");
PADDLE_ENFORCE_EQ(pre_in_var->IsType<framework::SelectedRows>(), true,
platform::errors::Unimplemented(
"Currently, gather_op only supports SelectedRows."));
// Wait input done, this Wait is asynchronous operation
WaitInputVarGenerated();
......@@ -63,7 +75,10 @@ void GatherOpHandle::RunImpl() {
for (auto *in_handle : in_var_handles) {
auto *in_var =
var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var,
platform::errors::NotFound(
"The variable '%s' is not found in the scope.", in_handle->name()));
VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var);
auto &in_sr_value = in_var->Get<framework::SelectedRows>();
......@@ -76,15 +91,19 @@ void GatherOpHandle::RunImpl() {
// NOTE: The Places of all input tensor must be all on CPU or all on GPU.
platform::Place t_out_p = out_var_handle->place();
if (platform::is_gpu_place(pre_in_value.place())) {
PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
"Places of input and output must be all on GPU.");
PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
} else {
t_out_p = platform::CPUPlace();
}
auto out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name());
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(
out_var,
platform::errors::NotFound("The variable '%s' is not found in the scope.",
out_var_handle->name()));
auto out_value = out_var->GetMutable<framework::SelectedRows>();
out_value->set_height(pre_in_value.height());
out_value->set_rows(out_rows);
......
......@@ -13,8 +13,10 @@
// limitations under the License.
#include "paddle/fluid/framework/details/gather_op_handle.h"
#include <memory>
#include <unordered_map>
#include "gtest/gtest.h"
namespace paddle {
......@@ -60,7 +62,8 @@ struct TestGatherOpHandle {
ctxs_.emplace_back(new p::CUDADeviceContext(p));
}
#else
PADDLE_THROW("CUDA is not support.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
} else {
int count = 8;
......@@ -141,7 +144,9 @@ struct TestGatherOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"The variable '%s' is not found in the scope.", "input"));
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
......@@ -155,7 +160,9 @@ struct TestGatherOpHandle {
}
auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(
out_var, platform::errors::NotFound(
"The variable '%s' is not found in the scope.", "out"));
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input");
......@@ -173,9 +180,19 @@ struct TestGatherOpHandle {
auto& out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
platform::errors::InvalidArgument(
"The height of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
height, out_select_rows.height()));
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
PADDLE_ENFORCE_EQ(
out_select_rows.rows()[k], rows[k % rows.size()],
platform::errors::InvalidArgument(
"The item at position %d of rows of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
k, rows[k % rows.size()], out_select_rows.rows()[k]));
}
f::Tensor result_tensor;
......@@ -207,6 +224,7 @@ TEST(GatherTester, TestGPUGatherTestSelectedRows) {
test_op.TestGatherSelectedRows(input_scope_idx);
}
#endif
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -46,14 +46,17 @@ class NCCLOpHandleBase : public OpHandleBase {
}
virtual ~NCCLOpHandleBase() {
for (auto& ev : inter_events_) {
PADDLE_ENFORCE(cudaEventDestroy(ev.second));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
}
for (auto& ev : exter_events_) {
PADDLE_ENFORCE(cudaEventDestroy(ev.second));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
}
}
void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
PADDLE_ENFORCE(run_order >= 0, "run_order must >= 0");
PADDLE_ENFORCE_GE(
run_order, 0,
platform::errors::InvalidArgument(
"The argument run_order must be >= 0, but got %d.", run_order));
run_order_ = run_order;
use_hierarchical_allreduce_ = use_hierarchical_allreduce;
......@@ -74,8 +77,11 @@ class NCCLOpHandleBase : public OpHandleBase {
return;
}
PADDLE_ENFORCE(places_.size() == 1,
"HierarchicalAllReduce run one proc with one card mode.");
PADDLE_ENFORCE_EQ(places_.size(), 1,
platform::errors::InvalidArgument(
"HierarchicalAllReduce can only run "
"one proccess with one card mode, but got %d cards.",
places_.size()));
for (auto& p : places_) {
auto ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order);
......@@ -88,11 +94,11 @@ class NCCLOpHandleBase : public OpHandleBase {
continue;
}
PADDLE_ENFORCE(cudaSetDevice(dev_id));
PADDLE_ENFORCE(cudaEventCreateWithFlags(&inter_events_[dev_id],
cudaEventDisableTiming));
PADDLE_ENFORCE(cudaEventCreateWithFlags(&exter_events_[dev_id],
cudaEventDisableTiming));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
&inter_events_[dev_id], cudaEventDisableTiming));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
&exter_events_[dev_id], cudaEventDisableTiming));
VLOG(10) << "Create events on dev_id:" << dev_id
<< ", inter_event:" << &inter_events_[dev_id]
<< ", exter_event:" << &exter_events_[dev_id];
......@@ -102,7 +108,10 @@ class NCCLOpHandleBase : public OpHandleBase {
void FlatNCCLAllReduce(platform::Place place, const void* sendbuff,
void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op) {
PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
PADDLE_ENFORCE_GE(
run_order_, 0,
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_);
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto& nccl_ctx = flat_nccl_ctxs->at(dev_id);
......@@ -113,14 +122,17 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", dev_id:" << dev_id << ", dtype:" << datatype
<< ", place:" << place;
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, count, datatype, op, comm, stream));
}
void NCCLAllReduce(platform::Place place, const void* sendbuff,
void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op) {
PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
PADDLE_ENFORCE_GE(
run_order_, 0,
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
if (!use_hierarchical_allreduce_) {
FlatNCCLAllReduce(place, sendbuff, recvbuff, count, datatype, op);
return;
......@@ -132,7 +144,10 @@ class NCCLOpHandleBase : public OpHandleBase {
void HierarchicalAllReduce(platform::Place place, const void* sendbuff,
void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op) {
PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0");
PADDLE_ENFORCE_GE(
run_order_, 0,
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
InterReduce(place, sendbuff, recvbuff, count, datatype, op);
// When a trainer is not in exter allreduce ring
// they need not to call this.
......@@ -157,14 +172,13 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", dtype:" << datatype << ", place:" << place
<< ", stream:" << stream;
PADDLE_ENFORCE(platform::dynload::ncclReduce(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream));
cudaEventRecord(inter_events_.at(dev_id), stream);
if (FLAGS_sync_nccl_allreduce) {
PADDLE_ENFORCE(cudaStreamSynchronize(stream),
"sync HierarchicalAllReduce inter stream error");
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
}
}
......@@ -172,7 +186,9 @@ class NCCLOpHandleBase : public OpHandleBase {
void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op) {
auto nccl_ctxs = nccl_ctxs_->GetHierarchicalExterCtx(run_order_);
PADDLE_ENFORCE(nccl_ctxs_, "can't get exter %d nccl_ctxs", run_order_);
PADDLE_ENFORCE_NOT_NULL(
nccl_ctxs_, platform::errors::NotFound(
"Can't get exter %d nccl contexts.", run_order_));
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto& nccl_ctx = nccl_ctxs->at(dev_id);
auto stream = nccl_ctx.stream();
......@@ -185,14 +201,13 @@ class NCCLOpHandleBase : public OpHandleBase {
cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, count, datatype, op, comm, stream));
cudaEventRecord(exter_events_.at(dev_id), stream);
if (FLAGS_sync_nccl_allreduce) {
PADDLE_ENFORCE(cudaStreamSynchronize(stream),
"sync HierarchicalAllReduce exter stream error");
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
}
}
......@@ -210,8 +225,8 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", stream:" << stream;
cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
PADDLE_ENFORCE(platform::dynload::ncclBcast(sendbuff, count, datatype, 0,
comm, stream));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
sendbuff, count, datatype, 0, comm, stream));
}
protected:
......
......@@ -47,8 +47,8 @@ void OpHandleBase::InitCUDA() {
#ifdef PADDLE_WITH_CUDA
for (auto &p : dev_ctxes_) {
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
PADDLE_ENFORCE(cudaSetDevice(dev_id));
PADDLE_ENFORCE(
PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
}
if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
......@@ -62,17 +62,22 @@ void OpHandleBase::InitCUDA() {
}
}
} else {
PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL,
"%s should have only one dev_ctx.", Name());
PADDLE_ENFORCE_EQ(
dev_ctxes_.size(), 1UL,
platform::errors::InvalidArgument(
"Operator %s should have only one dev_ctx, but got %d.", Name(),
dev_ctxes_.size()));
auto &place = dev_ctxes_.begin()->first;
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
if (out_var_handle) {
PADDLE_ENFORCE(platform::is_same_place(place, out_var_handle->place()),
"The place of output(%s) is not consistent with the "
"place of current op(%s).",
out_var_handle->Name(), Name());
PADDLE_ENFORCE_EQ(
platform::is_same_place(place, out_var_handle->place()), true,
platform::errors::InvalidArgument(
"The place of output(%s) is not consistent with the "
"place of current op(%s).",
out_var_handle->Name(), Name()));
out_var_handle->SetGenerateEvent(events_.at(dev_id));
}
}
......@@ -86,7 +91,10 @@ void OpHandleBase::Run(bool use_cuda) {
InitCUDA();
}
#else
PADDLE_ENFORCE(!use_cuda);
PADDLE_ENFORCE_EQ(use_cuda, false,
platform::errors::InvalidArgument(
"Argument use_cuda should be false when Paddle is not "
"compiled with CUDA."));
#endif
// skip running current op, used with inplace_addto_op_pass
......@@ -100,17 +108,20 @@ void OpHandleBase::Run(bool use_cuda) {
void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_NOT_NULL(waited_ctx);
PADDLE_ENFORCE_NOT_NULL(waited_ctx, platform::errors::InvalidArgument(
"Argument waited_ctx is NULL."));
if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
for (auto &dev_ctx : dev_ctxes_) {
PADDLE_ENFORCE_NOT_NULL(dev_ctx.second);
PADDLE_ENFORCE_NOT_NULL(
dev_ctx.second,
platform::errors::InvalidArgument("The device context is NULL."));
dev_ctx.second->Wait();
}
} else {
auto stream =
static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
for (auto &ev : events_) {
PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0));
}
}
#else
......@@ -145,10 +156,11 @@ void OpHandleBase::WaitInputVarGenerated() {
auto stream =
static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
->stream();
PADDLE_ENFORCE(
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else
PADDLE_THROW("Doesn't compile the GPU.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
}
// There are nothing to do when the place is CPUPlace.
......@@ -169,10 +181,11 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
auto stream = static_cast<platform::CUDADeviceContext *>(
dev_ctxes_.at(in_var_handle->place()))
->stream();
PADDLE_ENFORCE(
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else
PADDLE_THROW("Doesn't compile the GPU.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
}
// There are nothing to do when the place is CPUPlace.
......@@ -242,7 +255,9 @@ void OpHandleBase::SetLocalExecScopes(
auto scopes = GetLocalScopes();
for (auto *scope : scopes) {
auto iter = scope_map.find(scope);
PADDLE_ENFORCE(iter != scope_map.end(), "Local scope not found");
PADDLE_ENFORCE_NE(
iter, scope_map.end(),
platform::errors::NotFound("Local scope not found in scope map."));
local_exec_scopes_.emplace_back(iter->second);
}
}
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/inplace_op_inference.h"
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
......@@ -186,19 +187,20 @@ struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
void operator()(const char* op_type, OpInfo* info) const {
PADDLE_ENFORCE_EQ(info->proto_, nullptr,
platform::errors::AlreadyExists(
"OpProto of %s has been registered", op_type));
"OpProto of %s has been registered.", op_type));
PADDLE_ENFORCE_EQ(info->checker_, nullptr,
platform::errors::AlreadyExists(
"OpAttrChecker of %s has been registered", op_type));
"OpAttrChecker of %s has been registered.", op_type));
info->proto_ = new proto::OpProto;
info->checker_ = new OpAttrChecker();
T maker;
maker(info->proto_, info->checker_);
info->proto_->set_type(op_type);
PADDLE_ENFORCE(
info->proto_->IsInitialized(),
"Fail to initialize %s's OpProto, because %s is not initialized",
op_type, info->proto_->InitializationErrorString());
PADDLE_ENFORCE_EQ(
info->proto_->IsInitialized(), true,
platform::errors::PreconditionNotMet(
"Fail to initialize %s's OpProto, because %s is not initialized.",
op_type, info->proto_->InitializationErrorString()));
}
};
......
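The OpInfoFiller specialization above instantiates a maker type T and lets it fill info->proto_ and info->checker_. A minimal sketch of such a maker, using the usual OpProtoAndCheckerMaker API (the op name and comments are illustrative only, not part of this commit):

    #include "paddle/fluid/framework/op_proto_maker.h"

    // Hypothetical maker: declares the op's inputs, outputs and documentation,
    // which OpInfoFiller<T, kOpProtoAndCheckerMaker> records into OpInfo.
    class MyReluOpMaker : public paddle::framework::OpProtoAndCheckerMaker {
     public:
      void Make() override {
        AddInput("X", "(Tensor) The input tensor.");
        AddOutput("Out", "(Tensor) The output tensor, element-wise max(X, 0).");
        AddComment("Illustrative relu operator used only as an example.");
      }
    };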
......@@ -16,6 +16,7 @@
#include <algorithm>
#include <map>
#include <vector>
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"
......@@ -32,9 +33,13 @@ struct ReduceLoDTensor {
template <typename T>
void apply() const {
PADDLE_ENFORCE(!src_tensors_.empty());
PADDLE_ENFORCE_NE(src_tensors_.empty(), true,
platform::errors::InvalidArgument(
"The number of tensors to be reduced is 0."));
auto &t0 = *src_tensors_[0];
PADDLE_ENFORCE_NE(t0.numel(), 0);
PADDLE_ENFORCE_NE(t0.numel(), 0,
platform::errors::InvalidArgument(
"The size of first tensor to be reduced is 0."));
dst_tensor_.Resize(t0.dims());
T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
......@@ -45,8 +50,19 @@ struct ReduceLoDTensor {
continue;
}
PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
PADDLE_ENFORCE_EQ(t.type(), t0.type());
PADDLE_ENFORCE_EQ(t.dims(), t0.dims(),
platform::errors::InvalidArgument(
"The shape of tensors to be reduced must be "
"consistent. The shape of current tensor is %s, "
"but the shape of the first tensor is %s.",
t.dims(), t0.dims()));
PADDLE_ENFORCE_EQ(t.type(), t0.type(),
platform::errors::InvalidArgument(
"The type of tensors to be reduced must be "
"consistent. The type of current tensor is %s, "
"but the type of the first tensor is %s.",
t.type(), t0.type()));
std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
[](T a, T b) -> T { return a + b; });
}
......@@ -88,7 +104,9 @@ struct GatherLocalSelectedRowsFunctor {
in_places_(in_places),
out_place_(out_place),
dst_selected_rows_(dst_selected_rows) {
PADDLE_ENFORCE_EQ(src_selected_rows.empty(), false);
PADDLE_ENFORCE_NE(src_selected_rows.empty(), true,
platform::errors::InvalidArgument(
"The number of selected_rows to be gathered is 0."));
std::vector<int64_t> out_rows;
......
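ReduceLoDTensor::apply() above accumulates every source tensor into dst with std::transform, i.e. an element-wise dst[i] += t[i]. A standalone sketch of that accumulation on plain vectors:

    #include <algorithm>
    #include <vector>

    int main() {
      std::vector<float> dst{1.f, 2.f, 3.f};
      std::vector<float> src{10.f, 20.f, 30.f};
      // Element-wise accumulation, the same pattern used in ReduceLoDTensor.
      std::transform(src.begin(), src.end(), dst.begin(), dst.begin(),
                     [](float a, float b) { return a + b; });
      // dst is now {11, 22, 33}
      return 0;
    }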
......@@ -13,7 +13,9 @@
// limitations under the License.
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include <memory>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
......@@ -116,8 +118,15 @@ void ReduceOpHandle::GatherSelectedRows(
merged_dev_ctx->Wait();
scope->EraseVars(std::vector<std::string>{gathered_var_name});
PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope));
PADDLE_ENFORCE(remote.size() == vars.size());
PADDLE_ENFORCE_EQ(
client->Gather(vars, &remote, *merged_dev_ctx, scope), true,
platform::errors::PreconditionNotMet("Gather SelectedRows failed."));
PADDLE_ENFORCE_EQ(remote.size(), vars.size(),
platform::errors::PreconditionNotMet(
"The number of remotes should be equal to the number "
"of variables to be gathered, but got the number of "
"remotes is %d and the number of variables is %d.",
remote.size(), vars.size()));
// 4. merged local selected rows.
std::vector<const SelectedRows *> all;
......@@ -151,14 +160,19 @@ void ReduceOpHandle::RunImpl() {
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
platform::errors::InvalidArgument(
"The number of inputs should equal to the number of places, but got "
"the number of inputs is %d and the number of places is %d.",
in_var_handles.size(), places_.size()));
VarHandle *out_var_handle;
{
auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL,
"The number of output should be one.");
platform::errors::InvalidArgument(
"The number of output should be one, but got %d.",
out_var_handles.size()));
out_var_handle = out_var_handles.front();
}
......@@ -168,7 +182,10 @@ void ReduceOpHandle::RunImpl() {
auto pre_in_var =
var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
PADDLE_ENFORCE_NOT_NULL(pre_in_var);
PADDLE_ENFORCE_NOT_NULL(pre_in_var, platform::errors::NotFound(
"Variable %s is not found in scope.",
in_0_handle->name()));
// NOTE: The Places of all input tensor must be all on CPU or all on GPU.
std::vector<platform::Place> in_places; // used to get dev_ctx
......@@ -176,21 +193,29 @@ void ReduceOpHandle::RunImpl() {
in_places.emplace_back(in_handle->place());
auto in_var =
var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scope.",
in_handle->name()));
VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var);
}
auto out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name());
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(
out_var, platform::errors::NotFound("Variable %s is not found in scope.",
out_var_handle->name()));
// NOTE: The tensors' Place of input and output must be all on GPU or all on
// CPU.
auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place();
platform::Place t_out_p;
if (platform::is_gpu_place(in_p)) {
PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place()),
"Places of input and output must be all on GPU.");
PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_var_handle->place()), true,
platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
t_out_p = out_var_handle->place();
} else {
t_out_p = platform::CPUPlace();
......@@ -229,7 +254,10 @@ void ReduceOpHandle::RunImpl() {
in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
out_var->GetMutable<framework::SelectedRows>());
} else {
PADDLE_THROW("only support double or float when gather SelectedRows");
PADDLE_THROW(platform::errors::Unimplemented(
"Only support double or float when gather SelectedRows, but got "
"%s.",
framework::DataTypeToString(in_selected_rows[0]->value().type())));
}
#endif
});
......@@ -292,7 +320,7 @@ void ReduceOpHandle::RunImpl() {
size_t numel = static_cast<size_t>(lod_tensor.numel());
all_reduce_calls.emplace_back(
[buffer, recvbuffer, type, numel, root_id, &nccl_ctx] {
PADDLE_ENFORCE(platform::dynload::ncclReduce(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
buffer, recvbuffer, numel, static_cast<ncclDataType_t>(type),
ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream()));
});
......@@ -306,10 +334,13 @@ void ReduceOpHandle::RunImpl() {
}
});
#else
PADDLE_THROW("CUDA is not enabled.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
} else {
PADDLE_THROW("Place should be CPUPlace or CUDAPlace.");
PADDLE_THROW(platform::errors::InvalidArgument(
"The place of tensor should be CPUPlace or CUDAPlace, but got %s.",
lod_tensors[0]->place()));
}
}
}
......
......@@ -13,7 +13,9 @@
// limitations under the License.
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
......@@ -69,7 +71,8 @@ struct TestReduceOpHandle {
}
nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
#else
PADDLE_THROW("CUDA is not support.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif
} else {
int count = 8;
......@@ -103,7 +106,8 @@ struct TestReduceOpHandle {
op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
gpu_list_, nccl_ctxs_.get()));
#else
PADDLE_THROW("CUDA is not support.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif
} else {
#if defined(PADDLE_WITH_NCCL)
......@@ -164,7 +168,10 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"Variable %s is not found in scope.", "input"));
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
......@@ -178,7 +185,9 @@ struct TestReduceOpHandle {
}
auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(out_var,
platform::errors::NotFound(
"Variable %s is not found in scope.", "out"));
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
......@@ -196,9 +205,18 @@ struct TestReduceOpHandle {
auto &out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
platform::errors::InvalidArgument(
"The height of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
height, out_select_rows.height()));
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
PADDLE_ENFORCE_EQ(
out_select_rows.rows()[k], rows[k % rows.size()],
platform::errors::InvalidArgument(
"The item at position %d of rows of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
k, rows[k % rows.size()], out_select_rows.rows()[k]));
}
f::Tensor result_tensor;
......@@ -208,7 +226,7 @@ struct TestReduceOpHandle {
for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
}
}
} // namespace details
void TestReduceLodTensors(size_t output_scope_idx) {
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
......@@ -220,7 +238,9 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"Variable %s is not found in scope.", "input"));
auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
in_lod_tensor->set_lod(lod);
......@@ -230,7 +250,9 @@ struct TestReduceOpHandle {
}
auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(out_var,
platform::errors::NotFound(
"Variable %s is not found in scope.", "out"));
auto out_lodtensor = out_var->GetMutable<f::LoDTensor>();
auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
......@@ -254,7 +276,7 @@ struct TestReduceOpHandle {
ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5);
}
}
};
}; // namespace details
TEST(ReduceTester, TestCPUReduceTestSelectedRows) {
TestReduceOpHandle test_op;
......
......@@ -111,13 +111,12 @@ void ShareTensorBufferFunctor::CallOnce() {
auto *out_var = exec_scope_->FindVar(out_var_names_[i]);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"The input variable(%s)to be inplaced should not be NULL.",
"The variable(%s) to be inplaced is not found in scope.",
in_var_infos_[i]->Name()));
PADDLE_ENFORCE_NOT_NULL(
out_var,
platform::errors::NotFound(
"The output variable(%s) to be inplaced should not be NULL.",
out_var_names_[i]));
out_var, platform::errors::NotFound(
"The variable(%s) to be inplaced is not found in scope.",
out_var_names_[i]));
PADDLE_ENFORCE_NE(
in_var, out_var,
platform::errors::PreconditionNotMet(
......
......@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
#include <algorithm>
#include <utility>
#include "dgc/dgc.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
......@@ -38,18 +40,23 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle(
is_encoded_(is_encoded),
nranks_(nranks) {
// TODO(gongwb) :polish them!
PADDLE_ENFORCE_EQ(is_encoded, true);
PADDLE_ENFORCE_EQ(is_encoded, true, platform::errors::InvalidArgument(
"The argument is_encoded is false."));
VLOG(1) << "Use dgc allreduce mode"
<< ", nranks:" << nranks_;
PADDLE_ENFORCE_GT(local_scopes_.size(), 0);
PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto nranks_name = g_dgc_nranks;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *local_scope = local_scopes_[i];
auto nranks_var = local_scope->FindVar(nranks_name);
if (nranks_var == nullptr) {
PADDLE_THROW("not find nranks_var:%s", nranks_name);
}
PADDLE_ENFORCE_NOT_NULL(
nranks_var, platform::errors::NotFound(
"Variable %s is not found in scope.", nranks_name));
float *dgc_nranks = nranks_var->GetMutable<LoDTensor>()->data<float>();
*dgc_nranks = nranks;
......@@ -64,10 +71,18 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The NoDummyInputSize should be equal to the number of places.");
platform::errors::PreconditionNotMet(
"The number of input variables should be equal to the number of "
"places, but got the number of input variables is %zu and the the "
"number of places is %zu.",
in_var_handles.size(), places_.size()));
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
platform::errors::PreconditionNotMet(
"The number of input variables should be equal to the number of "
"output variables, but got the number of input variables is %zu and "
"the the number of output variables is %zu.",
in_var_handles.size(), out_var_handles.size()));
std::vector<const LoDTensor *> ins;
std::vector<LoDTensor *> gathers;
......@@ -80,14 +95,17 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
auto encode_var_name = original_name + g_dgc_encoded;
auto *in_var = local_scope->FindVar(encode_var_name);
PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scope.",
encode_var_name));
auto &in = in_var->Get<LoDTensor>();
ins.emplace_back(&in);
auto gather_var_name = original_name + g_dgc_gather;
auto *gather_var = local_scope->FindVar(gather_var_name);
PADDLE_ENFORCE_NOT_NULL(gather_var, "%s should not be null",
gather_var_name);
PADDLE_ENFORCE_NOT_NULL(
gather_var, platform::errors::NotFound(
"Variable %s is not found in scope.", gather_var));
auto *gather = gather_var->GetMutable<LoDTensor>();
gathers.emplace_back(gather);
......@@ -100,14 +118,26 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
}
}
PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place()));
PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place()));
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ins[0]->place()), true,
platform::errors::InvalidArgument(
"The place of input variable should be CUDAPlace, but got %s.",
ins[0]->place()));
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(outs[0]->place()), true,
platform::errors::InvalidArgument(
"The place of input variable should be CUDAPlace, but got %s.",
outs[0]->place()));
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, platform::errors::PreconditionNotMet(
"The nccl contexts are NULL."));
int dtype = -1;
size_t in_numel = 0;
size_t out_numel = 0;
PADDLE_ENFORCE(nranks_ > 1);
PADDLE_ENFORCE_GT(
nranks_, 1,
platform::errors::PreconditionNotMet(
"The number of ranks should be > 1, but got %d.", nranks_));
std::vector<std::function<void()>> all_gather_calls;
std::vector<std::function<void()>> sparse_reduce_calls;
......@@ -123,8 +153,16 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype;
in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel;
PADDLE_ENFORCE(in_numel % 2 == 0);
PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k));
PADDLE_ENFORCE_EQ(in_numel % 2, 0,
platform::errors::InvalidArgument(
"The number of elements of input variable should be "
"even, but got %zu.",
in_numel));
PADDLE_ENFORCE_EQ(in_numel / 2, static_cast<size_t>(k),
platform::errors::InvalidArgument(
"The number of elements of input variable should be "
"even, but got %zu.",
in_numel));
out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
......@@ -154,7 +192,8 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
PADDLE_ENFORCE_EQ(paddle::communication::dgc::sparseReduce(
gather_buff, k, out_tensor_buf,
static_cast<int>(out_numel), nranks_, stream),
true);
true, platform::errors::Unavailable(
"Calling sparseReduce() failed."));
});
}
......@@ -187,11 +226,16 @@ void SparseAllReduceOpHandle::SparseAllReduceFunc(
int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) {
auto original_name = paddle::framework::GradOriginalVarName(grad_name);
auto var_name = original_name + g_dgc_k;
PADDLE_ENFORCE(local_scopes_.size() > 0);
PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto *scope = local_exec_scopes_[0];
auto var = scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
var_name));
auto tensor = var->Get<LoDTensor>().data<float>();
return *tensor;
}
......@@ -202,15 +246,22 @@ bool SparseAllReduceOpHandle::IsEncoded() {
}
auto counter_name = g_dgc_counter_name;
auto step_name = g_dgc_rampup_begin_step;
PADDLE_ENFORCE(local_scopes_.size() > 0);
PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto *local_scope = local_exec_scopes_[0];
auto count_var = local_scope->FindVar(counter_name);
auto step_var = local_scope->FindVar(step_name);
if (count_var == nullptr || step_var == nullptr) {
PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name,
step_var);
}
PADDLE_ENFORCE_NOT_NULL(
count_var, platform::errors::NotFound(
"Variable %s is not found in scope.", counter_name));
PADDLE_ENFORCE_NOT_NULL(
step_var, platform::errors::NotFound("Variable %s is not found in scope.",
step_name));
float count = *count_var->Get<LoDTensor>().data<float>();
float step = *step_var->Get<LoDTensor>().data<float>();
......
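The two checks on in_numel above encode the assumption that a DGC-encoded gradient stores its top-k entries as k index slots plus k value slots, so the encode buffer must hold exactly 2 * k elements. A small sketch of that size relation (the exact buffer layout is an assumption here; only the counts matter):

    #include <cassert>
    #include <cstddef>

    // The encoded buffer carries k indices and k values: 2 * k elements total.
    void CheckEncodedSize(std::size_t in_numel, std::size_t k) {
      assert(in_numel % 2 == 0);   // indices and values come in pairs
      assert(in_numel / 2 == k);   // exactly k (index, value) pairs
    }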
......@@ -97,6 +97,7 @@ message AsyncConfig {
optional int32 thread_pool_size = 6 [ default = 1 ];
optional int32 send_wait_times = 7 [ default = 1 ];
optional bool runtime_split_send_recv = 8 [ default = false ];
optional bool launch_barrier = 9 [ default = true ];
}
message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; }
......
......@@ -127,11 +127,10 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], no enough memory", ret));
if (FLAGS_init_allocated_mem) {
PADDLE_THROW(platform::errors::Unimplemented(
"xpu memory FLAGS_init_allocated_mem is not implemented."));
......
......@@ -891,6 +891,28 @@ class SquareDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
}
};
// log Grad: dx = dout / x
// log Grad Grad: ddout = ddx / x; dx = -(dout / x) * (ddx / x)
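//
// Derivation sketch (for y = log(x)):
//   first-order grad:  dx = dout * dy/dx = dout / x
//   differentiating dx = dout / x w.r.t. its inputs, with ddx denoting the
//   incoming gradient of dx:
//     ddout = ddx * d(dx)/d(dout) = ddx / x
//     dx    = ddx * d(dx)/d(x)    = ddx * (-dout / x^2) = -(dout / x) * (ddx / x)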
template <typename T>
class LogDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
public:
using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("log_grad_grad");
op->SetInput("X", this->Input("X"));
// X@GRAD@GRAD: ddx
op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
op->SetInput("DOut", this->Input(framework::GradVarName("Out")));
op->SetAttrMap(this->Attrs());
// X@GRAD: dx
op->SetOutput("DX", this->InputGrad("X"));
// Out@GRAD@GRAD: ddy
op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out")));
}
};
DECLARE_INPLACE_OP_INFERER(ActivationGradOpInplaceInferer,
{framework::GradVarName("Out"),
framework::GradVarName("X")});
......@@ -1272,6 +1294,35 @@ REGISTER_OP_CPU_KERNEL(
ops::AbsGradGradFunctor<int64_t>>);
/* ========================================================================== */
/* ========================== Log register ==================================*/
REGISTER_OPERATOR(
log, ops::ActivationOp, ops::LogOpMaker, ops::ActivationOpInferVarType,
ops::ActivationGradOpMaker<ops::LogGradFunctor<float>::FwdDeps(),
paddle::framework::OpDesc>,
ops::ActivationGradOpMaker<ops::LogGradFunctor<float>::FwdDeps(),
paddle::imperative::OpBase>,
ops::ActFwdInplaceInferer);
REGISTER_OPERATOR(log_grad, ops::ActivationOpGrad,
ops::ActivationGradOpInplaceInferer,
ops::LogDoubleGradMaker<paddle::framework::OpDesc>,
ops::LogDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(
log_grad_grad,
ops::ActivationOpDoubleGrad<ops::LogGradGradFunctor<float>::FwdDeps()>,
ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_ACTIVATION_CPU_KERNEL(log, Log, LogFunctor, LogGradFunctor);
REGISTER_OP_CPU_KERNEL(
log_grad_grad, ops::LogDoubleGradKernel<plat::CPUDeviceContext,
ops::LogGradGradFunctor<float>>,
ops::LogDoubleGradKernel<plat::CPUDeviceContext,
ops::LogGradGradFunctor<double>>,
ops::LogDoubleGradKernel<plat::CPUDeviceContext,
ops::LogGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ========================== register checkpoint ===========================*/
REGISTER_OP_VERSION(leaky_relu)
.AddCheckpoint(
......
......@@ -193,3 +193,15 @@ REGISTER_OP_CUDA_KERNEL(
ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::AbsGradGradFunctor<int64_t>>);
/* ========================================================================== */
/* ========================== Log register ==================================*/
REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, LogFunctor, LogGradFunctor);
REGISTER_OP_CUDA_KERNEL(
log_grad_grad, ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<float>>,
ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<double>>,
ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<plat::float16>>);
/* ========================================================================== */
......@@ -1663,6 +1663,10 @@ class SquareDoubleGradKernel
}
};
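// LogDoubleGradKernel reuses SquareDoubleGradKernel's Compute() unchanged; the
// two double-grad kernels differ only in the Functor template argument supplied
// at registration time (LogGradGradFunctor in the log case).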
template <typename DeviceContext, typename Functor>
class LogDoubleGradKernel
: public SquareDoubleGradKernel<DeviceContext, Functor> {};
template <typename DeviceContext, typename Functor>
class ELUDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
......@@ -1852,6 +1856,37 @@ class PowGradKernel
functor(*place, x, out, dout, dx);
}
};
template <typename T>
struct LogGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* X,
const framework::Tensor* ddX, framework::Tensor* ddOut,
const framework::Tensor* dOut, framework::Tensor* dX) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "LogGradGrad"));
auto x = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "LogGradGrad"));
// ddout = ddx / x; dx = -(dout / x) * (ddx / x)
    // compute dx first, so that ddout can safely be computed in place of ddx
if (dX) {
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Output", "DOut", "LogGradGrad"));
auto dx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dX, "Output", "DX", "LogGradGrad"));
dx.device(*d) = dout * static_cast<T>(-1) * ddx / (x * x);
}
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "LogGradGrad"));
ddout.device(*d) = ddx * static_cast<T>(1) / x;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
};
} // namespace operators
} // namespace paddle
......@@ -1872,7 +1907,6 @@ class PowGradKernel
__macro(cosh, Cosh, CoshFunctor, CoshGradFunctor); \
__macro(round, Round, RoundFunctor, ZeroGradFunctor); \
__macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
__macro(log, Log, LogFunctor, LogGradFunctor); \
__macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \
__macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \
__macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/activation_op.h"
#include <string>
#include "paddle/fluid/platform/xpu_header.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
template <typename Functor>
class XPUActivationKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext &context) const override {
Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
template <typename Functor>
class XPUActivationGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext &context) const override {
Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
template <typename DeviceContext, typename T>
void xpu_activation_forward(const framework::ExecutionContext &ctx,
xpu::Activation_t type) {
const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Out");
const T *x_data = x->data<T>();
T *y_data = y->mutable_data<T>(ctx.GetPlace());
int r = 0;
if (xpu::Activation_t::ACT_POW == type.type) {
type.pow_factor = ctx.Attr<float>("factor");
}
auto xpu_context = ctx.device_context<DeviceContext>().x_context();
r = xpu::activation_forward(xpu_context, type, x->numel(),
reinterpret_cast<const float *>(x_data),
reinterpret_cast<float *>(y_data));
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
template <typename DeviceContext, typename T>
void xpu_activation_backward(const framework::ExecutionContext &ctx,
xpu::Activation_t type) {
/* TODO: relu tanh sigmoid are inplace */
const auto *x = ctx.Input<Tensor>("X");
auto *y = ctx.Input<Tensor>("Out");
auto *dOut = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dX = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
const T *x_data = nullptr;
const T *y_data = nullptr;
const T *y_grad = nullptr;
if (x != nullptr) x_data = x->data<T>();
if (y != nullptr) y_data = y->data<T>();
if (dOut != nullptr) y_grad = dOut->data<T>();
T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
auto xpu_context = ctx.device_context<DeviceContext>().x_context();
int r = xpu::activation_backward(xpu_context, type, dX->numel(),
reinterpret_cast<const float *>(x_data),
reinterpret_cast<const float *>(y_data),
reinterpret_cast<const float *>(y_grad),
reinterpret_cast<float *>(x_grad));
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
template <typename T, xpu::Activation_t::act_enum algorithm>
struct XPUActivationFunc : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_forward<paddle::platform::XPUDeviceContext, T>(ctx,
algorithm);
}
};
template <typename T, xpu::Activation_t::act_enum algorithm>
struct XPUActivationGradFunc : public BaseActivationFunctor<T> {
void operator()(const framework::ExecutionContext &ctx) const {
xpu_activation_backward<paddle::platform::XPUDeviceContext, T>(ctx,
algorithm);
}
};
template <typename T>
using XPUReluFunctor = XPUActivationFunc<T, xpu::Activation_t::RELU>;
template <typename T>
using XPUSigmoidFunctor = XPUActivationFunc<T, xpu::Activation_t::SIGMOID>;
template <typename T>
using XPUTanhFunctor = XPUActivationFunc<T, xpu::Activation_t::TANH>;
template <typename T>
using XPUGeluFunctor = XPUActivationFunc<T, xpu::Activation_t::GELU>;
template <typename T>
using XPULogFunctor = XPUActivationFunc<T, xpu::Activation_t::LOG>;
template <typename T>
using XPUSquareFunctor = XPUActivationFunc<T, xpu::Activation_t::SQUARE>;
template <typename T>
using XPUSquareGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::SQUARE>;
template <typename T>
using XPUReluGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::RELU>;
template <typename T>
using XPUSigmoidGradFunctor =
XPUActivationGradFunc<T, xpu::Activation_t::SIGMOID>;
template <typename T>
using XPUTanhGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::TANH>;
template <typename T>
using XPUGeluGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::GELU>;
template <typename T>
using XPUSqrtFunctor = XPUActivationFunc<T, xpu::Activation_t::SQRT>;
template <typename T>
using XPUSqrtGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::SQRT>;
template <typename T>
using XPUACTPowFunctor = XPUActivationFunc<T, xpu::Activation_t::ACT_POW>;
template <typename T>
using XPUABSFunctor = XPUActivationFunc<T, xpu::Activation_t::ABS>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_XPU_KERNEL(act_type, \
ops::XPUActivationKernel<ops::functor<float>>); \
REGISTER_OP_XPU_KERNEL( \
act_type##_grad, \
ops::XPUActivationGradKernel<ops::grad_functor<float>>);
REGISTER_ACTIVATION_XPU_KERNEL(relu, XPUReluFunctor, XPUReluGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(tanh, XPUTanhFunctor, XPUTanhGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, XPUSigmoidFunctor,
XPUSigmoidGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(gelu, XPUGeluFunctor, XPUGeluGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor)
REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSquareGradFunctor)
REGISTER_OP_XPU_KERNEL(log,
ops::XPUActivationKernel<ops::XPULogFunctor<float>>);
REGISTER_OP_XPU_KERNEL(pow,
ops::XPUActivationKernel<ops::XPUACTPowFunctor<float>>);
REGISTER_OP_XPU_KERNEL(abs,
ops::XPUActivationKernel<ops::XPUABSFunctor<float>>);
#endif // PADDLE_WITH_XPU
......@@ -30,8 +30,10 @@ __global__ void ComputeDifferent(T *centers_diff, const T *X, const T *centers,
while (idy < K) {
int64_t id = ids[idy];
PADDLE_ENFORCE(id >= 0, "received id:", id);
PADDLE_ENFORCE(id < N, "received id:", id);
    PADDLE_ENFORCE(id >= 0,
                   "Id should be greater than or equal to 0, but received id: %d.",
                   id);
    PADDLE_ENFORCE(id < N, "Id should be smaller than %d, but received id: %d.",
                   N, id);
T *out = centers_diff + idy * D;
const T *x = X + idy * D;
const T *cent = centers + id * D;
......@@ -52,8 +54,9 @@ __global__ void UpdateCenters(T *centers, T *centers_diff, const int64_t *ids,
while (idy < K) {
int count = 1;
int64_t id = ids[idy];
PADDLE_ENFORCE(id >= 0, "received id:", id);
PADDLE_ENFORCE(id < N, "received id:", id);
    PADDLE_ENFORCE(id >= 0,
                   "Id should be greater than or equal to 0, but received id: %d.",
                   id);
    PADDLE_ENFORCE(id < N, "Id should be smaller than %d, but received id: %d.",
                   N, id);
for (int i = 0; i < K; i++) {
if (ids[i] == id) {
......
......@@ -69,8 +69,10 @@ template <typename T>
class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use CUDAPlace.");
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument(
"CTCAlign operator CUDA kernel must use CUDAPlace "
"rather than CPUPlace."));
auto* input = ctx.Input<LoDTensor>("Input");
auto* output = ctx.Output<LoDTensor>("Output");
const int blank = ctx.Attr<int>("blank");
......
......@@ -72,8 +72,11 @@ class CTCAlignKernel : public framework::OpKernel<T> {
// check input dims and lod
PADDLE_ENFORCE_EQ(
input_dims[0], static_cast<int64_t>(input_lod[level].back()),
"The first dimension of Input(Input) should be equal to "
"the sum of all sequences' lengths.");
platform::errors::InvalidArgument(
"The first dimension %d of CTCAlign operator Input(Input) should "
"be equal to "
"the sum of all sequences' lengths %d.",
input_dims[0], static_cast<int64_t>(input_lod[level].back())));
const size_t num_sequences = input_lod[level].size() - 1;
......
......@@ -175,10 +175,6 @@ void RecvGeoSparseRecords(const CommContext &rpc_ctx,
template <typename T>
void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto cpu_place = platform::CPUPlace();
auto &cpu_ctx = *pool.Get(cpu_place);
distributed::RPCClient *rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);
......@@ -188,8 +184,13 @@ void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) {
if (rpc_ctx.origin_varnames.size() == 1 &&
rpc_ctx.splited_varnames.size() == 1) {
auto varname = rpc_ctx.origin_varnames[0];
VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0];
rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], cpu_ctx,
const auto place =
scope.FindVar(varname)->Get<framework::LoDTensor>().place();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &ctx = *pool.Get(place);
VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0] << " in gpu? "
<< platform::is_gpu_place(place);
rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], ctx,
scope, varname, varname));
for (size_t i = 0; i < rets.size(); i++) {
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwiseAddXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
XPUElementwise<T, XPUAddFunctor<T>>(ctx);
}
};
template <typename DeviceContext, typename T>
class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
ElemwiseGradKernel<T>::Compute(ctx);
using Tensor = framework::Tensor;
auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
auto dx_dims = dout->dims();
auto dy_dims_untrimed = dout->dims();
T *dx_data = NULL;
T *dy_data = NULL;
int axis = ctx.Attr<int>("axis");
PADDLE_ENFORCE_GE(dx_dims.size(), dy_dims_untrimed.size(),
"Rank of first input must >= rank of second input.");
if (dx != nullptr) {
dx->mutable_data<T>(ctx.GetPlace());
dx_dims = dx->dims();
dx_data = dx->data<T>();
}
if (dy != nullptr) {
dy->mutable_data<T>(ctx.GetPlace());
dy_dims_untrimed = dy->dims();
dy_data = dy->data<T>();
}
int pre, n, post, is_common_broadcast;
if (dx_dims == dy_dims_untrimed) {
pre = post = 1;
n = dout->numel();
} else {
axis = (axis == -1 ? dx_dims.size() - dy_dims_untrimed.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < dx_dims.size(),
"Axis should be in range [0, dx_dims)");
auto dy_dims = trim_trailing_singular_dims(dy_dims_untrimed);
axis = (dy_dims.size() == 0) ? dx_dims.size() : axis;
get_mid_dims(dx_dims, dy_dims, axis, &pre, &n, &post,
&is_common_broadcast);
}
int len = pre * n * post;
auto &dev_ctx =
ctx.template device_context<paddle::platform::XPUDeviceContext>();
if (post == 1) {
int r = xpu::matrix_vector_add_grad(
dev_ctx.x_context(), dout->data<T>(), dout->data<T>(),
dout->data<T>(), dout->data<T>(), dx_data, dy_data, pre, n);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
return;
}
if (dx == nullptr) {
PADDLE_ENFORCE_EQ(
xpu_malloc(reinterpret_cast<void **>(&dx_data), len * sizeof(float)),
XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
}
if (dy == nullptr) {
PADDLE_ENFORCE_EQ(
xpu_malloc(reinterpret_cast<void **>(&dy_data), len * sizeof(float)),
XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
} else {
if (len != n) {
PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void **>(&dy_data),
len * sizeof(float)),
XPU_SUCCESS, platform::errors::External(
"XPU has no enough memory"));
}
}
int r = xpu::elementwise_add_grad(
dev_ctx.x_context(), dout->data<T>() /*x*/, dout->data<T>() /*y*/,
dout->data<T>() /*out*/, dout->data<T>(), dx_data, dy_data, len);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
if ((dy != nullptr) && (len != n)) {
r = xpu::reduce_ew(dev_ctx.x_context(), dy_data, dy->data<T>(), pre, n,
post, xpu::ElementwiseOp::ASSIGN);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
dev_ctx.Wait();
xpu_free(dy_data);
}
if ((dx == nullptr || dy == nullptr) && !(dy != nullptr && len != n)) {
dev_ctx.Wait();
}
if (dx == nullptr) {
xpu_free(dx_data);
}
if (dy == nullptr) {
xpu_free(dy_data);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
elementwise_add,
ops::ElementwiseAddXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(elementwise_add_grad,
ops::ElementwiseAddGradXPUKernel<
paddle::platform::XPUDeviceContext, float>);
#endif
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace operators {
template <typename T>
struct XPUAddFunctor {
int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) {
return xpu::elementwise_add(ctx, x, y, z, len);
}
};
template <typename T>
struct XPUMulFunctor {
int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) {
return xpu::elementwise_mul(ctx, x, y, z, len);
}
};
template <typename T, typename Functor>
void XPUElementwise(const framework::ExecutionContext& ctx) {
PADDLE_ENFORCE(platform::is_xpu_place(ctx.GetPlace()),
"This kernel only runs on XPU device.");
auto x_var = ctx.InputVar("X");
PADDLE_ENFORCE_NE(x_var, nullptr,
platform::errors::Fatal("Cannot get input Variable X"));
PADDLE_ENFORCE(x_var->IsType<framework::LoDTensor>(),
"XPU only support LoDTensor");
auto x = x_var->Get<framework::LoDTensor>();
auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
auto x_dims = x.dims();
auto y_dims_untrimed = y->dims();
PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(),
"Rank of first input must >= rank of second input.");
axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)");
auto y_dims = trim_trailing_singular_dims(y_dims_untrimed);
axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post, is_common_broadcast;
get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post, &is_common_broadcast);
int len = pre * n * post;
const T* x_data = x.data<T>();
const T* y_data = y->data<T>();
T* z_data = z->data<T>();
T* y_broadcast = nullptr;
auto& dev_ctx =
ctx.template device_context<paddle::platform::XPUDeviceContext>();
if (post == 1) {
if (std::is_same<Functor, XPUAddFunctor<T>>::value) {
int res = xpu::matrix_vector_add(dev_ctx.x_context(), x_data, y_data,
z_data, pre, n);
PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
res);
return;
}
if (std::is_same<Functor, XPUMulFunctor<T>>::value) {
int res = xpu::matrix_vector_mul(dev_ctx.x_context(), x_data, y_data,
z_data, pre, n);
PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
res);
return;
}
}
if (pre != 1 || post != 1) {
PADDLE_ENFORCE(xpu_malloc(reinterpret_cast<void**>(&y_broadcast),
len * sizeof(T)) == XPU_SUCCESS);
int res = xpu::broadcast_ew(dev_ctx.x_context(), y_data, y_broadcast, pre,
n, post, xpu::ElementwiseOp::ASSIGN);
PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
res);
y_data = y_broadcast;
}
Functor functor;
int res = functor(dev_ctx.x_context(), x_data, y_data, z_data, len);
PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d",
res);
if (pre != 1 || post != 1) {
dev_ctx.Wait();
xpu_free(y_broadcast);
}
}
} // namespace operators
} // namespace paddle
#endif
......@@ -46,6 +46,7 @@ class ScaleLoDTensorFunctor<platform::CPUDeviceContext, T> {
};
template class ScaleLoDTensorFunctor<platform::CPUDeviceContext, float>;
template class ScaleLoDTensorFunctor<platform::CPUDeviceContext, double>;
} // namespace math
} // namespace operators
......
......@@ -52,6 +52,7 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
};
template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, double>;
} // namespace math
} // namespace operators
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) {
if (x_dim.size() > 1) {
return x_dim;
}
return framework::make_ddim({1, x_dim[0]});
}
static framework::Tensor FoldInitDims(const framework::Tensor &input) {
auto output = input;
auto in_dims = input.dims();
if (in_dims.size() == 3) {
output.Resize({in_dims[0] * in_dims[1], in_dims[2]});
}
return output;
}
/**
 * Get column matrix shape from a vector shape. If the rank of y_dim > 1, the
* original y_dim is returned.
*/
static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) {
if (y_dim.size() > 1) {
return y_dim;
}
return framework::make_ddim({y_dim[0], 1});
}
static void ReshapeTensorIntoMatrixSequence(
framework::Tensor *x, const math::MatDescriptor &descriptor) {
int64_t h, w;
h = descriptor.height_;
w = descriptor.width_;
if (descriptor.trans_) {
std::swap(w, h);
}
if (descriptor.batch_size_) {
x->Resize({descriptor.batch_size_, h, w});
} else {
x->Resize({h, w});
}
}
/**
* Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor
* Out = matmul(x, y)
*
* This method will first calculate X,Y matrix sequence, and then calculate
* the out shape.
*
* Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2]
* The out = [BatchSize, H1, W2]
*
* If there is no batch size in `X` and `Y`, the out will be [H1, W2]
* If any of `X` and `Y` has batch size BatchSize, the out will have the
* BatchSize.
*/
static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x,
framework::Tensor *y,
framework::Tensor *out, bool trans_x,
bool trans_y) {
auto x_dim = RowMatrixFromVector(x->dims());
auto y_dim = ColumnMatrixFromVector(y->dims());
auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x);
auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y);
if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) {
out->Resize({mat_dim_x.height_, mat_dim_y.width_});
} else {
out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_),
mat_dim_x.height_, mat_dim_y.width_});
}
ReshapeTensorIntoMatrixSequence(x, mat_dim_x);
ReshapeTensorIntoMatrixSequence(y, mat_dim_y);
}
template <typename DeviceContext, typename T>
class MatMulXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *x = context.Input<framework::Tensor>("X");
auto *y = context.Input<framework::Tensor>("Y");
auto *out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
auto mat_dim_a = math::CreateMatrixDescriptor(
RowMatrixFromVector(x->dims()), 0, context.Attr<bool>("transpose_X"));
auto mat_dim_b =
math::CreateMatrixDescriptor(ColumnMatrixFromVector(y->dims()), 0,
context.Attr<bool>("transpose_Y"));
    PADDLE_ENFORCE_EQ(
        mat_dim_a.width_, mat_dim_b.height_,
        platform::errors::InvalidArgument("Shape mismatch in matmul_op"));
    PADDLE_ENFORCE_EQ(
        mat_dim_a.batch_size_, mat_dim_b.batch_size_,
        platform::errors::InvalidArgument("Shape mismatch in matmul_op"));
T alpha = static_cast<T>(context.Attr<float>("alpha"));
auto &dev_ctx = context.template device_context<DeviceContext>();
float *data_c = out->data<T>();
if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) {
int r =
xpu::fc_int16(dev_ctx.x_context(), mat_dim_a.trans_, mat_dim_b.trans_,
mat_dim_a.height_, mat_dim_b.width_, mat_dim_a.width_,
alpha, x->data<T>(), y->data<T>(), 0.0f, data_c);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
} else {
// batch matmul
int r = xpu::batched_gemm_int16(dev_ctx.x_context(), mat_dim_a.trans_,
mat_dim_b.trans_, mat_dim_a.batch_size_,
mat_dim_a.height_, mat_dim_b.width_,
mat_dim_a.width_, alpha, x->data<T>(),
y->data<T>(), data_c, nullptr, nullptr);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
}
};
// Reshape a rank-3 tensor from P x M x N to M x (P * N).
// (Warning: This requires transposing data and writes into new memory.)
// Identity op if the tensor is not of rank 3.
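// For example, a [4, 2, 3] input is transposed with axis order {1, 0, 2} into
// a [2, 4, 3] buffer, which is then viewed as a [2, 12] matrix.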
template <typename DeviceContext, typename T>
static framework::Tensor XPUFoldHeadAndLastDims(
const DeviceContext &context, const framework::Tensor &input) {
auto in_dims = input.dims();
if (in_dims.size() != 3) {
return input;
}
framework::Tensor output;
output.Resize({in_dims[1], in_dims[0], in_dims[2]});
output.mutable_data<T>(context.GetPlace());
std::vector<int> in_shape_host = {static_cast<int>(in_dims[0]),
static_cast<int>(in_dims[1]),
static_cast<int>(in_dims[2])};
std::vector<int> axis_host = {1, 0, 2};
int r = xpu::transpose(context.x_context(), input.data<T>(), output.data<T>(),
in_shape_host.data(), axis_host.data(), /*ndims=*/3);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
return output;
}
// Using dimensional constraints on matrix multiplication, it is
// straight-forward to check the following table for when X and Y
// are both matrices.
//
// transpose_X | False | True | False | True
// transpose_Y | False | False | True | True
// -----------+----------+----------+----------+-----------
// dX = | dOut Y^T | Y dOut^T | dOut Y | Y^T dOut^T
// dY = | X^T dOut | X dOut | dOut^T X | dOut^T X^T
//
// When X is a vector of size K, we treat it instead as a matrix of shape
// (1, K). Similarly, when Y is a vector of size K, we treat it instead as
// a matrix of shape (K, 1).
//
// When X and Y are both 3-dimensional tensors, then the first dimension
// the batch dimension can be ignored and the exact same formulas apply
// as for two matrices.
//
// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end
// up with formulas like
//
// dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj}
//
// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N
// to X: (P * M) x K, dOut: (P * M) x N.
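//
// As a concrete sketch with P = 4, M = 2, K = 3, N = 5 (and no transposes):
// X is folded from [4, 2, 3] to [8, 3] and dOut from [4, 2, 5] to [8, 5], so a
// single dY = X^T dOut GEMM produces the [3, 5] gradient of Y in one call.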
template <typename DeviceContext, typename T>
class MatMulGradXPUKernel : public framework::OpKernel<T> {
public:
void MatMul(const framework::ExecutionContext &context,
const framework::Tensor &a, bool trans_a,
const framework::Tensor &b, bool trans_b,
framework::Tensor *out) const {
out->mutable_data<T>(context.GetPlace());
auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a);
auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b);
    PADDLE_ENFORCE_EQ(
        mat_dim_a.width_, mat_dim_b.height_,
        platform::errors::InvalidArgument("Shape mismatch in matmul_grad_op"));
    PADDLE_ENFORCE_EQ(
        mat_dim_a.batch_size_, mat_dim_b.batch_size_,
        platform::errors::InvalidArgument("Shape mismatch in matmul_grad_op"));
T alpha = static_cast<T>(context.Attr<float>("alpha"));
auto &dev_ctx = context.template device_context<DeviceContext>();
float *data_c = out->data<T>();
if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) {
int r =
xpu::fc_int16(dev_ctx.x_context(), mat_dim_a.trans_, mat_dim_b.trans_,
mat_dim_a.height_, mat_dim_b.width_, mat_dim_a.width_,
alpha, a.data<T>(), b.data<T>(), 0.0f, data_c);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
} else {
// batch matmul
int r = xpu::batched_gemm_int16(dev_ctx.x_context(), mat_dim_a.trans_,
mat_dim_b.trans_, mat_dim_a.batch_size_,
mat_dim_a.height_, mat_dim_b.width_,
mat_dim_a.width_, alpha, a.data<T>(),
b.data<T>(), data_c, nullptr, nullptr);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
}
}
void CalcInputGrad(const framework::ExecutionContext &context,
const framework::Tensor &a, bool trans_a,
bool is_fold_init_dims_a, const framework::Tensor &b,
bool trans_b, bool is_fold_init_dims_b,
framework::Tensor *out) const {
if (out == nullptr) return;
bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) &&
out->dims().size() == 2;
if (!need_combine) {
MatMul(context, a, trans_a, b, trans_b, out);
} else {
auto &dev_ctx = context.template device_context<DeviceContext>();
MatMul(
context, is_fold_init_dims_a
? FoldInitDims(a)
: XPUFoldHeadAndLastDims<DeviceContext, T>(dev_ctx, a),
trans_a, is_fold_init_dims_b
? FoldInitDims(b)
: XPUFoldHeadAndLastDims<DeviceContext, T>(dev_ctx, b),
trans_b, out);
}
}
void Compute(const framework::ExecutionContext &context) const override {
auto x = *context.Input<framework::Tensor>("X");
auto y = *context.Input<framework::Tensor>("Y");
auto dout =
*context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto *dy = context.Output<framework::Tensor>(framework::GradVarName("Y"));
bool transpose_x = context.Attr<bool>("transpose_X");
bool transpose_y = context.Attr<bool>("transpose_Y");
ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y);
framework::DDim dx_dims;
if (dx) {
dx_dims = dx->dims();
if (dx_dims != x.dims()) {
dx->Resize(x.dims());
}
}
framework::DDim dy_dims;
if (dy) {
dy_dims = dy->dims();
if (dy_dims != y.dims()) {
dy->Resize(y.dims());
}
}
if (transpose_x && transpose_y) {
CalcInputGrad(context, y, true, true, dout, true, false, dx);
CalcInputGrad(context, dout, true, true, x, true, false, dy);
} else if (transpose_x) {
CalcInputGrad(context, y, false, false, dout, true, false, dx);
CalcInputGrad(context, x, false, false, dout, false, true, dy);
} else if (transpose_y) {
CalcInputGrad(context, dout, false, false, y, false, true, dx);
CalcInputGrad(context, dout, true, true, x, false, true, dy);
} else {
CalcInputGrad(context, dout, false, false, y, true, false, dx);
CalcInputGrad(context, x, true, true, dout, false, true, dy);
}
if (dx) {
if (dx_dims != x.dims()) {
dx->Resize(dx_dims);
}
}
if (dy) {
if (dy_dims != y.dims()) {
dy->Resize(dy_dims);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
matmul, ops::MatMulXPUKernel<paddle::platform::XPUDeviceContext, float>);
REGISTER_OP_XPU_KERNEL(
matmul_grad,
ops::MatMulGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
#endif
......@@ -14,11 +14,11 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/mul_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/operators/mul_op.h"
namespace paddle {
namespace operators {
......
......@@ -42,21 +42,21 @@ class MVOp : public framework::OperatorWithKernel {
OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv");
auto dim_x = context->GetInputDim("X");
auto dim_y = context->GetInputDim("Vec");
auto dim_vec = context->GetInputDim("Vec");
PADDLE_ENFORCE_EQ(
dim_x.size(), 2,
platform::errors::InvalidArgument(
"The rank of input X should be 2, but is %d", dim_x.size()));
PADDLE_ENFORCE_EQ(
dim_y.size(), 1,
dim_vec.size(), 1,
platform::errors::InvalidArgument(
"The rank of input Vec should be 1, but is %d", dim_y.size()));
PADDLE_ENFORCE_EQ(dim_x[1] == dim_y[0], true,
"The rank of input Vec should be 1, but is %d", dim_vec.size()));
PADDLE_ENFORCE_EQ(dim_x[1], dim_vec[0],
platform::errors::InvalidArgument(
"The length of input X' second dim should equal the "
"length of input Vec,"
" but X[%d, %d], Vec[%d]",
dim_x[0], dim_x[1], dim_y[0]));
"X's second dimension is expected to be equal to "
"Vec's first dimension"
"but recieved X'shape = [%s], Vec's shape = [%s]",
dim_x, dim_vec));
framework::DDim dim_out = framework::make_ddim({dim_x[0]});
......
......@@ -19,8 +19,8 @@ namespace paddle {
namespace operators {
template <typename T>
__global__ void MVGradCUDAKernel(const int m, const int n, const T *dout,
const T *vec, T *dx) {
__global__ void MVGradDxCUDAKernel(const int m, const int n, const T *dout,
const T *vec, T *dx) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < m * n; idx += blockDim.x * gridDim.x) {
int i = idx / n;
......@@ -52,32 +52,31 @@ class MVGradKernel<platform::CUDADeviceContext, T>
int m = dim_x[0];
int n = dim_x[1];
dx->Resize(framework::make_ddim({m * n}));
// get data ptr
const T *x_data = x->data<T>();
const T *vec_data = vec->data<T>();
const T *dout_data = dout->data<T>();
T *dx_data = dx->mutable_data<T>(context.GetPlace());
T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
auto &dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
// calculate dx
auto stream = context.cuda_device_context().stream();
auto config = GetGpuLaunchConfig1D(dev_ctx, m * n);
MVGradCUDAKernel<
T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
m, n, dout_data, vec_data, dx_data);
dx->Resize(framework::make_ddim({m, n}));
if (dx) {
T *dx_data = dx->mutable_data<T>(context.GetPlace());
MVGradDxCUDAKernel<
T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
m, n, dout_data, vec_data, dx_data);
}
if (dvec) {
T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
// calculate dvec
blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
static_cast<T>(0), dvec_data);
blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
static_cast<T>(0), dvec_data);
}
}
};
......
......@@ -74,30 +74,30 @@ class MVGradKernel : public framework::OpKernel<T> {
int m = dim_x[0];
int n = dim_x[1];
dx->Resize(framework::make_ddim({m * n}));
// get data ptr
const T *x_data = x->data<T>();
const T *vec_data = vec->data<T>();
const T *dout_data = dout->data<T>();
T *dx_data = dx->mutable_data<T>(context.GetPlace());
T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
auto &dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
if (dx) {
T *dx_data = dx->mutable_data<T>(context.GetPlace());
// calculate dx
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j)
dx_data[i * n + j] = dout_data[i] * vec_data[j];
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
dx_data[i * n + j] = dout_data[i] * vec_data[j];
}
}
}
dx->Resize(framework::make_ddim({m, n}));
if (dvec) {
T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
auto &dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
// calculate dvec
blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
static_cast<T>(0), dvec_data);
blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
static_cast<T>(0), dvec_data);
}
}
};
......
......@@ -24,32 +24,45 @@ class DpsgdOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
"Input(Param) of DpsgdOp should not be null.");
platform::errors::NotFound(
"Input(Param) of DpsgdOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
"Input(Grad) of DpsgdOp should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
"Input(LearningRate) of DpsgdOp should not be null.");
platform::errors::NotFound(
"Input(Grad) of DpsgdOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("LearningRate"), true,
platform::errors::NotFound(
"Input(LearningRate) of DpsgdOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->GetInputsVarType("Param").front(),
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
platform::errors::InvalidArgument(
"The input var's type should be LoDTensor, but the received is %s",
ctx->GetInputsVarType("Param").front()));
PADDLE_ENFORCE_EQ(
ctx->GetInputsVarType("Grad").front(),
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
platform::errors::InvalidArgument(
"The input var's type should be LoDTensor, but the received is %s",
ctx->GetInputsVarType("Grad").front()));
PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
"Output(ParamOut) of DpsgdOp should not be null.");
platform::errors::NotFound(
"Output(ParamOut) of DpsgdOp should not be null."));
auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
"Learning rate should have 1 dimension");
platform::errors::InvalidArgument(
"Learning rate should have 1 dimension. But Received "
"LearningRate's dims [%s].",
framework::product(lr_dims)));
auto param_dims = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Grad"),
"Param and Grad input of DpsgdOp should have same dimension");
platform::errors::InvalidArgument(
"Param and Grad input of DpsgdOp should have same dimension. But "
"received Para's dim [%s] and Grad's dim [%s].",
param_dims, ctx->GetInputDim("Grad")));
ctx->SetOutputDim("ParamOut", param_dims);
}
......
......@@ -28,17 +28,19 @@ class DpsgdOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type()));
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
const auto *grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Grad").front(),
framework::ToTypeName(grad_var->Type()));
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Grad").front(),
framework::ToTypeName(grad_var->Type())));
const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
......
......@@ -40,43 +40,62 @@ class MomentumOp : public framework::OperatorWithKernel {
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(param) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
"Input(grad) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Velocity"),
"Input(velocity) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
"Input(LearningRate) of Momentum should not be null.");
PADDLE_ENFORCE(
ctx->GetInputsVarType("Param").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of Momentum should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"),
"Output(VelocityOut) of Momentum should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
platform::errors::NotFound(
"Input(param) of Momentum should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
platform::errors::NotFound(
"Input(grad) of Momentum should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Velocity"), true,
platform::errors::NotFound(
"Input(velocity) of Momentum should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("LearningRate"), true,
platform::errors::NotFound(
"Input(LearningRate) of Momentum should not be null."));
PADDLE_ENFORCE_EQ(
ctx->GetInputsVarType("Param").front(),
framework::proto::VarType::LOD_TENSOR,
platform::errors::InvalidArgument(
"The input var's type should be LoDTensor, but the received is %s",
ctx->GetInputsVarType("Param").front()));
PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
platform::errors::NotFound(
"Output(ParamOut) of Momentum should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("VelocityOut"), true,
platform::errors::NotFound(
"Output(VelocityOut) of Momentum should not be null."));
auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
"Maybe the Input variable LearningRate has not "
"been initialized. You may need to confirm "
"if you put exe.run(startup_program) "
"after optimizer.minimize function.");
platform::errors::InvalidArgument(
"Maybe the Input variable LearningRate has not "
"been initialized. You may need to confirm "
"if you put exe.run(startup_program) "
"after optimizer.minimize function."));
PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
"Learning_rate should be a scalar");
platform::errors::InvalidArgument(
"Learning_rate should be a scalar. But Received "
"LearningRate's dim [%s]",
framework::product(lr_dims)));
auto param_dim = ctx->GetInputDim("Param");
if (ctx->GetInputsVarType("Grad")[0] ==
framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Grad"),
"Param and Grad input of MomentumOp should have the same dimension.");
platform::errors::InvalidArgument(
"Param and Grad input of MomentumOp should have the same "
"dimension. But received Param's dim [%s] and Grad's dim [%s].",
param_dim, ctx->GetInputDim("Grad")));
PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Velocity"),
"Param and Velocity of MomentumOp should have the same dimension.");
platform::errors::InvalidArgument(
"Param and Velocity of MomentumOp should have the same "
"dimension. But received Param's dim [%s] and Velocity [%s].",
param_dim, ctx->GetInputDim("Velocity")));
}
ctx->SetOutputDim("ParamOut", param_dim);
......@@ -398,10 +417,12 @@ class MomentumOpKernel : public framework::OpKernel<T> {
for_range(functor);
}
} else {
PADDLE_THROW(
string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows "
"gradient, but the received Variable Type is %s",
framework::ToTypeName(grad_var->Type())));
PADDLE_ENFORCE_EQ(false, true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad "
"in MomentumOp. Excepted LodTensor "
"or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
}
}
};
......
......@@ -22,47 +22,75 @@ class RmspropOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(Param) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("MeanSquare"),
"Input(MeanSquare) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
"Input(LearningRate) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
"Input(Grad) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Moment"),
"Input(Moment) of RmspropOp should not be null.");
PADDLE_ENFORCE(
ctx->GetInputsVarType("Param").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(param_out) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
"Output(MomentOut) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"),
"Output(MeanSquareOut) of RmspropOp should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
platform::errors::NotFound(
"Input(Param) of RmspropOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("MeanSquare"), true,
platform::errors::NotFound(
"Input(MeanSquare) of RmspropOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("LearningRate"), true,
platform::errors::NotFound(
"Input(LearningRate) of RmspropOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
platform::errors::NotFound(
"Input(Grad) of RmspropOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Moment"), true,
platform::errors::NotFound(
"Input(Moment) of RmspropOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Param").front(),
framework::proto::VarType::LOD_TENSOR,
platform::errors::InvalidArgument(
"The input var's type in RmspropOp should be "
"LoDTensor, but the received is %s",
ctx->GetInputsVarType("Param").front()));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("ParamOut"), true,
platform::errors::NotFound(
"Output(param_out) of RmspropOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("MomentOut"), true,
platform::errors::NotFound(
"Output(MomentOut) of RmspropOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("MeanSquareOut"), true,
platform::errors::NotFound(
"Output(MeanSquareOut) of RmspropOp should not be null."));
if (ctx->Attrs().Get<bool>("centered")) {
PADDLE_ENFORCE(ctx->HasOutput("MeanGradOut"),
"Output(MeanGradOut) of RmspropOp should not be null.");
PADDLE_ENFORCE_EQ(
ctx->HasOutput("MeanGradOut"), true,
platform::errors::NotFound(
"Output(MeanGradOut) of RmspropOp should not be null."));
}
auto param_dim = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Grad"),
"Param and grad input of RmspropOp should have the same dimension.");
platform::errors::InvalidArgument(
"Param and grad input of RmspropOp should have the same dimension. "
"But received Param's dim [%s] and Grad's dim [%s].",
param_dim, ctx->GetInputDim("Grad")));
PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Moment"),
"Param and Momentum input of RmspropOp "
"should have the same dimension.");
platform::errors::InvalidArgument(
"Param and Momentum input of RmspropOp "
"should have the same dimension. But received "
"Param's dim [%s] and Moment [%s]",
param_dim, ctx->GetInputDim("Moment")));
PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("MeanSquare"),
"Param and Momentum input of RmspropOp "
"should have the same dimension.");
platform::errors::InvalidArgument(
"Param and Momentum input of RmspropOp "
"should have the same dimension. But received "
"Param's dim [%s] and MeanSquare [%s]",
param_dim, ctx->GetInputDim("MeanSquare")));
auto lr_dim = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
"Learning Rate should be a scalar.");
platform::errors::InvalidArgument(
"Learning Rate of RmspropOp should be a scalar. But "
"received LearningRate's dim [%s]",
framework::product(lr_dim)));
ctx->SetOutputDim("ParamOut", param_dim);
ctx->SetOutputDim("MomentOut", param_dim);
......
......@@ -148,11 +148,15 @@ class RmspropOpKernel : public framework::OpKernel<T> {
auto &mom_tensor = *ctx.Input<LoDTensor>("Moment");
PADDLE_ENFORCE_EQ(&p_tensor, param_out,
"Param and ParamOut must be the same Tensor");
platform::errors::InvalidArgument(
"Param and ParamOut must be the same Tensor"));
PADDLE_ENFORCE_EQ(&mom_tensor, moment_out,
"Moment and MomentOut must be the same Tensor");
PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out,
"MeanSquare and MeanSquareOut must be the same Tensor");
platform::errors::InvalidArgument(
"Moment and MomentOut must be the same Tensor"));
PADDLE_ENFORCE_EQ(
&ms_tensor, mean_square_out,
platform::errors::InvalidArgument(
"MeanSquare and MeanSquareOut must be the same Tensor"));
auto &dev_ctx = ctx.template device_context<DeviceContext>();
size_t limit = static_cast<size_t>(ms_tensor.numel());
......@@ -179,8 +183,10 @@ class RmspropOpKernel : public framework::OpKernel<T> {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto mg = EigenVector<T>::Flatten(mg_tensor);
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
"MeanGrad and MeanGradOut must be the same Tensor");
PADDLE_ENFORCE_EQ(
&mg_tensor, mean_grad_out,
platform::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
mg_out.device(place) = rho * mg + (1 - rho) * g;
......@@ -198,8 +204,10 @@ class RmspropOpKernel : public framework::OpKernel<T> {
if (centered) {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
"MeanGrad and MeanGradOut must be the same Tensor");
PADDLE_ENFORCE_EQ(
&mg_tensor, mean_grad_out,
platform::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
for_range(CenteredRmspropFunctor<T, DenseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()),
......@@ -233,8 +241,10 @@ class RmspropOpKernel : public framework::OpKernel<T> {
if (centered) {
auto &mg_tensor = *ctx.Input<LoDTensor>("MeanGrad");
auto *mean_grad_out = ctx.Output<LoDTensor>("MeanGradOut");
PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out,
"MeanGrad and MeanGradOut must be the same Tensor");
PADDLE_ENFORCE_EQ(
&mg_tensor, mean_grad_out,
platform::errors::InvalidArgument(
"MeanGrad and MeanGradOut must be the same Tensor"));
for_range(CenteredRmspropFunctor<T, SparseRmspropGradFunctor<T>>(
param_out->mutable_data<T>(ctx.GetPlace()),
mean_square_out->mutable_data<T>(ctx.GetPlace()),
......@@ -249,7 +259,12 @@ class RmspropOpKernel : public framework::OpKernel<T> {
rho, epsilon, momentum, grad_func));
}
} else {
PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient");
PADDLE_ENFORCE_EQ(false, true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad "
"in RmspropOp. Excepted LodTensor "
"or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
}
}
};
......
......@@ -22,23 +22,31 @@ class SGDOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(Param) of SGDOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
"Input(Grad) of SGDOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
"Input(LearningRate) of SGDOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of SGDOp should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
platform::errors::NotFound(
"Input(Param) of SGDOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("Grad"), true,
platform::errors::NotFound("Input(Grad) of SGDOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
platform::errors::NotFound(
"Input(LearningRate) of SGDOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
platform::errors::NotFound(
"Output(ParamOut) of SGDOp should not be null."));
auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
"Maybe the Input variable LearningRate has not "
"been initialized. You may need to confirm "
"if you put exe.run(startup_program) "
"after optimizer.minimize function.");
platform::errors::NotFound(
"Maybe the Input variable LearningRate has not "
"been initialized. You may need to confirm "
"if you put exe.run(startup_program) "
"after optimizer.minimize function."));
PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
"Learning rate should have 1 element");
platform::errors::InvalidArgument(
"Learning rate should have 1 element. But received "
"LearningRate dims [%s]",
framework::product(lr_dims)));
auto param_dim = ctx->GetInputDim("Param");
if (ctx->GetInputsVarType("Grad")[0] ==
framework::proto::VarType::LOD_TENSOR) {
......
......@@ -57,11 +57,12 @@ class SGDOpKernel<platform::CUDADeviceContext, T>
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type()));
PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
paddle::framework::ToTypeName(param_var->Type())));
auto* param = ctx.Input<framework::Tensor>("Param");
auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
......@@ -91,18 +92,30 @@ class SGDOpKernel<platform::CUDADeviceContext, T>
// TODO(qijun): In Sparse SGD operator, in-place update is enforced.
// This manual optimization brings difficulty to track data dependency.
// It's better to find a more elegant solution.
PADDLE_ENFORCE_EQ(param, param_out);
PADDLE_ENFORCE_EQ(
param, param_out,
platform::errors::InvalidArgument(
"The input tensor Param of SgdOp should be equal with ParamOut "
"if variable's type is SelectedRows."));
auto* grad = ctx.Input<framework::SelectedRows>("Grad");
auto in_height = grad->height();
auto out_dims = param_out->dims();
PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
PADDLE_ENFORCE_EQ(in_height, out_dims[0],
platform::errors::InvalidArgument(
"The input tensor Grad's height of SgdOp should be "
"equal with ParamOut's dims. But received Grad's "
"height [%s] and ParamOut's dims [%s]",
in_height, out_dims[0]));
auto& in_value = grad->value();
auto& in_rows = grad->rows();
int64_t in_row_numel = in_value.numel() / in_rows.size();
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height,
platform::errors::InvalidArgument(
"The in_row_numel of SgdOp should be equal with "
"param_out's numel / in_height."));
auto* in_data = in_value.data<T>();
auto* out_data = param_out->data<T>();
......@@ -118,7 +131,12 @@ class SGDOpKernel<platform::CUDADeviceContext, T>
out_data, in_row_numel, in_rows.size());
} else {
PADDLE_THROW("Unsupported Variable Type of Grad");
PADDLE_ENFORCE_EQ(false, true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad "
"in SgdOp. Excepted LodTensor or "
"SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
}
}
};
......
......@@ -44,8 +44,20 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
if (grad_var->IsType<framework::LoDTensor>()) {
const auto *grad = ctx.Input<framework::Tensor>("Grad");
auto sz = param_out->numel();
PADDLE_ENFORCE_EQ(param->numel(), sz);
PADDLE_ENFORCE_EQ(grad->numel(), sz);
PADDLE_ENFORCE_EQ(param->numel(), sz,
platform::errors::InvalidArgument(
"The input tensor Param's numel of SgdOp "
"should be equal with ParamOut's numel. "
"But received Param's "
"numel = [%s], ParamOut's numel = [%s]",
param->numel(), sz));
PADDLE_ENFORCE_EQ(grad->numel(), sz,
platform::errors::InvalidArgument(
"The input tensor Grad's numel of SgdOp "
"should be equal with ParamOut's numel. "
"But received Grad's "
"numel = [%s], ParamOut's numel = [%s]",
grad->numel(), sz));
jit::sgd_attr_t attr(1, sz, 1, sz, 1);
const T *lr = learning_rate->data<T>();
......@@ -62,7 +74,11 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
// TODO(qijun): In Sparse SGD operator, in-place update is enforced.
// This manual optimization brings difficulty to track data dependency.
// It's better to find a more elegant solution.
PADDLE_ENFORCE_EQ(param, param_out);
PADDLE_ENFORCE_EQ(param, param_out,
platform::errors::InvalidArgument(
"The input tensor Param of SgdOp "
"should be equal with ParamOut if variable's "
"type is SelectedRows. "));
const auto *grad = ctx.Input<framework::SelectedRows>("Grad");
auto &grad_rows = grad->rows();
......@@ -73,7 +89,13 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
}
auto out_dims = param_out->dims();
PADDLE_ENFORCE_EQ(grad->height(), out_dims[0]);
PADDLE_ENFORCE_EQ(
grad->height(), out_dims[0],
platform::errors::InvalidArgument(
"The input tensor Grad's height of SgdOp "
"should be equal with ParamOut's dims. But received Grad's "
"height [%s] and ParamOut's dims [%s]",
grad->height(), out_dims[0]));
auto &grad_value = grad->value();
const T *param_data = param->data<T>();
const T *grad_data = grad_value.data<T>();
......@@ -87,19 +109,31 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
attr.grad_height = grad_rows.size(); // note: it is not grad->height()
attr.grad_width = grad_value.numel() / attr.grad_height;
attr.selected_rows_size = grad_rows.size();
PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width);
PADDLE_ENFORCE_EQ(
attr.grad_width, attr.param_width,
platform::errors::InvalidArgument(
"The grad_value's numel of SgdOp "
"should be equal with param_out's numel. But received "
"grad_value's numel [%s] and param_out's numel [%s]",
attr.grad_width, attr.param_width));
auto sgd =
jit::KernelFuncs<jit::SgdTuple<T>, platform::CPUPlace>::Cache().At(
attr);
sgd(lr, param_data, grad_data, rows_data, out_data, &attr);
} else {
PADDLE_THROW("Unsupported Variable Type of Grad");
PADDLE_ENFORCE_EQ(
false, true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad in SgdOp. Excepted "
"LodTensor or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
}
} else if (param_var->IsType<framework::SelectedRows>()) {
PADDLE_ENFORCE(grad_var->IsType<framework::SelectedRows>(),
"when param "
"is SelectedRows, gradient should also be SelectedRows");
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::SelectedRows>(), true,
platform::errors::InvalidArgument(
"when param is SelectedRows, "
"gradient should also be SelectedRows"));
const auto &param = param_var->Get<framework::SelectedRows>();
auto *param_out = ctx.Output<framework::SelectedRows>("ParamOut");
const auto &grad = grad_var->Get<framework::SelectedRows>();
......@@ -112,27 +146,36 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
auto param_row_width = param.value().dims()[1];
auto grad_row_width = grad.value().dims()[1];
VLOG(4) << " param rows: " << param.rows().size()
<< " param memory rows: " << param.value().dims()[0]
<< " grad rows: " << grad.rows().size()
<< " grad memory rows: " << grad.value().dims()[0];
PADDLE_ENFORCE_EQ(param_row_width, grad_row_width,
"param_row should have the same size with grad_row");
PADDLE_ENFORCE_EQ(
param_row_width, grad_row_width,
platform::errors::InvalidArgument(
"The param_row in SgdOP should have the same size with grad_row. "
"But received param_row's width is [%s], and grad_row's width is "
"[%s]",
param_row_width, grad_row_width));
const auto *lr = learning_rate->data<T>();
const auto *grad_data = grad.value().data<T>();
auto *out_data = param_out->mutable_value()->data<T>();
for (size_t i = 0; i < grad.rows().size(); i++) {
int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false);
PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0),
"id should be in the table");
PADDLE_ENFORCE_GE(
id_index, static_cast<int64_t>(0),
platform::errors::InvalidArgument(
"The id in SgdOp should be >= 0. But recevied id_index is [%s]",
id_index));
for (int64_t j = 0; j < grad_row_width; j++) {
out_data[id_index * grad_row_width + j] -=
lr[0] * grad_data[i * grad_row_width + j];
}
}
} else {
PADDLE_THROW("Unsupported Variable Type of Parameter");
PADDLE_ENFORCE_EQ(
false, true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Parameter in SgdOp. Excepted "
"LodTensor or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(param_var->Type())));
}
}
};
......
......@@ -45,8 +45,10 @@ template <typename T>
class PoolCUDNNOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
"It must use CUDAPlace.");
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument("Pool operator CUDA kernel must use "
"CUDAPlace rather than CPUPlace."));
const Tensor *input = ctx.Input<Tensor>("X");
Tensor *output = ctx.Output<Tensor>("Out");
......@@ -175,8 +177,10 @@ template <typename T>
class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
"It must use CUDAPlace.");
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::InvalidArgument("Pool operator CUDA kernel must use "
"CUDAPlace rather than CPUPlace."));
const Tensor *input = ctx.Input<Tensor>("X");
const Tensor *output = ctx.Input<Tensor>("Out");
......
......@@ -38,18 +38,22 @@ int PoolOutputSize(int input_size, int filter_size, int padding_1,
}
PADDLE_ENFORCE_GT(
output_size, 0,
"ShapeError: the output size must be greater than 0. But received: "
"output_size = %d due to the settings of input_size(%d), padding(%d,%d), "
"k_size(%d) and stride(%d). Please check again!",
output_size, input_size, padding_1, padding_2, filter_size, stride);
platform::errors::InvalidArgument(
"the output size must be greater than 0. But received: "
"output_size = %d due to the settings of input_size(%d), "
"padding(%d,%d), "
"k_size(%d) and stride(%d). Please check again!",
output_size, input_size, padding_1, padding_2, filter_size, stride));
return output_size;
}
void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
"X(Input) of Pooling should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
"Out(Output) of Pooling should not be null.");
PADDLE_ENFORCE_EQ(
ctx->HasInput("X"), true,
platform::errors::NotFound("Input(X) of Pool operator is not found."));
PADDLE_ENFORCE_EQ(
ctx->HasOutput("Out"), true,
platform::errors::NotFound("Output(Out) of Pool operator is not found."));
std::string pooling_type = ctx->Attrs().Get<std::string>("pooling_type");
std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
......@@ -65,28 +69,32 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
auto in_x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(
in_x_dims.size() == 4 || in_x_dims.size() == 5, true,
"ShapeError: the input of Op(pool) should be 4-D or 5-D Tensor. But "
"received: %u-D Tensor and it's shape is [%s].",
in_x_dims.size(), in_x_dims);
platform::errors::InvalidArgument(
"the input of Op(pool) should be 4-D or 5-D Tensor. But "
"received: %u-D Tensor and it's shape is [%s].",
in_x_dims.size(), in_x_dims));
PADDLE_ENFORCE_EQ(
in_x_dims.size() - ksize.size(), 2U,
"ShapeError: the dimension of input minus the size of "
"Attr(ksize) must be euqal to 2 in Op(pool). "
"But received: the dimension of input minus the size "
"of Attr(ksize) is %d, the "
"input's dimension is %d, the shape of input "
"is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].",
in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims,
ksize.size(), framework::make_ddim(ksize));
PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
"ShapeError: the size of Attr(ksize) and Attr(strides) in "
"Op(pool) must be equal. "
"But received: Attr(ksize)'s size is %d, Attr(strides)'s "
"size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].",
ksize.size(), strides.size(), framework::make_ddim(ksize),
framework::make_ddim(strides));
platform::errors::InvalidArgument(
"the dimension of input minus the size of "
"Attr(ksize) must be euqal to 2 in Op(pool). "
"But received: the dimension of input minus the size "
"of Attr(ksize) is %d, the "
"input's dimension is %d, the shape of input "
"is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].",
in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims,
ksize.size(), framework::make_ddim(ksize)));
PADDLE_ENFORCE_EQ(
ksize.size(), strides.size(),
platform::errors::InvalidArgument(
"the size of Attr(ksize) and Attr(strides) in "
"Op(pool) must be equal. "
"But received: Attr(ksize)'s size is %d, Attr(strides)'s "
"size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].",
ksize.size(), strides.size(), framework::make_ddim(ksize),
framework::make_ddim(strides)));
// MKL-DNN Kernels are using NCHW order of dims description
// so we ignore data_format consideration for MKL-DNN kernel
......@@ -182,9 +190,12 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar(
}
void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) must not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::NotFound(
"Input(X) of Pool Gradoperator is not found."));
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
"Input(X@GRAD) should not be null.");
platform::errors::NotFound(
"Input(X@GRAD) of Pool Gradoperator is not found."));
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
......@@ -210,7 +221,8 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
"float16 can only be used when CUDNN is used");
platform::errors::InvalidArgument(
"Float16 can only be used when CUDNN is used"));
}
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
library_);
......
......@@ -81,9 +81,11 @@ inline void UpdatePadding(std::vector<T>* paddings, const bool global_pooling,
paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
}
} else {
PADDLE_ENFORCE_EQ(
data_dims.size() * 2, paddings->size(),
"Paddings size should be the same or twice as the pooling size.");
PADDLE_ENFORCE_EQ(data_dims.size() * 2, paddings->size(),
platform::errors::InvalidArgument(
"Paddings size %d should be the same or twice as the "
"pooling size %d.",
paddings->size(), data_dims.size() * 2));
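// Note (illustrative, based on the check above): for 2-D pooling data_dims
// holds (H, W), so a valid explicit paddings attribute has either 2 entries
// (one per spatial dimension, duplicated to both sides) or 4 entries
// (both sides of each dimension given explicitly).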
}
// when padding_algorithm is "VALID" or "SAME"
......@@ -200,7 +202,10 @@ class PoolKernel : public framework::OpKernel<T> {
pool_process, exclusive, adaptive, out);
}
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Pool op only supports 2D and 3D input."));
}
}
}
};
......@@ -287,7 +292,10 @@ class PoolGradKernel : public framework::OpKernel<T> {
adaptive, in_x_grad);
}
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Pool op only supports 2D and 3D input."));
}
}
}
}
......
......@@ -46,8 +46,11 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
bool adaptive = ctx->Attrs().Get<bool>("adaptive");
PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
"Pooling intput should be 4-D or 5-D tensor.");
PADDLE_ENFORCE(
in_x_dims.size() == 4 || in_x_dims.size() == 5,
platform::errors::InvalidArgument("Pooling intput should be 4-D or 5-D "
"tensor but received %dD-Tensor",
in_x_dims.size()));
if (ctx->Attrs().Get<bool>("global_pooling")) {
ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
......@@ -57,16 +60,21 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
}
}
PADDLE_ENFORCE_EQ(in_x_dims.size() - ksize.size(), 2U,
platform::errors::InvalidArgument(
"Input size and pooling size should be consistent."));
PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
platform::errors::InvalidArgument(
"Strides size and pooling size should be the same."));
PADDLE_ENFORCE_EQ(
in_x_dims.size() - ksize.size(), 2U,
platform::errors::InvalidArgument(
"The input size %d minus the kernel size %d should equal to 2.",
in_x_dims.size(), ksize.size()));
PADDLE_ENFORCE_EQ(
ksize.size(), strides.size(),
platform::errors::InvalidArgument(
"Strides size %d and pooling size %d should be the same.",
strides.size(), ksize.size()));
PADDLE_ENFORCE_EQ(
ksize.size(), paddings.size(),
platform::errors::InvalidArgument(
"Paddings size and pooling size should be the same."));
"Paddings size %d and pooling size %d should be the same.",
paddings.size(), ksize.size()));
std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
if (adaptive) {
......
......@@ -61,7 +61,10 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out,
mask);
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Pool op only supports 2D and 3D input."));
}
}
}
};
......@@ -106,7 +109,10 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides,
paddings, adaptive, in_x_grad);
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Pool op only supports 2D and 3D input."));
}
}
}
}
......
......@@ -176,22 +176,31 @@ class GPUPSROIPoolOpKernel : public framework::OpKernel<T> {
int height = in_dims[2];
int width = in_dims[3];
PADDLE_ENFORCE_EQ(input_channels,
output_channels * pooled_height * pooled_width,
"the channels of input X should equal the product of "
"output_channels x pooled_height x pooled_width");
PADDLE_ENFORCE_EQ(
input_channels, output_channels * pooled_height * pooled_width,
platform::errors::InvalidArgument(
"The channels %d of input X should equal the product of "
"output_channels %d x pooled_height %d x pooled_width %d.",
input_channels, output_channels, pooled_height, pooled_width));
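// Illustrative check (hypothetical numbers, not from this kernel): with
// output_channels = 10 and a 7 x 7 pooled output, input X must carry
// 10 * 7 * 7 = 490 channels for PSROIPool to be well-formed.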
int rois_num = rois->dims()[0];
if (rois_num == 0) return;
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
PADDLE_ENFORCE_EQ(
rois_batch_size, batch_size,
"The rois_batch_size and input(X) batch_size must be the same.");
PADDLE_ENFORCE_EQ(rois_batch_size, batch_size,
platform::errors::InvalidArgument(
"The batch size of input(ROIs) and input(X) must be "
"the same but received batch size of input(ROIs) and "
"input(X) is %d and %d respectively.",
rois_batch_size, batch_size));
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
"The rois_num from input and lod must be the same.");
platform::errors::InvalidArgument(
"The number of rois from input(ROIs) and its LOD "
"must be the same. Received rois %d of input(ROIs) "
"but the number of rois %d from its LOD is %d",
rois_num, rois_num_with_lod));
// set rois batch id
framework::Tensor rois_batch_id_list;
......
......@@ -160,9 +160,14 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
if (ctx.HasInput("RoisNum")) {
auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
int rois_batch_size = rois_num_t->numel();
PADDLE_ENFORCE_EQ(
rois_batch_size, batch_size,
"The rois_batch_size and imgs batch_size must be the same.");
platform::errors::InvalidArgument(
"The batch size of input(ROIs) and input(X) must be the same but "
"received batch size of input(ROIs) and input(X) is %d and %d "
"respectively.",
rois_batch_size, batch_size));
std::vector<int> rois_num_list(rois_batch_size);
memory::Copy(cplace, rois_num_list.data(), gplace,
rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
......@@ -178,10 +183,19 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
int rois_batch_size = rois_lod.size() - 1;
PADDLE_ENFORCE_EQ(
rois_batch_size, batch_size,
"The rois_batch_size and imgs batch_size must be the same.");
platform::errors::InvalidArgument(
"The batch size of input(ROIs) and input(X) must be the same but "
"received batch size of input(ROIs) and input(X) is %d and %d "
"respectively.",
rois_batch_size, batch_size));
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
"The rois_num from input and lod must be the same.");
platform::errors::InvalidArgument(
"The number of rois from input(ROIs) and its LOD "
"must be the same. Received rois %d of input(ROIs) "
"but the number of rois %d from its LOD is %d",
rois_num, rois_num_with_lod));
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
......
......@@ -103,13 +103,13 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
"Target sequence length for Label when Label is a 2-D tensor.")
.AsDispensable();
AddOutput("WarpCTCGrad",
"(Tensor, default: Tensor<float>), a temporary "
"(Tensor), a temporary "
"output Tensor to store the gradients of warp-ctc, which is "
"computed with loss together in one call. It is a 3-D Tensor of "
"the shape [max_sequence_length, batch_size, num_classes + 1].")
.AsIntermediate();
AddOutput("Loss",
"(Tensor, default: Tensor<float>), the Connectionist "
"(Tensor), the Connectionist "
"Temporal Classification (CTC) loss, which is a 2-D Tensor of "
"the shape [batch_size, 1]");
AddAttr<int>("blank",
......@@ -197,7 +197,9 @@ REGISTER_OPERATOR(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker,
REGISTER_OPERATOR(warpctc_grad, ops::WarpCTCGradOp,
ops::WarpCTCGradOpNoNeedBufferVarInferer);
REGISTER_OP_CPU_KERNEL(
warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>);
warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>,
ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
warpctc_grad,
ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, float>);
ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, double>);
......@@ -16,7 +16,9 @@ limitations under the License. */
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
warpctc, ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, float>);
warpctc, ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, float>,
ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
warpctc_grad,
ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, float>);
ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, double>);
......@@ -27,7 +27,52 @@ namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
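// ComputeCtcLossFunctor dispatches the warp-ctc call by element type: the
// float specialization forwards to compute_ctc_loss, the double
// specialization forwards to compute_ctc_loss_double, and the unspecialized
// template reports CTC_STATUS_EXECUTION_FAILED for any other type.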
template <typename DeviceContext, typename T>
class ComputeCtcLossFunctor {
public:
ctcStatus_t operator()(const T* const activations, T* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths, int alphabet_size,
int minibatch, T* costs, void* workspace,
ctcOptions options) {
return CTC_STATUS_EXECUTION_FAILED;
}
};
template <typename DeviceContext>
class ComputeCtcLossFunctor<DeviceContext, float> {
public:
ctcStatus_t operator()(const float* const activations, float* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths, int alphabet_size,
int minibatch, float* costs, void* workspace,
ctcOptions options) {
return platform::dynload::compute_ctc_loss(
activations, gradients, flat_labels, label_lengths, input_lengths,
static_cast<int>(alphabet_size), static_cast<int>(minibatch), costs,
workspace, options);
}
};
template <typename DeviceContext>
class ComputeCtcLossFunctor<DeviceContext, double> {
public:
ctcStatus_t operator()(const double* const activations, double* gradients,
const int* const flat_labels,
const int* const label_lengths,
const int* const input_lengths, int alphabet_size,
int minibatch, double* costs, void* workspace,
ctcOptions options) {
return platform::dynload::compute_ctc_loss_double(
activations, gradients, flat_labels, label_lengths, input_lengths,
static_cast<int>(alphabet_size), static_cast<int>(minibatch), costs,
workspace, options);
}
};
template <typename DeviceContext, typename T>
class WarpCTCFunctor {
public:
/*
......@@ -51,21 +96,29 @@ class WarpCTCFunctor {
* \param blank blank label used in ctc loss function.
* \param cpu_loss cost of each sequence in CPU memory.
*/
void operator()(const framework::ExecutionContext& ctx, const float* input,
float* gradient, const int* cpu_labels,
void operator()(const framework::ExecutionContext& ctx, const T* input,
T* gradient, const int* cpu_labels,
const int* cpu_label_lengths, const int* cpu_input_lengths,
const size_t sequence_width, const size_t num_sequences,
const size_t blank, float* cpu_loss) {
const size_t blank, T* cpu_loss) {
// Init warp-ctc options
init(ctx, blank);
// Compute the required workspace size.
// There is no memory allocated operations within warp-ctc.
size_t workspace_bytes = 0;
ctcStatus_t status = platform::dynload::get_workspace_size(
cpu_label_lengths, cpu_input_lengths, static_cast<int>(sequence_width),
static_cast<int>(num_sequences), options_, &workspace_bytes);
ctcStatus_t status = CTC_STATUS_UNKNOWN_ERROR;
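// Select the single- or double-precision workspace query by the element
// size of T: a 4-byte T (float) uses get_workspace_size, otherwise
// get_workspace_size_double is used.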
if (sizeof(T) == 4) {
status = platform::dynload::get_workspace_size(
cpu_label_lengths, cpu_input_lengths,
static_cast<int>(sequence_width), static_cast<int>(num_sequences),
options_, &workspace_bytes);
} else {
status = platform::dynload::get_workspace_size_double(
cpu_label_lengths, cpu_input_lengths,
static_cast<int>(sequence_width), static_cast<int>(num_sequences),
options_, &workspace_bytes);
}
PADDLE_ENFORCE_EQ(
CTC_STATUS_SUCCESS, status,
platform::errors::PreconditionNotMet(
......@@ -79,17 +132,17 @@ class WarpCTCFunctor {
workspace_bytes));
auto& dev_ctx = ctx.template device_context<DeviceContext>();
size_t workspace_elements = workspace_bytes / sizeof(float) + 1UL;
Tensor workspace = ctx.AllocateTmpTensor<float, DeviceContext>(
size_t workspace_elements = workspace_bytes / sizeof(T) + 1UL;
Tensor workspace = ctx.AllocateTmpTensor<T, DeviceContext>(
framework::make_ddim({static_cast<int64_t>(workspace_elements)}),
dev_ctx);
float* workspace_data = workspace.data<float>();
math::SetConstant<DeviceContext, float>()(
T* workspace_data = workspace.data<T>();
math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), &workspace,
static_cast<float>(0));
static_cast<T>(0));
// compute loss and gradient
status = platform::dynload::compute_ctc_loss(
status = ComputeCtcLossFunctor<DeviceContext, T>()(
input, gradient, cpu_labels, cpu_label_lengths, cpu_input_lengths,
static_cast<int>(sequence_width), static_cast<int>(num_sequences),
cpu_loss, workspace_data, options_);
......@@ -112,7 +165,8 @@ class WarpCTCFunctor {
ctx.device_context())
.stream();
#else
PADDLE_THROW("[warpctc init] GPU is not enabled.");
PADDLE_THROW(platform::errors::PreconditionNotMet(
"[warpctc init] GPU is not enabled."));
#endif
} else {
options_.loc = CTC_CPU;
......@@ -292,7 +346,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
const size_t blank = static_cast<size_t>(ctx.Attr<int>("blank"));
WarpCTCFunctor<DeviceContext>()(
WarpCTCFunctor<DeviceContext, T>()(
ctx, warpctc_logits_data, warpctc_grad_data, warpctc_label_data,
warpctc_label_lengths.data(), warpctc_logits_lengths.data(),
sequence_width, num_sequences, blank, warpctc_loss_data);
......
......@@ -53,7 +53,9 @@ extern void* warpctc_dso_handle;
__macro(get_warpctc_version); \
__macro(ctcGetStatusString); \
__macro(compute_ctc_loss); \
__macro(get_workspace_size)
__macro(compute_ctc_loss_double); \
__macro(get_workspace_size); \
__macro(get_workspace_size_double)
WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP);
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/xpu_info.h"
TEST(InitDevices, CPU) {
using paddle::framework::InitDevices;
......
......@@ -15,9 +15,36 @@
#pragma once
#ifdef PADDLE_WITH_XPU
#include <string>
#include <unordered_map>
#include "paddle/fluid/platform/errors.h"
#include "xpu/api.h"
#include "xpu/runtime.h"
#include "xpu/runtime_ex.h"
namespace xpu = baidu::xpu::api;
class XPUActHelper {
public:
// Convert string to activation type in xpu
static xpu::Activation_t ConvertToXpuActType(
const std::string& act_type_str) {
static std::unordered_map<std::string, xpu::Activation_t> str2act = {
{"linear", xpu::Activation_t::LINEAR},
{"relu", xpu::Activation_t::RELU},
{"sigmoid", xpu::Activation_t::SIGMOID},
{"tanh", xpu::Activation_t::TANH},
{"gelu", xpu::Activation_t::GELU},
{"leaky_relu", xpu::Activation_t::LEAKY_RELU},
{"sqrt", xpu::Activation_t::SQRT},
{"square", xpu::Activation_t::SQUARE}};
auto res = str2act.find(act_type_str);
PADDLE_ENFORCE_NE(res, str2act.end(),
paddle::platform::errors::InvalidArgument(
"Invalid activation type(%s) in XPU", act_type_str));
return res->second;
}
};
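// Hypothetical call site (for illustration only):
//   auto act = XPUActHelper::ConvertToXpuActType("relu");
// An unknown string such as "swish" would trigger the InvalidArgument
// error above.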
#endif
......@@ -48,6 +48,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{"collect_fpn_proposals",
{"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}},
{"distribute_fpn_proposals", {"FpnRois", "RoisNum"}},
{"warpctc", {"Logits", "Label", "LogitsLength", "LabelLength"}},
};
// NOTE(zhiqiu): Like op_ins_map.
......
......@@ -230,7 +230,6 @@ from .framework import CPUPlace #DEFINE_ALIAS
from .framework import CUDAPlace #DEFINE_ALIAS
from .framework import CUDAPinnedPlace #DEFINE_ALIAS
from .framework import to_variable #DEFINE_ALIAS
from .framework import grad #DEFINE_ALIAS
from .framework import no_grad #DEFINE_ALIAS
from .framework import save #DEFINE_ALIAS
......@@ -258,6 +257,8 @@ from .tensor.stat import numel #DEFINE_ALIAS
from .device import get_cudnn_version
from .device import set_device
from .device import get_device
from .device import is_compiled_with_xpu
from .device import XPUPlace
# from .tensor.tensor import Tensor #DEFINE_ALIAS
# from .tensor.tensor import LoDTensor #DEFINE_ALIAS
# from .tensor.tensor import LoDTensorArray #DEFINE_ALIAS
......
......@@ -22,7 +22,9 @@ from paddle.fluid.dygraph.parallel import ParallelEnv
__all__ = [
'get_cudnn_version',
'set_device',
'get_device'
'get_device',
'XPUPlace',
'is_compiled_with_xpu'
# 'cpu_places',
# 'CPUPlace',
# 'cuda_pinned_places',
......@@ -35,6 +37,37 @@ __all__ = [
_cudnn_version = None
def is_compiled_with_xpu():
"""
Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun
Returns (bool): whether paddle was built with WITH_XPU=ON
Examples:
.. code-block:: python
import paddle
support_xpu = paddle.device.is_compiled_with_xpu()
"""
return core.is_compiled_with_xpu()
def XPUPlace(dev_id):
"""
Return a Baidu Kunlun Place
Parameters:
dev_id(int): Baidu Kunlun device id
Examples:
.. code-block:: python
import paddle
place = paddle.device.XPUPlace(0)
"""
return core.XPUPlace(dev_id)
def get_cudnn_version():
"""
This function returns the version of cudnn. The return value is an int which represents the
......
......@@ -30,6 +30,7 @@ __all__ = [
]
fleet = Fleet()
_final_strategy = fleet._final_strategy
init = fleet.init
is_first_worker = fleet.is_first_worker
worker_index = fleet.worker_index
......
......@@ -1244,8 +1244,7 @@ class DistributedStrategy(object):
if getattr(self.strategy, f.name):
draws += border + "\n"
draws += h1_format.format(
"{} = True, please check {}_configs".format(
f.name, f.name))
"{}=True <-> {}_configs".format(f.name, f.name))
draws += line + "\n"
my_configs = getattr(self.strategy,
f.name + "_configs")
......
......@@ -119,6 +119,8 @@ class Fleet(object):
self.strategy_compiler = None
self._is_collective = False
self._runtime_handle = None
self._util = None
self._context = {}
def init(self, role_maker=None, is_collective=False):
"""
......@@ -233,7 +235,7 @@ class Fleet(object):
Returns:
int: worker numbers
Examples:
.. code-block:: python
......@@ -569,8 +571,9 @@ class Fleet(object):
if strategy == None:
strategy = DistributedStrategy()
self.user_defined_strategy = strategy
self.valid_strategy = None
self._user_defined_strategy = copy.deepcopy(strategy)
self._context = {}
return self
@dygraph_only
......@@ -909,6 +912,15 @@ class Fleet(object):
# imitate target optimizer retrieval
return self.user_defined_optimizer.clear_grad()
def _final_strategy(self):
if "valid_strategy" not in self._context:
print(
"WARNING: You may need to call minimize function before this function is called"
)
return {}
else:
return self._context["valid_strategy"]
def minimize(self,
loss,
startup_program=None,
......@@ -958,12 +970,15 @@ class Fleet(object):
# for more examples, please reference https://github.com/PaddlePaddle/FleetX
"""
context = {}
context["user_defined_strategy"] = copy.deepcopy(
self._user_defined_strategy)
if paddle.fluid.framework.in_dygraph_mode():
# imitate target optimizer retrieval
target_opt = self.user_defined_optimizer
self._context = context
return target_opt.minimize(loss)
context = {}
# cache original feed forward program
self.origin_main_program = loss.block.program
context["origin_main_program"] = self.origin_main_program
......@@ -984,17 +999,19 @@ class Fleet(object):
MetaOptimizerFactory()._get_valid_meta_optimizers(
self.user_defined_optimizer)
context["user_defined_strategy"] = copy.copy(self.user_defined_strategy)
context["user_defined_strategy"] = copy.deepcopy(
self._user_defined_strategy)
copy_user_defined_strategy = copy.deepcopy(self._user_defined_strategy)
# trigger the auto-parallel in very strict condition
# strategy = DistributedStrategy()
# strategy.auto = True
# optimizer = paddle.optimizer.SGD(learning_rate=0.1)
# optimizer = fleet.distributed_optimizer(optimizer, strategy)
if self.user_defined_strategy._is_strict_auto():
if copy_user_defined_strategy._is_strict_auto():
# turn on all the strategy for each optimizer
for opt in distributed_optimizer_list:
opt._enable_strategy(self.user_defined_strategy, context)
opt._enable_strategy(copy_user_defined_strategy, context)
valid_optimizer_list = []
valid_graph_optimizer_list = []
......@@ -1003,7 +1020,7 @@ class Fleet(object):
for opt in distributed_optimizer_list:
opt._set_basic_info(loss, self._role_maker,
self.user_defined_optimizer,
self.user_defined_strategy)
copy_user_defined_strategy)
if opt._can_apply() and not opt._is_graph_out():
valid_optimizer_list.append(opt)
elif opt._can_apply() and opt._is_graph_out():
......@@ -1014,13 +1031,15 @@ class Fleet(object):
meta_optimizer, graph_optimizer = \
self.strategy_compiler.generate_optimizer(
loss, self._role_maker, self.user_defined_optimizer,
self.user_defined_strategy, valid_optimizer_list,
copy_user_defined_strategy, valid_optimizer_list,
valid_graph_optimizer_list)
valid_strategy = self.strategy_compiler._get_valid_strategy(
self.user_defined_strategy, can_not_apply_optimizer_list)
copy_user_defined_strategy, can_not_apply_optimizer_list)
context["valid_strategy"] = copy.deepcopy(valid_strategy)
context["valid_strategy"] = valid_strategy
self._context = context
self.valid_strategy = valid_strategy
self.valid_strategy._enable_env()
......
......@@ -495,7 +495,7 @@ class RoleMakerBase(object):
Returns:
string: all heter_trainers'endpoints
"""
assert self._heter_trainer_endpoints != []
assert self._heter_trainer_endpoints != [], "Heter Worker Endpoints Not initialized"
return self._heter_trainer_endpoints
def _get_heter_worker_endpoint(self):
......@@ -505,10 +505,10 @@ class RoleMakerBase(object):
e.g: if we have 4 cpu-trainer(default), 2 gpu-trainer(heter)
then No.0 and No.2 cpu-trainer will work with No.0 gpu-trainer
and No.1 and No.3 cpu-trainer will work with No.1 gpu-trainerr
and No.1 and No.3 cpu-trainer will work with No.1 gpu-trainer
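i.e. the pairing is cpu-trainer id % heter worker num, so with 4
cpu-trainers and 2 heter workers the ids 0, 1, 2, 3 map to heter
workers 0, 1, 0, 1 respectively.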
"""
assert self._heter_trainer_endpoints != []
return self._heter_trainer_endpoints[(self._current_id + 1) %
assert self._heter_trainer_endpoints != [], "Heter Worker Endpoints Not initialized"
return self._heter_trainer_endpoints[(self._current_id) %
self._heter_worker_num()]
def _get_heter_worker_device(self):
......
......@@ -23,6 +23,7 @@ from paddle.fluid.executor import Executor
from paddle.fluid.parallel_executor import ParallelExecutor
from .runtime_base import RuntimeBase
from ..base.private_helper_function import wait_server_ready
class ParameterServerRuntime(RuntimeBase):
......@@ -94,8 +95,8 @@ class ParameterServerRuntime(RuntimeBase):
return False
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == core.VarDesc.VarType.READER:
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == core.VarDesc.VarType.READER:
return False
return var.persistable
......@@ -161,6 +162,17 @@ class ParameterServerRuntime(RuntimeBase):
trainer_config = self.async_strategy.get_trainer_runtime_config()
dist_strategy = self.context["valid_strategy"]
launch_barrier = dist_strategy.a_sync_configs["launch_barrier"]
if launch_barrier:
# for trainer wait server ready
wait_server_ready(self.role_maker._get_pserver_endpoints())
# for ps-heter mode, wait heter worker ready
if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
):
wait_server_ready(self.role_maker._get_heter_worker_endpoints())
lrs = _has_global_step(_get_lr_ops(self.origin_main_program))
if lrs:
......@@ -312,7 +324,7 @@ class ParameterServerRuntime(RuntimeBase):
opts = _get_optimize_ops(self.origin_main_program)
for op in opts:
if "Param" in op.input_names and \
"LearningRate" in op.input_names and op.input("Param")[0] == param_name:
"LearningRate" in op.input_names and op.input("Param")[0] == param_name:
return op
def _save_dense_params(self, executor, dirname, context, main_program):
......
......@@ -1291,17 +1291,17 @@ def append_backward(loss,
It will be automatically invoked by the optimizer's `minimize` function.
Parameters:
loss( :ref:`api_guide_Variable_en` ): The loss variable of the network.
parameter_list(list[Variable|str], optional): List of Parameters or Parameter.names
loss(Tensor): The loss Tensor of the network.
parameter_list(list[Tensor|str], optional): List of Parameters or Parameter.names
that need to be updated by optimizers.
If it is None, all parameters
will be updated.
Default: None.
no_grad_set(set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All variables with
no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All Tensors with
`stop_gradient=True` from all blocks will
be automatically added into this set.
If this parameter is not None, the Variables or Variable.names in this set will be added to the default set.
If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set.
Default: None.
callbacks(list[callable object], optional): List of callback functions.
The callbacks are used for
......@@ -1312,70 +1312,73 @@ def append_backward(loss,
new gradient operator is added
into the program. The callable
object must have two input
parameters: 'block' and 'context'.
The 'block' is the :ref:`api_guide_Block_en` which
parameters: ``block`` and ``context`` .
The ``block`` is the :ref:`api_guide_Block_en` which
the new gradient operator will
be added to. The 'context' is a
be added to. The ``context`` is a
map, whose keys are gradient
variable names and values are
corresponding original :ref:`api_guide_Variable_en` .
In addition to this, the 'context'
Tensor names and values are
corresponding original :ref:`api_guide_tensor_en` .
In addition to this, the ``context``
has another special key-value pair:
the key is string '__current_op_desc__'
the key is string ``__current_op_desc__``
and the value is the op_desc of the
gradient operator who has just
triggered the callable object.
Default: None.
Returns:
list of tuple ( :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` ): Pairs of parameter and its corresponding gradients.
The key is the parameter and the value is gradient variable.
list of tuple ( :ref:`api_guide_tensor_en` , :ref:`api_guide_tensor_en` ): Pairs of parameter and its corresponding gradients.
The key is the parameter and the value is gradient Tensor.
Raises:
AssertionError: If `loss` is not an instance of Variable.
AssertionError: If ``loss`` is not an instance of Tensor.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle
import paddle.nn.functional as F
x = fluid.data(name='x', shape=[None, 13], dtype='int64')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
x_emb = fluid.embedding(x, size=[100, 256])
y_predict = fluid.layers.fc(input=x_emb, size=1, act=None, name='my_fc')
loss = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_loss = fluid.layers.mean(loss)
paddle.enable_static()
x = paddle.static.data(name='x', shape=[None, 13], dtype='int64')
y = paddle.static.data(name='y', shape=[None, 1], dtype='float32')
x_emb = paddle.static.nn.embedding(x, size=[100, 256])
y_predict = paddle.static.nn.fc(input=x_emb, size=1, act=None, name='my_fc')
loss = F.square_error_cost(input=y_predict, label=y)
avg_loss = paddle.mean(loss)
# Get all weights in main_program, not include bias.
all_weights = [param for param in fluid.default_main_program().block(0).all_parameters() if 'w_' in param.name]
all_weights = [param for param in paddle.static.default_main_program().block(0).all_parameters() if 'w_' in param.name]
all_weights_name = [w.name for w in all_weights]
# return all param_grads needed to be updated if parameter_list set default None.
p_g_list1 = fluid.backward.append_backward(loss=avg_loss)
p_g_list1 = paddle.static.append_backward(loss=avg_loss)
# output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)]
# return the param_grads corresponding to parameter_list that can be list of param (Variable).
p_g_list2 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights)
# return the param_grads corresponding to parameter_list that can be list of param (Tensor).
p_g_list2 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights)
# output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]
# parameter_list can be list of param.name (str).
p_g_list3 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights_name)
p_g_list3 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights_name)
# output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]
# no_grad_set can be set of Variables that means grad will be cut off from these Variables.
p_g_list4 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set([x_emb]))
# no_grad_set can be set of Tensors that means grad will be cut off from these Tensors.
p_g_list4 = paddle.static.append_backward(loss=avg_loss, no_grad_set=set([x_emb]))
# output: [(my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)]
# no_grad_set can be set of Variable.name when the Variable is created inside layers and can't be specified explicitly.
p_g_list5 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set(['my_fc.b_0']))
# no_grad_set can be set of Tensor.name when the Tensor is created inside layers and can't be specified explicitly.
p_g_list5 = paddle.static.append_backward(loss=avg_loss, no_grad_set=set(['my_fc.b_0']))
# output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]
# return [] because all param_grads are filtered by no_grad_set.
p_g_list6 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights))
p_g_list6 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights))
"""
check_type(loss, 'loss', framework.Variable,
'fluid.backward.append_backward')
'paddle.static.append_backward')
if loss.op is None:
# the loss is from a cloned program. Find loss op manually.
......@@ -1387,7 +1390,7 @@ def append_backward(loss,
if callbacks is not None:
check_type(callbacks, 'callbacks', list,
'fluid.backward.append_backward')
'paddle.static.append_backward')
program = loss.block.program
root_block = program.block(0)
......@@ -1727,21 +1730,21 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
Backpropagate the gradients of targets to inputs.
Args:
targets(Variable|list[Variable]): The target variables
inputs(Variable|list[Variable]): The input variables
target_gradients (Variable|list[Variable], optional): The gradient variables
targets(Tensor|list[Tensor]): The target Tensors
inputs(Tensor|list[Tensor]): The input Tensors
target_gradients (Tensor|list[Tensor], optional): The gradient Tensors
of targets, which have the same shape as targets. If None, ones will
be created for them.
no_grad_set(set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All variables with
no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All Tensors with
`stop_gradient=True` from all blocks will
be automatically added into this set.
If this parameter is not None, the Variables or Variable.names in this set will be added to the default set.
If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set.
Default: None.
Return:
(list[Variable]): A list of gradients for inputs
If an input does not affect targets, the corresponding gradient variable
(list[Tensor]): A list of gradients for inputs
If an input does not affect targets, the corresponding gradient Tensor
will be None
"""
targets = _as_list(targets)
......@@ -1865,41 +1868,42 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None):
Backpropagate the gradients of targets to inputs.
Args:
targets (Variable|list[Variable]): The target variables.
inputs (Variable|list[Variable]): The input variables.
target_gradients (Variable|list[Variable], optional): The gradient variables
targets (Tensor|list[Tensor]): The target Tensors.
inputs (Tensor|list[Tensor]): The input Tensors.
target_gradients (Tensor|list[Tensor], optional): The gradient Tensor
of targets, which have the same shape as targets. If None, ones will
be created for them.
no_grad_set (set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All variables with `stop_gradient=True` from all blocks will
be automatically added into this set. If this parameter is not None, the Variables or Variable.names
no_grad_set (set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
should be ignored. All Tensors with ``stop_gradient=True`` from all blocks will
be automatically added into this set. If this parameter is not None, the Tensors or Tensor.names
in this set will be added to the default set. Default: None.
Return:
(list[Variable]): A list of gradients for inputs
If an input does not affect targets, the corresponding gradient variable
(list[Tensor]): A list of gradients for inputs
If an input does not affect targets, the corresponding gradient Tensor
will be None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle
import paddle.nn.functional as F
paddle.enable_static()
x = fluid.data(name='x', shape=[None,2,8,8], dtype='float32')
x = paddle.static.data(name='x', shape=[None, 2, 8, 8], dtype='float32')
x.stop_gradient=False
y = fluid.layers.conv2d(x, 4, 1, bias_attr=False)
y = fluid.layers.relu(y)
y = fluid.layers.conv2d(y, 4, 1, bias_attr=False)
y = fluid.layers.relu(y)
z = fluid.gradients([y], x)
print(z)
y = paddle.static.nn.conv2d(x, 4, 1, bias_attr=False)
y = F.relu(y)
z = paddle.static.gradients([y], x)
print(z) # [var x@GRAD : fluid.VarType.LOD_TENSOR.shape(-1L, 2L, 8L, 8L).astype(VarType.FP32)]
"""
check_type(targets, 'targets', (framework.Variable, list),
'fluid.backward.gradients')
'paddle.static.gradients')
check_type(inputs, 'inputs', (framework.Variable, list),
'fluid.backward.gradients')
'paddle.static.gradients')
check_type(target_gradients, 'target_gradients', (
framework.Variable, list, type(None)), 'fluid.backward.gradients')
framework.Variable, list, type(None)), 'paddle.static.gradients')
outs = calc_gradient(targets, inputs, target_gradients, no_grad_set)
return _as_list(outs)
......@@ -3230,14 +3230,11 @@ class Flatten(layers.Layer):
.. code-block:: python
import paddle
from paddle import to_variable
import numpy as np
paddle.disable_static()
inp_np = np.ones([5, 2, 3, 4]).astype('float32')
paddle.disable_static()
inp_np = to_variable(inp_np)
inp_np = paddle.to_tensor(inp_np)
flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2)
flatten_res = flatten(inp_np)
......
......@@ -5396,13 +5396,13 @@ def program_guard(main_program, startup_program=None):
"""
:api_attr: Static Graph
Change the global main program and startup program with `"with"` statement.
Layer functions in the Python `"with"` block will append operators and
variables to the new main programs.
Change the global main program and startup program with ``with`` statement.
Layer functions in the Python ``with`` block will append operators and
Tensors to the new main programs.
Args:
main_program(Program): New main program inside `"with"` statement.
startup_program(Program, optional): New startup program inside `"with"`
main_program(Program): New main program inside ``with`` statement.
startup_program(Program, optional): New startup program inside ``with``
statement. :code:`None` means not changing startup program,
default_startup_program is still used.
Default: None.
......@@ -5410,13 +5410,14 @@ def program_guard(main_program, startup_program=None):
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle
main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
data = fluid.data(name='image', shape=[None, 784, 784], dtype='float32')
hidden = fluid.layers.fc(input=data, size=10, act='relu')
paddle.enable_static()
main_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(main_program, startup_program):
data = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
hidden = paddle.static.nn.fc(input=data, size=10, act='relu')
Notes: The temporary :code:`Program` can be used if the user does not need
to construct either the startup program or the main program.
......@@ -5424,20 +5425,22 @@ def program_guard(main_program, startup_program=None):
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle
main_program = fluid.Program()
# does not care about startup program. Just pass a temporary value.
with fluid.program_guard(main_program, fluid.Program()):
data = fluid.data(name='image', shape=[None, 784, 784], dtype='float32')
paddle.enable_static()
main_program = paddle.static.Program()
# does not care about startup program. Just pass a temporary value.
with paddle.static.program_guard(main_program, paddle.static.Program()):
data = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
"""
from .data_feeder import check_type
check_type(main_program, 'main_program', Program, 'fluid.program_guard')
check_type(main_program, 'main_program', Program,
'paddle.static.program_guard')
main_program = switch_main_program(main_program)
if startup_program is not None:
check_type(startup_program, 'startup_program', Program,
'fluid.program_guard')
'paddle.static.program_guard')
startup_program = switch_startup_program(startup_program)
try:
yield
......
......@@ -2488,9 +2488,6 @@ def _error_message(what, arg_name, op_name, right_value, error_value):
def case(pred_fn_pairs, default=None, name=None):
'''
:api_attr: Static Graph
:alias_main: paddle.nn.case
:alias: paddle.nn.case,paddle.nn.control_flow.case
:old_api: paddle.fluid.layers.case
This operator works like an if-elif-elif-else chain.
......@@ -2500,7 +2497,7 @@ def case(pred_fn_pairs, default=None, name=None):
name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns:
Variable|list(Variable): Tensors returned by the callable from the first pair whose pred is True,
Tensor|list(Tensor): Tensors returned by the callable from the first pair whose pred is True,
or Tensors returned by ``default`` if no pred in ``pred_fn_pairs`` is True and ``default`` is not None,
or Tensors returned by the last callable in ``pred_fn_pairs`` if no pred in ``pred_fn_pairs`` is True and ``default`` is None.
......@@ -2508,45 +2505,47 @@ def case(pred_fn_pairs, default=None, name=None):
TypeError: If the type of ``pred_fn_pairs`` is not list or tuple.
TypeError: If the type of elements in ``pred_fn_pairs`` is not tuple.
TypeError: If the size of tuples in ``pred_fn_pairs`` is not 2.
TypeError: If the first element of 2-tuple in ``pred_fn_pairs`` is not Variable.
TypeError: If the first element of 2-tuple in ``pred_fn_pairs`` is not a Tensor.
TypeError: If the second element of 2-tuple in ``pred_fn_pairs`` is not callable.
TypeError: If ``default`` is not None but it is not callable.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle
paddle.enable_static()
def fn_1():
return layers.fill_constant(shape=[1, 2], dtype='float32', value=1)
return paddle.fill_constant(shape=[1, 2], dtype='float32', value=1)
def fn_2():
return layers.fill_constant(shape=[2, 2], dtype='int32', value=2)
return paddle.fill_constant(shape=[2, 2], dtype='int32', value=2)
def fn_3():
return layers.fill_constant(shape=[3], dtype='int32', value=3)
return paddle.fill_constant(shape=[3], dtype='int32', value=3)
main_program = fluid.default_startup_program()
startup_program = fluid.default_main_program()
with fluid.program_guard(main_program, startup_program):
x = layers.fill_constant(shape=[1], dtype='float32', value=0.3)
y = layers.fill_constant(shape=[1], dtype='float32', value=0.1)
z = layers.fill_constant(shape=[1], dtype='float32', value=0.2)
main_program = paddle.static.default_startup_program()
startup_program = paddle.static.default_main_program()
with paddle.static.program_guard(main_program, startup_program):
x = paddle.fill_constant(shape=[1], dtype='float32', value=0.3)
y = paddle.fill_constant(shape=[1], dtype='float32', value=0.1)
z = paddle.fill_constant(shape=[1], dtype='float32', value=0.2)
pred_1 = layers.less_than(z, x) # true: 0.2 < 0.3
pred_2 = layers.less_than(x, y) # false: 0.3 < 0.1
pred_3 = layers.equal(x, y) # false: 0.3 == 0.1
pred_1 = paddle.less_than(z, x) # true: 0.2 < 0.3
pred_2 = paddle.less_than(x, y) # false: 0.3 < 0.1
pred_3 = paddle.equal(x, y) # false: 0.3 == 0.1
# Call fn_1 because pred_1 is True
out_1 = layers.case(
out_1 = paddle.static.nn.case(
pred_fn_pairs=[(pred_1, fn_1), (pred_2, fn_2)], default=fn_3)
# Argument default is None and no pred in pred_fn_pairs is True. fn_3 will be called.
# because fn_3 is the last callable in pred_fn_pairs.
out_2 = layers.case(pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)])
out_2 = paddle.static.nn.case(pred_fn_pairs=[(pred_2, fn_2), (pred_3, fn_3)])
exe = fluid.Executor(fluid.CPUPlace())
exe = paddle.static.Executor(paddle.CPUPlace())
res_1, res_2 = exe.run(main_program, fetch_list=[out_1, out_2])
print(res_1) # [[1. 1.]]
print(res_2) # [3 3 3]
......@@ -3610,18 +3609,18 @@ def switch_case(branch_index, branch_fns, default=None, name=None):
This operator is like a C++ switch/case statement.
Args:
branch_index(Variable): A Tensor with shape [1] to specify which branch to execute. The data type is ``int32``, ``int64`` or ``uint8``.
branch_index(Tensor): A Tensor with shape [1] to specify which branch to execute. The data type is ``int32``, ``int64`` or ``uint8``.
branch_fns(dict|list|tuple): If it's a list or tuple, the elements in it could be pairs of (int, callable) or simple callables whose actual index will be used as the index of callable. If it's a dict, its key is a python integer and the value is a callable. All callables return the same structure of Tensors.
default(callable, optional): Callable that returns a structure of Tensors.
name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns:
Variable|list(Variable): Tensors returned by the callable specified by ``branch_index`` in ``branch_fns``,
Tensor|list(Tensor): Tensors returned by the callable specified by ``branch_index`` in ``branch_fns``,
or Tensors returned by ``default`` if ``default`` is not None and no index matches in ``branch_fns``,
or Tensors returned by the callable with the max index in ``branch_fns`` if ``default`` is None and no index matches in ``branch_fns``.
Raises:
TypeError: If the type of ``branch_index`` is not Variable.
TypeError: If the type of ``branch_index`` is not Tensor.
TypeError: If the data type of ``branch_index`` is not ``int32``, ``int64`` or ``uint8``.
TypeError: If the type of ``branch_fns`` is not dict, list or tuple.
TypeError: If the elements of ``branch_fns`` is not 2-tuple.
......@@ -3633,40 +3632,41 @@ def switch_case(branch_index, branch_fns, default=None, name=None):
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle
paddle.enable_static()
def fn_1():
return layers.fill_constant(shape=[1, 2], dtype='float32', value=1)
return paddle.fill_constant(shape=[1, 2], dtype='float32', value=1)
def fn_2():
return layers.fill_constant(shape=[2, 2], dtype='int32', value=2)
return paddle.fill_constant(shape=[2, 2], dtype='int32', value=2)
def fn_3():
return layers.fill_constant(shape=[3], dtype='int32', value=3)
return paddle.fill_constant(shape=[3], dtype='int32', value=3)
main_program = fluid.default_startup_program()
startup_program = fluid.default_main_program()
with fluid.program_guard(main_program, startup_program):
index_1 = layers.fill_constant(shape=[1], dtype='int32', value=1)
index_2 = layers.fill_constant(shape=[1], dtype='int32', value=2)
main_program = paddle.static.default_main_program()
startup_program = paddle.static.default_startup_program()
with paddle.static.program_guard(main_program, startup_program):
index_1 = paddle.fill_constant(shape=[1], dtype='int32', value=1)
index_2 = paddle.fill_constant(shape=[1], dtype='int32', value=2)
out_1 = layers.switch_case(
out_1 = paddle.static.nn.switch_case(
branch_index=index_1,
branch_fns={1: fn_1, 2: fn_2},
default=fn_3)
out_2 = layers.switch_case(
out_2 = paddle.static.nn.switch_case(
branch_index=index_2,
branch_fns=[(1, fn_1), (2, fn_2)],
default=fn_3)
# Argument default is None and no index matches, so fn_3 is called because it has the max index, 7.
out_3 = layers.switch_case(
out_3 = paddle.static.nn.switch_case(
branch_index=index_2,
branch_fns=[(0, fn_1), (4, fn_2), (7, fn_3)])
exe = fluid.Executor(fluid.CPUPlace())
exe = paddle.static.Executor(paddle.CPUPlace())
res_1, res_2, res_3 = exe.run(main_program, fetch_list=[out_1, out_2, out_3])
print(res_1) # [[1. 1.]]
print(res_2) # [[2 2] [2 2]]
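# res_3 equals [3 3 3]: index_2 (=2) matches no key in branch_fns, default is None, so fn_3 (max index 7) runs.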
......
......@@ -541,7 +541,7 @@ def warpctc(input,
(not including the blank label). When it is a 3-D Tensor, its shape
is `[max_logit_length, batch_size, num_classes + 1]`,
where `max_logit_length` is the longest length of
input logit sequence. The data type must be float32.
input logit sequence. The data type should be float32 or float64.
label (Variable): The ground truth of variable-length sequence,
which must be a 2-D Tensor with LoD information or a 3-D Tensor without
LoD information, needs to be consistent with the corresponding input.
......@@ -571,6 +571,7 @@ def warpctc(input,
.. code-block:: python
# using LoDTensor
import paddle
import paddle.fluid as fluid
import numpy as np
......@@ -581,6 +582,7 @@ def warpctc(input,
# class num
class_num = 5
paddle.enable_static()
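# paddle 2.x starts in dynamic graph mode by default, so switch to static mode before building the graph with fluid.data.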
logits = fluid.data(name='logits',shape=[None, class_num+1],
dtype='float32',lod_level=1)
label = fluid.data(name='label', shape=[None, 1],
......@@ -602,6 +604,7 @@ def warpctc(input,
.. code-block:: python
# using Tensor
import paddle
import paddle.fluid as fluid
import numpy as np
......@@ -613,6 +616,7 @@ def warpctc(input,
batch_size = 16
# class num
class_num = 5
paddle.enable_static()
logits = fluid.data(name='logits',
shape=[max_seq_length, batch_size, class_num+1],
dtype='float32')
......@@ -637,8 +641,23 @@ def warpctc(input,
fetch_list=[cost.name])
print(output)
"""
if in_dygraph_mode():
if input_length is None or label_length is None:
raise ValueError(
"input_length and label_length must not be None in dygraph mode!"
)
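# In dygraph mode the C++ op is called directly: attributes are passed inline as
# alternating name/value pairs, and the op returns (gradient, loss).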
grad, loss_out = core.ops.warpctc(
input,
label,
input_length,
label_length,
'blank',
blank,
'norm_by_times',
norm_by_times, )
return loss_out
helper = LayerHelper('warpctc', **locals())
check_variable_and_dtype(input, 'input', ['float32'], "warpctc")
check_variable_and_dtype(input, 'input', ['float32', 'float64'], "warpctc")
check_variable_and_dtype(label, 'label', ['int32'], "warpctc")
this_inputs = {'Logits': [input], 'Label': [label]}
if input_length is not None and label_length is not None:
......
......@@ -394,7 +394,8 @@ foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP)
py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4)
py_test_modules(test_warpctc_op MODULES test_warpctc_op)
# disable test_warpctc_op
# py_test_modules(test_warpctc_op MODULES test_warpctc_op)
py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${GC_ENVS})
py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS})
py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS
......@@ -531,15 +532,15 @@ if(NOT WIN32)
endif()
if(NOT APPLE AND NOT WIN32)
bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_hdfs2 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 140)
bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_hdfs2 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY")
endif()
add_subdirectory(sequence)
......
......@@ -153,7 +153,7 @@ def gen_fake_line(dnn_data_num=7,
return line
def prepare_fake_data(file_nums=9, file_lines=1000):
def prepare_fake_data(file_nums=6, file_lines=1000):
"""
Create fake data with same type as avazu_ctr_data
"""
......
......@@ -206,13 +206,6 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase):
debug=int(os.getenv("Debug", "0")))
pass_time = time.time() - pass_start
print("do_dataset_training done. using time {}".format(pass_time))
if os.getenv("SAVE_MODEL") == "1":
model_dir = tempfile.mkdtemp()
fleet.save_inference_model(exe, model_dir,
[feed.name for feed in self.feeds],
self.avg_cost)
self.check_model_right(model_dir)
shutil.rmtree(model_dir)
fleet.stop_worker()
print("do_dataset_training stop worker.")
......
......@@ -217,6 +217,9 @@ class OpTest(unittest.TestCase):
return False
return True
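# XPU op tests run in fp32 (their init_dtype sets np.float32), so they are exempt
# from the fp64 gradient-precision check below, just like MKL-DNN tests.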
def is_xpu_op_test():
return hasattr(cls, "use_xpu") and cls.use_xpu == True
def is_mkldnn_op_test():
return hasattr(cls, "use_mkldnn") and cls.use_mkldnn == True
......@@ -239,6 +242,7 @@ class OpTest(unittest.TestCase):
if cls.dtype in [np.float32, np.float64] \
and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \
and not hasattr(cls, 'exist_fp64_check_grad') \
and not is_xpu_op_test() \
and not is_mkldnn_op_test():
raise AssertionError(
"This test of %s op needs check_grad with fp64 precision." %
......@@ -336,6 +340,11 @@ class OpTest(unittest.TestCase):
self.attrs["use_mkldnn"] == True):
self.__class__.use_mkldnn = True
if (hasattr(self, "use_xpu") and self.use_xpu == True) or \
(hasattr(self, "attrs") and "use_xpu" in self.attrs and \
self.attrs["use_xpu"] == True):
self.__class__.use_xpu = True
op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
"infer datatype from inputs and outputs for this test case"
self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
......@@ -932,6 +941,8 @@ class OpTest(unittest.TestCase):
need_run_ops = self._get_need_run_ops(op_desc)
res = {}
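# Skip the inplace op checks entirely when this case runs with the XPU kernel.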
if hasattr(self, 'attrs') and bool(self.attrs.get('use_xpu', False)):
return
for op_desc, father_op_desc in reversed(need_run_ops):
# The first one is the forward op
has_infer_inplace = fluid.core.has_infer_inplace(op_desc.type())
......@@ -1203,6 +1214,11 @@ class OpTest(unittest.TestCase):
self.attrs["use_mkldnn"] == True):
self.__class__.use_mkldnn = True
if (hasattr(self, "use_xpu") and self.use_xpu == True) or \
(hasattr(self, "attrs") and "use_xpu" in self.attrs and \
self.attrs["use_xpu"] == True):
self.__class__.use_xpu = True
places = self._get_places()
for place in places:
res = self.check_output_with_place(place, atol, no_check_set,
......
......@@ -78,15 +78,17 @@ class TestLeakyReluDoubleGradCheck(unittest.TestCase):
class TestELUDoubleGradCheck(unittest.TestCase):
@prog_scope()
def func(self, place):
shape = [2, 3, 7, 9]
shape = [2, 3, 6, 6]
eps = 1e-6
alpha = 1.1
dtype = np.float64
SEED = 0
x = layers.data('x', shape, False, dtype)
x.persistable = True
y = layers.elu(x, alpha=alpha)
np.random.seed(SEED)  # seed NumPy's global RNG so x_arr is deterministic
x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
gradient_checker.double_grad_check(
[x], y, x_init=x_arr, place=place, eps=eps)
......@@ -171,5 +173,29 @@ class TestAbsDoubleGradCheck(unittest.TestCase):
self.func(p)
class TestLogDoubleGradCheck(unittest.TestCase):
@prog_scope()
def func(self, place):
shape = [2, 3, 7, 9]
eps = 1e-6
dtype = np.float64
x = layers.data('x', shape, False, dtype)
x.persistable = True
y = layers.log(x)
x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)
gradient_checker.double_grad_check(
[x], y, x_init=x_arr, place=place, eps=eps)
def test_grad(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
self.func(p)
if __name__ == "__main__":
unittest.main()
......@@ -238,7 +238,7 @@ class TestTanhAPI(unittest.TestCase):
def test_dygraph_api(self):
paddle.disable_static(self.place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
out1 = F.tanh(x)
out2 = paddle.tanh(x)
th = paddle.nn.Tanh()
......@@ -596,7 +596,7 @@ class TestHardShrinkAPI(unittest.TestCase):
def test_dygraph_api(self):
paddle.disable_static(self.place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
out1 = F.hardshrink(x)
hd = paddle.nn.Hardshrink()
out2 = hd(x)
......@@ -666,7 +666,7 @@ class TestHardtanhAPI(unittest.TestCase):
def test_dygraph_api(self):
paddle.disable_static(self.place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
out1 = F.hardtanh(x)
m = paddle.nn.Hardtanh()
out2 = m(x)
......@@ -1112,7 +1112,7 @@ class TestLeakyReluAPI(unittest.TestCase):
def test_dygraph_api(self):
paddle.disable_static(self.place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
out1 = F.leaky_relu(x)
m = paddle.nn.LeakyReLU()
out2 = m(x)
......
......@@ -25,7 +25,7 @@ class TestAdamaxAPI(unittest.TestCase):
def test_adamax_api_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_variable(value)
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
adam = paddle.optimizer.Adamax(
learning_rate=0.01,
......
......@@ -22,7 +22,7 @@ class TestAdamWOp(unittest.TestCase):
def test_adamw_op_dygraph(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_variable(value)
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
adam = paddle.optimizer.AdamW(
learning_rate=0.01,
......@@ -37,7 +37,7 @@ class TestAdamWOp(unittest.TestCase):
def test_adamw_op_coverage(self):
paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_variable(value)
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)
adam = paddle.optimizer.AdamW(
learning_rate=0.0,
......
......@@ -147,7 +147,7 @@ class TestAdaptiveAvgPool2dAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
out_1 = paddle.nn.functional.adaptive_avg_pool2d(
x=x, output_size=[3, 3])
......@@ -245,7 +245,7 @@ class TestAdaptiveAvgPool2dClassAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[3, 3])
out_1 = adaptive_avg_pool(x=x)
......
......@@ -162,7 +162,7 @@ class TestAdaptiveAvgPool3dAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
out_1 = paddle.nn.functional.adaptive_avg_pool3d(
x=x, output_size=[3, 3, 3])
......@@ -262,7 +262,7 @@ class TestAdaptiveAvgPool3dClassAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
output_size=[3, 3, 3])
......
......@@ -147,7 +147,7 @@ class TestAdaptiveMaxPool2dAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
out_1 = paddle.nn.functional.adaptive_max_pool2d(
x=x, return_indices=False, output_size=[3, 3])
......@@ -240,7 +240,7 @@ class TestAdaptiveMaxPool2dClassAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3])
out_1 = adaptive_max_pool(x=x)
......
......@@ -162,7 +162,7 @@ class TestAdaptiveMaxPool3dAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
out_1 = paddle.nn.functional.adaptive_max_pool3d(
x=x, output_size=[3, 3, 3])
......@@ -257,7 +257,7 @@ class TestAdaptiveMaxPool3dClassAPI(unittest.TestCase):
if core.is_compiled_with_cuda() else [False]):
place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
paddle.disable_static(place=place)
x = paddle.to_variable(self.x_np)
x = paddle.to_tensor(self.x_np)
adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
output_size=[3, 3, 3])
......
......@@ -244,9 +244,9 @@ class TestAddMMAPI(unittest.TestCase):
def test_error1():
data_x_wrong = np.ones((2, 3)).astype(np.float32)
x = paddle.to_variable(data_x_wrong)
y = paddle.to_variable(data_y)
input = paddle.to_variable(data_input)
x = paddle.to_tensor(data_x_wrong)
y = paddle.to_tensor(data_y)
input = paddle.to_tensor(data_input)
out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 )
self.assertRaises(ValueError, test_error1)
'''
......
......@@ -98,9 +98,9 @@ class TestArangeImperative(unittest.TestCase):
x2 = paddle.tensor.arange(5)
x3 = paddle.tensor.creation.arange(5)
start = paddle.to_variable(np.array([0], 'float32'))
end = paddle.to_variable(np.array([5], 'float32'))
step = paddle.to_variable(np.array([1], 'float32'))
start = paddle.to_tensor(np.array([0], 'float32'))
end = paddle.to_tensor(np.array([5], 'float32'))
step = paddle.to_tensor(np.array([1], 'float32'))
x4 = paddle.arange(start, end, step, 'int64')
paddle.enable_static()
......
......@@ -96,7 +96,7 @@ class TestDygraph(unittest.TestCase):
a = np.random.rand(3, 3)
a_t = np.transpose(a, [1, 0])
x_data = np.matmul(a, a_t) + 1e-03
x = paddle.to_variable(x_data)
x = paddle.to_tensor(x_data)
out = paddle.cholesky(x, upper=False)
......
......@@ -168,9 +168,9 @@ class TestClipAPI(unittest.TestCase):
paddle.disable_static(place)
data_shape = [1, 9, 9, 4]
data = np.random.random(data_shape).astype('float32')
images = paddle.to_variable(data, dtype='float32')
v_min = paddle.to_variable(np.array([0.2], dtype=np.float32))
v_max = paddle.to_variable(np.array([0.8], dtype=np.float32))
images = paddle.to_tensor(data, dtype='float32')
v_min = paddle.to_tensor(np.array([0.2], dtype=np.float32))
v_max = paddle.to_tensor(np.array([0.8], dtype=np.float32))
out_1 = paddle.clip(images, min=0.2, max=0.8)
out_2 = paddle.clip(images, min=0.2, max=0.9)
......
......@@ -113,6 +113,7 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
strategy.a_sync_configs = {"k_steps": 100}
strategy.a_sync_configs = {"launch_barrier": False}
if training_role == "TRAINER":
self.run_trainer(role, strategy)
......
......@@ -51,6 +51,7 @@ class TestCommunicator(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = False
strategy.a_sync_configs = {"launch_barrier": False}
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(avg_cost)
......
......@@ -285,9 +285,9 @@ class TestConcatAPI(unittest.TestCase):
in2 = np.array([[11, 12, 13], [14, 15, 16]])
in3 = np.array([[21, 22], [23, 24]])
paddle.disable_static()
x1 = paddle.to_variable(in1)
x2 = paddle.to_variable(in2)
x3 = paddle.to_variable(in3)
x1 = paddle.to_tensor(in1)
x2 = paddle.to_tensor(in2)
x3 = paddle.to_tensor(in3)
out1 = fluid.layers.concat(input=[x1, x2, x3], axis=-1)
out2 = paddle.concat(x=[x1, x2], axis=0)
np_out1 = np.concatenate([in1, in2, in3], axis=-1)
......
......@@ -75,8 +75,8 @@ class TestCosineSimilarityAPI(unittest.TestCase):
np_x2 = np.random.rand(*shape).astype(np.float32)
np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
tesnor_x1 = paddle.to_variable(np_x1)
tesnor_x2 = paddle.to_variable(np_x2)
tesnor_x1 = paddle.to_tensor(np_x1)
tesnor_x2 = paddle.to_tensor(np_x2)
y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
self.assertTrue(np.allclose(y.numpy(), np_out))
......@@ -92,8 +92,8 @@ class TestCosineSimilarityAPI(unittest.TestCase):
np_x2 = np.random.rand(*shape).astype(np.float32)
np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
tesnor_x1 = paddle.to_variable(np_x1)
tesnor_x2 = paddle.to_variable(np_x2)
tesnor_x1 = paddle.to_tensor(np_x1)
tesnor_x2 = paddle.to_tensor(np_x2)
y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
self.assertTrue(np.allclose(y.numpy(), np_out))
......@@ -110,8 +110,8 @@ class TestCosineSimilarityAPI(unittest.TestCase):
np_x2 = np.random.rand(*shape2).astype(np.float32)
np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
tesnor_x1 = paddle.to_variable(np_x1)
tesnor_x2 = paddle.to_variable(np_x2)
tesnor_x1 = paddle.to_tensor(np_x1)
tesnor_x2 = paddle.to_tensor(np_x2)
y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps)
self.assertTrue(np.allclose(y.numpy(), np_out))
......@@ -129,8 +129,8 @@ class TestCosineSimilarityAPI(unittest.TestCase):
np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps)
cos_sim_func = nn.CosineSimilarity(axis=axis, eps=eps)
tesnor_x1 = paddle.to_variable(np_x1)
tesnor_x2 = paddle.to_variable(np_x2)
tesnor_x1 = paddle.to_tensor(np_x1)
tesnor_x2 = paddle.to_tensor(np_x2)
y = cos_sim_func(tesnor_x1, tesnor_x2)
self.assertTrue(np.allclose(y.numpy(), np_out))
......
......@@ -21,13 +21,12 @@ import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
from paddle import to_variable
class TestCumsumOp(unittest.TestCase):
def run_cases(self):
data_np = np.arange(12).reshape(3, 4)
data = to_variable(data_np)
data = paddle.to_tensor(data_np)
y = paddle.cumsum(data)
z = np.cumsum(data_np)
......
......@@ -20,7 +20,6 @@ import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear
import paddle.fluid.core as core
from paddle import to_variable
class TestDefaultType(unittest.TestCase):
......
......@@ -36,7 +36,7 @@ class TestDirectory(unittest.TestCase):
def test_new_directory(self):
new_directory = [
'paddle.enable_static', 'paddle.disable_static',
'paddle.in_dynamic_mode', 'paddle.to_variable', 'paddle.grad',
'paddle.in_dynamic_mode', 'paddle.to_tensor', 'paddle.grad',
'paddle.no_grad', 'paddle.save', 'paddle.load',
'paddle.static.save', 'paddle.static.load',
'paddle.distributed.ParallelEnv',
......
......@@ -52,6 +52,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
strategy.a_sync_configs = {"launch_barrier": False}
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......@@ -92,6 +93,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
strategy.a_sync_configs = {"launch_barrier": False}
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -60,8 +60,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
self.assertTrue(optimizer.user_defined_strategy.a_sync)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
self.assertTrue(fleet._final_strategy().a_sync)
a_sync_configs = fleet._final_strategy().a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 0)
......
......@@ -72,8 +72,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
self.assertTrue(optimizer.user_defined_strategy.a_sync)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
self.assertTrue(fleet._final_strategy().a_sync)
a_sync_configs = fleet._final_strategy().a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 0)
......
......@@ -60,8 +60,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
self.assertTrue(optimizer.user_defined_strategy.a_sync)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
self.assertTrue(fleet._final_strategy().a_sync)
a_sync_configs = fleet._final_strategy().a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 800)
......
......@@ -44,6 +44,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = False
strategy.a_sync_configs = {"launch_barrier": False}
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
......
......@@ -312,9 +312,6 @@ class TestFleetBase(unittest.TestCase):
"========================Error tr1_err end==========================="
)
self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
# close trainer file
tr0_pipe.close()
tr1_pipe.close()
......@@ -325,6 +322,8 @@ class TestFleetBase(unittest.TestCase):
ps1.terminate()
shutil.rmtree(gloo_path)
self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
return 0, 0
def check_with_place(self,
......
......@@ -81,7 +81,7 @@ class FleetDistHeterRunnerBase(object):
def build_strategy(self, args):
self.strategy = paddle.distributed.fleet.DistributedStrategy()
self.strategy.a_sync = True
self.strategy.a_sync_configs = {"launch_barrier": True}
return self.strategy
def build_optimizer(self, avg_cost, strategy):
......@@ -237,7 +237,10 @@ class TestFleetHeterBase(unittest.TestCase):
return heter0_proc, heter1_proc, heter0_pipe, heter1_pipe
def _run_cluster(self, model, envs):
env = {'GRAD_CLIP': str(self._grad_clip_mode)}
env = {
'GRAD_CLIP': str(self._grad_clip_mode),
'FLAGS_eager_delete_tensor_gb': str(-1)
}
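# A negative FLAGS_eager_delete_tensor_gb disables eager garbage collection of tensors in the spawned trainers.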
python_path = self._python_interp
gloo_path = tempfile.mkdtemp()
......@@ -286,27 +289,6 @@ class TestFleetHeterBase(unittest.TestCase):
tr0_ret = tr0.returncode
tr1_ret = tr0.returncode
print("tr get returncode: {}".format(tr0_ret))
if tr0_ret != 0:
print(
"========================Error tr0_err begin==========================="
)
os.system("cat {}".format(tempfile.gettempdir() + "/tr0_err.log"))
print(
"========================Error tr0_err end==========================="
)
if tr1_ret != 0:
print(
"========================Error tr1_err begin==========================="
)
os.system("cat {}".format(tempfile.gettempdir() + "/tr1_err.log"))
print(
"========================Error tr1_err end==========================="
)
self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
# close trainer file
tr0_pipe.close()
......@@ -320,7 +302,8 @@ class TestFleetHeterBase(unittest.TestCase):
ps1.terminate()
heter0.terminate()
heter1.terminate()
self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check")
self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check")
shutil.rmtree(gloo_path)
return 0, 0
......
......@@ -23,38 +23,6 @@ import paddle
paddle.enable_static()
class TestDistHeterDatasetAsync2x2(TestFleetHeterBase):
def _setup_config(self):
self._mode = "async"
self._reader = "dataset"
def check_with_place(self,
model_file,
delta=1e-3,
check_error_log=False,
need_envs={}):
required_envs = {
"PATH": os.getenv("PATH", ""),
"PYTHONPATH": os.getenv("PYTHONPATH", ""),
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"FLAGS_rpc_deadline": "5000", # 5sec to fail fast
"http_proxy": "",
"CPU_NUM": "3"
}
required_envs.update(need_envs)
if check_error_log:
required_envs["GLOG_v"] = "3"
required_envs["GLOG_logtostderr"] = "1"
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
def test_dist_train(self):
self.check_with_place(
"dist_fleet_heter_ctr.py", delta=1e-5, check_error_log=True)
class TestDistHeterPyreaderAsync2x2(TestFleetHeterBase):
def _setup_config(self):
self._mode = "async"
......
......@@ -195,7 +195,7 @@ class TestFlattenPython(unittest.TestCase):
def test_Negative():
paddle.disable_static()
img = paddle.to_variable(x)
img = paddle.to_tensor(x)
out = paddle.flatten(img, start_axis=-2, stop_axis=-1)
return out.numpy().shape
......
......@@ -18,6 +18,8 @@ import unittest
import paddle
import os
paddle.enable_static()
class TestFleetAMPOptimizer(unittest.TestCase):
def setUp(self):
......@@ -55,6 +57,8 @@ class TestFleetAMPOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
strategy = fleet._final_strategy()
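# After minimize(), the AMP pass should have inserted cast and check_finite_and_unscale ops into the block.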
ops = [op.type for op in avg_cost.block.ops]
self.assertIn('cast', ops)
self.assertIn('check_finite_and_unscale', ops)
......
......@@ -18,6 +18,8 @@ import os
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
paddle.enable_static()
class TestDistributedStrategyAuto(unittest.TestCase):
def setUp(self):
......
......@@ -167,6 +167,8 @@ class TestFleetDygraph(unittest.TestCase):
state_dict = adam.state_dict()
adam.set_state_dict(state_dict)
final_strategy = fleet._final_strategy()
class TestFleetBaseSingleRunCollective(unittest.TestCase):
def setUp(self):
......
......@@ -19,6 +19,8 @@ import os
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
paddle.enable_static()
class TestFleetLambMetaOptimizer(unittest.TestCase):
def setUp(self):
......
......@@ -19,6 +19,8 @@ import os
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
paddle.enable_static()
class TestFleetLarsMetaOptimizer(unittest.TestCase):
def setUp(self):
......
......@@ -211,7 +211,7 @@ class TestImperative(unittest.TestCase):
paddle.disable_static()
self.assertTrue(paddle.in_dynamic_mode())
np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
var_inp = paddle.to_variable(np_inp)
var_inp = paddle.to_tensor(np_inp)
mlp = MLP(input_size=2)
out = mlp(var_inp)
dy_out1 = out.numpy()
......@@ -221,7 +221,7 @@ class TestImperative(unittest.TestCase):
self.assertFalse(paddle.in_dynamic_mode())
paddle.disable_static()
self.assertTrue(paddle.in_dynamic_mode())
var_inp = paddle.to_variable(np_inp)
var_inp = paddle.to_tensor(np_inp)
mlp = MLP(input_size=2)
out = mlp(var_inp)
dy_out2 = out.numpy()
......
......@@ -54,7 +54,7 @@ class TestSimpleNet(unittest.TestCase):
# grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
input_word = np.array([[1, 2], [2, 1]]).astype('int64')
input = paddle.to_variable(input_word)
input = paddle.to_tensor(input_word)
simplenet = SimpleNet(20, 32, dtype)
adam = SGDOptimizer(
......
......@@ -41,7 +41,7 @@ def run_dygraph(x_np, op_str, use_gpu=True):
if use_gpu and fluid.core.is_compiled_with_cuda():
place = paddle.CUDAPlace(0)
paddle.disable_static(place)
x = paddle.to_variable(x_np)
x = paddle.to_tensor(x_np)
dygraph_result = getattr(paddle.tensor, op_str)(x)
return dygraph_result
......
......@@ -543,9 +543,9 @@ class TestJitSaveMultiCases(unittest.TestCase):
loaded_layer = paddle.jit.load(model_path)
loaded_layer.eval()
# inference & compare
x = paddle.to_variable(np.random.random((1, 784)).astype('float32'))
x = paddle.to_tensor(np.random.random((1, 784)).astype('float32'))
if with_label:
y = paddle.to_variable(np.random.random((1, 1)).astype('int64'))
y = paddle.to_tensor(np.random.random((1, 1)).astype('int64'))
pred, _ = layer(x, y)
pred = pred.numpy()
else:
......@@ -677,7 +677,7 @@ class TestJitSaveMultiCases(unittest.TestCase):
model_path = "test_not_prune_output_spec_name_warning"
configs = paddle.SaveLoadConfig()
out = paddle.to_variable(np.random.random((1, 1)).astype('float'))
out = paddle.to_tensor(np.random.random((1, 1)).astype('float'))
configs.output_spec = [out]
paddle.jit.save(layer, model_path, configs=configs)
......@@ -709,7 +709,7 @@ class TestJitSaveMultiCases(unittest.TestCase):
model_path = "test_prune_to_static_after_train"
configs = paddle.SaveLoadConfig()
out = paddle.to_variable(np.random.random((1, 1)).astype('float'))
out = paddle.to_tensor(np.random.random((1, 1)).astype('float'))
configs.output_spec = [out]
with self.assertRaises(ValueError):
paddle.jit.save(
......@@ -730,7 +730,7 @@ class TestJitSaveLoadEmptyLayer(unittest.TestCase):
def test_save_load_empty_layer(self):
layer = EmptyLayer()
x = paddle.to_variable(np.random.random((10)).astype('float32'))
x = paddle.to_tensor(np.random.random((10)).astype('float32'))
out = layer(x)
paddle.jit.save(layer, self.model_path)
load_layer = paddle.jit.load(self.model_path)
......@@ -746,8 +746,8 @@ class TestJitSaveLoadNoParamLayer(unittest.TestCase):
def test_save_load_no_param_layer(self):
layer = NoParamLayer()
x = paddle.to_variable(np.random.random((5)).astype('float32'))
y = paddle.to_variable(np.random.random((5)).astype('float32'))
x = paddle.to_tensor(np.random.random((5)).astype('float32'))
y = paddle.to_tensor(np.random.random((5)).astype('float32'))
out = layer(x, y)
paddle.jit.save(layer, self.model_path)
load_layer = paddle.jit.load(self.model_path)
......
......@@ -90,7 +90,7 @@ class TestKLDivLossDygraph(unittest.TestCase):
with paddle.fluid.dygraph.guard():
kldiv_criterion = paddle.nn.KLDivLoss(reduction)
pred_loss = kldiv_criterion(
paddle.to_variable(x), paddle.to_variable(target))
paddle.to_tensor(x), paddle.to_tensor(target))
self.assertTrue(np.allclose(pred_loss.numpy(), gt_loss))
def test_kl_loss_batchmean(self):
......
......@@ -26,8 +26,8 @@ class TestFunctionalL1Loss(unittest.TestCase):
self.label_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
def run_imperative(self):
input = paddle.to_variable(self.input_np)
label = paddle.to_variable(self.label_np)
input = paddle.to_tensor(self.input_np)
label = paddle.to_tensor(self.label_np)
dy_result = paddle.nn.functional.l1_loss(input, label)
expected = np.mean(np.abs(self.input_np - self.label_np))
self.assertTrue(np.allclose(dy_result.numpy(), expected))
......@@ -106,8 +106,8 @@ class TestClassL1Loss(unittest.TestCase):
self.label_np = np.random.random(size=(10, 10, 5)).astype(np.float32)
def run_imperative(self):
input = paddle.to_variable(self.input_np)
label = paddle.to_variable(self.label_np)
input = paddle.to_tensor(self.input_np)
label = paddle.to_tensor(self.label_np)
l1_loss = paddle.nn.loss.L1Loss()
dy_result = l1_loss(input, label)
expected = np.mean(np.abs(self.input_np - self.label_np))
......
......@@ -96,7 +96,7 @@ class TestNNLogSoftmaxAPI(unittest.TestCase):
# test dygrapg api
paddle.disable_static()
x = paddle.to_variable(self.x)
x = paddle.to_tensor(self.x)
y = logsoftmax(x)
self.assertTrue(np.allclose(y.numpy(), ref_out))
paddle.enable_static()
......@@ -127,7 +127,7 @@ class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase):
self.assertTrue(np.allclose(out[0], ref_out))
paddle.disable_static()
x = paddle.to_variable(self.x)
x = paddle.to_tensor(self.x)
y = F.log_softmax(x, axis, dtype)
self.assertTrue(np.allclose(y.numpy(), ref_out), True)
paddle.enable_static()
......
......@@ -111,7 +111,7 @@ class TestLogsumexpAPI(unittest.TestCase):
self.assertTrue(np.allclose(res[0], out_ref))
paddle.disable_static(self.place)
x = paddle.to_variable(self.x)
x = paddle.to_tensor(self.x)
out = paddle.logsumexp(x, axis, keepdim)
self.assertTrue(np.allclose(out.numpy(), out_ref))
paddle.enable_static()
......@@ -126,7 +126,7 @@ class TestLogsumexpAPI(unittest.TestCase):
def test_alias(self):
paddle.disable_static(self.place)
x = paddle.to_variable(self.x)
x = paddle.to_tensor(self.x)
out1 = paddle.logsumexp(x)
out2 = paddle.tensor.logsumexp(x)
out3 = paddle.tensor.math.logsumexp(x)
......
......@@ -341,10 +341,12 @@ class TestMathOpPatchesVarBase(unittest.TestCase):
np.array_equal(x.rank().numpy(), paddle.rank(x).numpy()))
self.assertTrue(
np.array_equal(x[0].t().numpy(), paddle.t(x[0]).numpy()))
m = paddle.to_tensor(np.random.uniform(1, 2, [3, 3]), 'float32')
m = m.matmul(m.t())
d = paddle.to_tensor([[1.2285208, 1.3491015, 1.4899898],
[1.30058, 1.0688717, 1.4928783],
[1.0958099, 1.3724753, 1.8926544]])
d = d.matmul(d.t())
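# d.matmul(d.t()) gives a symmetric positive-definite matrix, so cholesky() is well defined and the fixed values keep the check deterministic.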
self.assertTrue(
np.array_equal(m.cholesky().numpy(), paddle.cholesky(m).numpy()))
np.array_equal(d.cholesky().numpy(), paddle.cholesky(d).numpy()))
self.assertTrue(
np.array_equal(x.is_empty().numpy(), paddle.is_empty(x).numpy()))
......
......@@ -14,6 +14,7 @@
from __future__ import print_function
import paddle.fluid.core as core
import unittest
import numpy as np
from op_test import OpTest
......
......@@ -80,7 +80,7 @@ class ApiMaxTest(unittest.TestCase):
def test_imperative_api(self):
paddle.disable_static()
np_x = np.array([10, 10]).astype('float64')
x = paddle.to_variable(np_x)
x = paddle.to_tensor(np_x)
z = paddle.max(x, axis=0)
np_z = z.numpy()
z_expected = np.array(np.max(np_x, axis=0))
......
......@@ -61,8 +61,8 @@ class ApiMaximumTest(unittest.TestCase):
def test_dynamic_api(self):
paddle.disable_static()
np_x = np.array([10, 10]).astype('float64')
x = paddle.to_variable(self.input_x)
y = paddle.to_variable(self.input_y)
x = paddle.to_tensor(self.input_x)
y = paddle.to_tensor(self.input_y)
z = paddle.maximum(x, y)
np_z = z.numpy()
z_expected = np.array(np.maximum(self.input_x, self.input_y))
......@@ -73,8 +73,8 @@ class ApiMaximumTest(unittest.TestCase):
np_x = np.random.rand(5, 4, 3, 2).astype("float64")
np_y = np.random.rand(4, 3).astype("float64")
x = paddle.to_variable(self.input_x)
y = paddle.to_variable(self.input_y)
x = paddle.to_tensor(self.input_x)
y = paddle.to_tensor(self.input_y)
result_1 = paddle.maximum(x, y, axis=1)
result_2 = paddle.maximum(x, y, axis=-2)
self.assertEqual((result_1.numpy() == result_2.numpy()).all(), True)
......@@ -204,7 +204,7 @@ class TestMeanAPI(unittest.TestCase):
paddle.disable_static(self.place)
def test_case(x, axis=None, keepdim=False):
x_tensor = paddle.to_variable(x)
x_tensor = paddle.to_tensor(x)
out = paddle.mean(x_tensor, axis, keepdim)
if isinstance(axis, list):
axis = tuple(axis)
......
......@@ -80,7 +80,7 @@ class ApiMinTest(unittest.TestCase):
def test_imperative_api(self):
paddle.disable_static()
np_x = np.array([10, 10]).astype('float64')
x = paddle.to_variable(np_x)
x = paddle.to_tensor(np_x)
z = paddle.min(x, axis=0)
np_z = z.numpy()
z_expected = np.array(np.min(np_x, axis=0))
......
......@@ -18,6 +18,8 @@ import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
import sys
sys.path.append("..")
from op_test import OpTest
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
......@@ -175,57 +177,5 @@ class TestFP16MulOp2(TestMulOp2):
no_grad_set=set('Y'))
@unittest.skipIf(not core.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUMulOp1(TestMulOp):
def init_dtype_type(self):
self.dtype = np.float32
def test_check_output(self):
place = core.XPUPlace(0)
self.check_output_with_place(place, atol=1e-1)
def test_check_grad_normal(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=0.5)
def test_check_grad_ingore_x(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
@unittest.skipIf(not core.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUMulOp2(TestMulOp2):
def init_dtype_type(self):
self.dtype = np.float32
def test_check_output(self):
place = core.XPUPlace(0)
self.check_output_with_place(place, atol=2e-1)
def test_check_grad_normal(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=0.9)
def test_check_grad_ingore_x(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
place = core.XPUPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y'))
if __name__ == "__main__":
unittest.main()
......@@ -20,6 +20,7 @@ import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.fluid.core as core
from paddle.static import program_guard, Program
from op_test import OpTest
......@@ -37,7 +38,7 @@ class TestMVOp(OpTest):
self.check_grad(['X', 'Vec'], 'Out')
def init_config(self):
self.x = np.random.random((5, 100)).astype("float64")
self.x = np.random.random((2, 100)).astype("float64")
self.vec = np.random.random((100)).astype("float64")
......@@ -57,21 +58,36 @@ class TestMVAPI(unittest.TestCase):
paddle.enable_static()
def test_static_graph(self):
paddle.enable_static()
for x_stop_gradient in [False, True]:
for vec_stop_gradient in [False, True]:
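# Exercise every combination of stop_gradient on the matrix and the vector input.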
paddle.enable_static()
train_program = Program()
startup_program = Program()
self.input_x = np.random.rand(5, 100).astype("float64")
self.input_vec = np.random.rand(100).astype("float64")
with program_guard(train_program, startup_program):
data_x = paddle.static.data(
"x", shape=[5, 100], dtype="float64")
data_vec = paddle.static.data(
"vec", shape=[100], dtype="float64")
data_x.stop_gradient = x_stop_gradient
data_vec.stop_gradient = vec_stop_gradient
result_vec = paddle.mv(data_x, data_vec)
self.input_x = np.random.rand(5, 100).astype("float64")
self.input_vec = np.random.rand(100).astype("float64")
data_x = paddle.static.data("x", shape=[5, 100], dtype="float64")
data_vec = paddle.static.data("vec", shape=[100], dtype="float64")
result_vec = paddle.mv(data_x, data_vec)
self.place = paddle.CPUPlace()
exe = paddle.static.Executor(self.place)
res, = exe.run(feed={"x": self.input_x,
"vec": self.input_vec},
fetch_list=[result_vec])
z_expected = np.array(np.dot(self.input_x, self.input_vec))
self.assertTrue(np.allclose(res, z_expected))
self.place = paddle.CPUPlace()
exe = paddle.static.Executor(self.place)
res, = exe.run(
feed={"x": self.input_x,
"vec": self.input_vec},
fetch_list=[result_vec])
z_expected = np.array(np.dot(self.input_x, self.input_vec))
self.assertTrue(np.allclose(res, z_expected))
class TestMVError(unittest.TestCase):
......
......@@ -195,6 +195,23 @@ class TestPool1d_API(unittest.TestCase):
result = max_pool1d_dg(input)
self.assertTrue(np.allclose(result.numpy(), result_np))
def check_max_dygraph_return_index_results(self, place):
with fluid.dygraph.guard(place):
input_np = np.random.random([2, 3, 32]).astype("float32")
input = fluid.dygraph.to_variable(input_np)
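# With return_indices=True, max_pool1d also returns the argmax indices of each pooling window.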
result, index = F.max_pool1d(
input, kernel_size=2, stride=2, padding=0, return_indices=True)
result_np = max_pool1D_forward_naive(
input_np, ksize=[2], strides=[2], paddings=[0])
self.assertTrue(np.allclose(result.numpy(), result_np))
max_pool1d_dg = paddle.nn.layer.MaxPool1d(
kernel_size=2, stride=None, padding=0)
result = max_pool1d_dg(input)
self.assertTrue(np.allclose(result.numpy(), result_np))
def check_max_dygraph_padding_same(self, place):
with fluid.dygraph.guard(place):
input_np = np.random.random([2, 3, 32]).astype("float32")
......@@ -228,6 +245,7 @@ class TestPool1d_API(unittest.TestCase):
self.check_avg_static_results(place)
self.check_max_dygraph_padding_same(place)
self.check_avg_dygraph_padding_same(place)
self.check_max_dygraph_return_index_results(place)
class TestPool2dError_API(unittest.TestCase):
......
......@@ -63,7 +63,7 @@ class TestRandnOpForDygraph(unittest.TestCase):
dim_2 = paddle.fill_constant([1], "int32", 50)
x3 = paddle.randn(shape=[dim_1, dim_2, 784])
var_shape = paddle.to_variable(np.array(shape))
var_shape = paddle.to_tensor(np.array(shape))
x4 = paddle.randn(var_shape)
for out in [x1, x2, x3, x4]:
......
......@@ -105,8 +105,8 @@ class TestRetainGraph(unittest.TestCase):
A = np.random.rand(2, 3, 32, 32).astype('float32')
B = np.random.rand(2, 3, 32, 32).astype('float32')
realA = paddle.to_variable(A)
realB = paddle.to_variable(B)
realA = paddle.to_tensor(A)
realB = paddle.to_tensor(B)
fakeB = g(realA)
optim_d.clear_gradients()
......
......@@ -487,24 +487,24 @@ class TestTransformer(unittest.TestCase):
dropout=dropout,
weight_attr=[None],
bias_attr=[False])
src = paddle.to_variable(
src = paddle.to_tensor(
np.random.rand(batch_size, source_length, d_model).astype(
"float32"))
tgt = paddle.to_variable(
tgt = paddle.to_tensor(
np.random.rand(batch_size, target_length, d_model).astype(
"float32"))
src_mask = np.zeros((batch_size, n_head, source_length,
source_length)).astype("float32")
src_mask[0][0][0][0] = -np.inf
src_mask = paddle.to_variable(src_mask)
src_mask = paddle.to_tensor(src_mask)
tgt_mask = np.zeros((batch_size, n_head, target_length,
target_length)).astype("float32")
tgt_mask[0][0][0][0] = -1e9
memory_mask = np.zeros((batch_size, n_head, target_length,
source_length)).astype("float32")
memory_mask[0][0][0][0] = -1e9
tgt_mask, memory_mask = paddle.to_variable(
tgt_mask), paddle.to_variable(memory_mask)
tgt_mask, memory_mask = paddle.to_tensor(
tgt_mask), paddle.to_tensor(memory_mask)
trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask)
......@@ -521,24 +521,24 @@ class TestTransformer(unittest.TestCase):
dropout=dropout,
weight_attr=[None, None],
bias_attr=[False, False])
src = paddle.to_variable(
src = paddle.to_tensor(
np.random.rand(batch_size, source_length, d_model).astype(
"float32"))
tgt = paddle.to_variable(
tgt = paddle.to_tensor(
np.random.rand(batch_size, target_length, d_model).astype(
"float32"))
src_mask = np.zeros((batch_size, n_head, source_length,
source_length)).astype("float32")
src_mask[0][0][0][0] = -np.inf
src_mask = paddle.to_variable(src_mask)
src_mask = paddle.to_tensor(src_mask)
tgt_mask = np.zeros((batch_size, n_head, target_length,
target_length)).astype("float32")
tgt_mask[0][0][0][0] = -1e9
memory_mask = np.zeros((batch_size, n_head, target_length,
source_length)).astype("float32")
memory_mask[0][0][0][0] = -1e9
tgt_mask, memory_mask = paddle.to_variable(
tgt_mask), paddle.to_variable(memory_mask)
tgt_mask, memory_mask = paddle.to_tensor(
tgt_mask), paddle.to_tensor(memory_mask)
trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask)
......@@ -555,24 +555,24 @@ class TestTransformer(unittest.TestCase):
dropout=dropout,
weight_attr=[None, None, None],
bias_attr=[False, False, True])
src = paddle.to_variable(
src = paddle.to_tensor(
np.random.rand(batch_size, source_length, d_model).astype(
"float32"))
tgt = paddle.to_variable(
tgt = paddle.to_tensor(
np.random.rand(batch_size, target_length, d_model).astype(
"float32"))
src_mask = np.zeros((batch_size, n_head, source_length,
source_length)).astype("float32")
src_mask[0][0][0][0] = -np.inf
src_mask = paddle.to_variable(src_mask)
src_mask = paddle.to_tensor(src_mask)
tgt_mask = np.zeros((batch_size, n_head, target_length,
target_length)).astype("float32")
tgt_mask[0][0][0][0] = -1e9
memory_mask = np.zeros((batch_size, n_head, target_length,
source_length)).astype("float32")
memory_mask[0][0][0][0] = -1e9
tgt_mask, memory_mask = paddle.to_variable(
tgt_mask), paddle.to_variable(memory_mask)
tgt_mask, memory_mask = paddle.to_tensor(
tgt_mask), paddle.to_tensor(memory_mask)
trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask)
......@@ -588,24 +588,24 @@ class TestTransformer(unittest.TestCase):
dim_feedforward=dim_feedforward,
dropout=dropout,
bias_attr=False)
src = paddle.to_variable(
src = paddle.to_tensor(
np.random.rand(batch_size, source_length, d_model).astype(
"float32"))
tgt = paddle.to_variable(
tgt = paddle.to_tensor(
np.random.rand(batch_size, target_length, d_model).astype(
"float32"))
src_mask = np.zeros((batch_size, n_head, source_length,
source_length)).astype("float32")
src_mask[0][0][0][0] = -np.inf
src_mask = paddle.to_variable(src_mask)
src_mask = paddle.to_tensor(src_mask)
tgt_mask = np.zeros((batch_size, n_head, target_length,
target_length)).astype("float32")
tgt_mask[0][0][0][0] = -1e9
memory_mask = np.zeros((batch_size, n_head, target_length,
source_length)).astype("float32")
memory_mask[0][0][0][0] = -1e9
tgt_mask, memory_mask = paddle.to_variable(
tgt_mask), paddle.to_variable(memory_mask)
tgt_mask, memory_mask = paddle.to_tensor(
tgt_mask), paddle.to_tensor(memory_mask)
trans_output = transformer(src, tgt, src_mask, tgt_mask,
memory_mask)
......
......@@ -24,7 +24,7 @@ from paddle.fluid import Program, program_guard
import paddle
import paddle.nn.functional as F
CUDA_BLOCK_SIZE = 512
CUDA_BLOCK_SIZE = 32
class CTCForward(object):
......@@ -41,8 +41,8 @@ class CTCForward(object):
self.num_classes = num_classes
self.batch_size = batch_size
self.loss = np.zeros([self.batch_size, 1], dtype="float32")
self.gradient = np.zeros(self.softmax.shape, dtype="float32")
self.loss = np.zeros([self.batch_size, 1], dtype=softmax.dtype)
self.gradient = np.zeros(self.softmax.shape, dtype=softmax.dtype)
# float64
self.EXP_MAX = sys.float_info.max
......@@ -112,13 +112,15 @@ class CTCForward(object):
# calculate the forward and backward variables,
# reference Chapter 7.3 of "Alex Grave, Supervised Sequence
# Labelling with Recurrent Neural Networks"
log_acts = np.zeros([total_times, self.num_classes], dtype="float32")
log_acts = np.zeros(
[total_times, self.num_classes], dtype=softmax_a_sequence.dtype)
for i in range(total_times):
for j in range(self.num_classes):
log_acts[i, j] = self.safe_log(softmax_a_sequence[i, j])
# calculate the forward variables
forward_vars = np.zeros([total_times, total_segments], dtype="float32")
forward_vars = np.zeros(
[total_times, total_segments], dtype=softmax_a_sequence.dtype)
for i in range(total_times):
for j in range(total_segments):
forward_vars[i, j] = self.LOG_ZERO
......@@ -219,7 +221,7 @@ class TestWarpCTCOp(OpTest):
self.logits_lod[0][i])
self.gradient = np.zeros(
[max_sequence_length, self.batch_size, self.num_classes],
dtype="float32")
dtype=logits.dtype)
self.inputs = {
"Logits": (logits, self.logits_lod),
......@@ -287,7 +289,7 @@ class TestWarpCTCOpWithPadding(OpTest):
# reshape logits to T*N*S
new_logits = np.zeros(
[max_sequence_length, self.batch_size, self.num_classes],
dtype="float32")
dtype=logits.dtype)
cur = 0
for batch_id in range(self.batch_size):
......@@ -312,7 +314,7 @@ class TestWarpCTCOpWithPadding(OpTest):
self.gradient = np.zeros(
[max_sequence_length, self.batch_size, self.num_classes],
dtype="float32")
dtype=logits.dtype)
self.inputs = {
"Logits": new_logits,
......@@ -347,6 +349,90 @@ class TestWarpCTCOpWithPaddingCase1(TestWarpCTCOpWithPadding):
self.norm_by_times = False
class TestWarpCTCOpFp64(OpTest):
def config(self):
self.batch_size = 4
self.num_classes = 8
self.logits_lod = [[4, 1, 5, 5]]
self.labels_lod = [[3, 1, 4, 2]]
self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64)
self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64)
self.blank = self.num_classes - 1
self.norm_by_times = False
def setUp(self):
self.op_type = "warpctc"
self.config()
logits = np.random.uniform(
0.1, 1.0,
[sum(self.logits_length), self.num_classes]).astype("float64")
softmax = np.apply_along_axis(stable_softmax, 1, logits)
# labels should not be blank
labels = np.random.randint(
0,
self.num_classes - 1, [sum(self.labels_length), 1],
dtype="int32")
ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
self.num_classes, self.batch_size, self.blank,
self.norm_by_times)
loss = ctc.forward()
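# The reference loss and gradient follow softmax.dtype (float64 here), so this case exercises the fp64 kernel end to end.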
max_sequence_length = 0
for i in range(self.batch_size):
max_sequence_length = max(max_sequence_length,
self.logits_length[i])
# reshape logits to T*N*S
new_logits = np.zeros(
[max_sequence_length, self.batch_size, self.num_classes],
dtype=logits.dtype)
cur = 0
for batch_id in range(self.batch_size):
for i in range(self.logits_length[batch_id]):
for j in range(self.num_classes):
new_logits[i, batch_id, j] = logits[cur + i, j]
cur = cur + self.logits_length[batch_id]
# reshape labels to N*S
max_target_seq_length = 0
for i in range(self.batch_size):
max_target_seq_length = max(max_target_seq_length,
self.labels_length[i])
new_labels = np.zeros(
[self.batch_size, max_target_seq_length], dtype="int32")
cur = 0
for batch_id in range(self.batch_size):
for i in range(self.labels_length[batch_id]):
new_labels[batch_id, i] = labels[cur + i]
cur = cur + self.labels_length[batch_id]
self.gradient = np.zeros(
[max_sequence_length, self.batch_size, self.num_classes],
dtype=logits.dtype)
self.inputs = {
"Logits": new_logits,
"Label": new_labels,
"LogitsLength": self.logits_length,
"LabelLength": self.labels_length
}
self.outputs = {"Loss": loss}
self.attrs = {
"blank": self.blank,
"norm_by_times": self.norm_by_times,
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.outputs['WarpCTCGrad'] = self.gradient
self.check_grad(["Logits"], "Loss")
class TestWarpCTCOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
......@@ -359,7 +445,7 @@ class TestWarpCTCOpError(unittest.TestCase):
name='labels_length', shape=[None], dtype='int64')
def test_logits_Variable():
logits_data = np.random.rand(5, 16, 6).astype("float32")
logits_data = np.random.rand(5, 16, 6).astype(logits.dtype)
fluid.layers.warpctc(
input=logits_data,
label=label,
......@@ -398,6 +484,21 @@ class TestWarpCTCOpError(unittest.TestCase):
self.assertRaises(TypeError, test_label_len_Variable)
def test_dygraph_errors(self):
def test_dygraph_with_lod():
logits = np.random.uniform(0.1, 1.0, [20, 15]).astype("float32")
# labels should not be blank
labels = np.random.randint(0, 15 - 1, [15, 1], dtype="int32")
softmax = paddle.to_variable(logits)
labels = paddle.to_variable(labels)
fluid.layers.warpctc(input=softmax, label=labels)
paddle.disable_static()
self.assertRaises(ValueError, test_dygraph_with_lod)
paddle.enable_static()
class TestCTCLossAPICase(unittest.TestCase):
def test_functinal_api(self):
......
......@@ -63,7 +63,7 @@ class TestZerosLikeImpeartive(unittest.TestCase):
place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
) else fluid.CPUPlace()
paddle.disable_static(place)
x = paddle.to_variable(np.ones(shape))
x = paddle.to_tensor(np.ones(shape))
for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]:
out = zeros_like(x, dtype)
self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(),
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
sys.path.append("..")
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
from scipy.special import expit, erf
import paddle
import paddle.fluid as fluid
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.fluid import compiler, Program, program_guard
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUActivation(OpTest):
def setUp(self):
self.op_type = "exp"
self.init_dtype()
self.init_kernel_type()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.exp(x)
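# 'use_xpu': True marks the case so OpTest records it as an XPU test (see the op_test.py changes above).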
self.attrs = {'use_xpu': True}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def init_dtype(self):
self.dtype = np.float32
def test_check_output(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_output_with_place(place, atol=1e-3)
def init_kernel_type(self):
pass
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUSigmoid(TestXPUActivation):
def setUp(self):
self.op_type = "sigmoid"
self.init_dtype()
x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
out = 1 / (1 + np.exp(-x))
self.attrs = {'use_xpu': True}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_grad(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', max_relative_error=0.01)
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUTanh(TestXPUActivation):
def setUp(self):
self.op_type = "tanh"
self.init_dtype()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.tanh(x)
self.attrs = {'use_xpu': True}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUSqrt(TestXPUActivation):
def setUp(self):
self.op_type = "sqrt"
self.init_dtype()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.sqrt(x)
self.attrs = {'use_xpu': True}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUAbs(TestXPUActivation):
def setUp(self):
self.op_type = "abs"
self.init_dtype()
x = np.random.uniform(-1, 1, [4, 25]).astype(self.dtype)
# Because we set delta = 0.005 in calculating numeric gradient,
# if x is too small, such as 0.002, x_neg will be -0.003
# x_pos will be 0.007, so the numeric gradient is inaccurate.
# we should avoid this
x[np.abs(x) < 0.005] = 0.02
out = np.abs(x)
self.attrs = {'use_xpu': True}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPURelu(TestXPUActivation):
def setUp(self):
self.op_type = "relu"
self.init_dtype()
x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
# The same reason with TestAbs
x[np.abs(x) < 0.005] = 0.02
out = np.maximum(x, 0)
self.attrs = {'use_xpu': True}
self.inputs = {'X': x}
self.outputs = {'Out': out}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUGelu(TestXPUActivation):
def setUp(self):
self.op_type = "gelu"
self.init_dtype()
approximate = False
x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
out = gelu(x, approximate)
self.inputs = {'X': x}
self.outputs = {'Out': out}
self.attrs = {"approximate": approximate, 'use_xpu': True}
def gelu(x, approximate):
if approximate:
y_ref = 0.5 * x * (1.0 + np.tanh(
np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
else:
y_ref = 0.5 * x * (1 + erf(x / np.sqrt(2)))
return y_ref.astype(x.dtype)
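# Quick sanity check of the reference above (illustrative values, not part of the
# original file): gelu(0, False) == 0, and gelu(1, False) is about 0.8413, since
# 0.5 * (1 + erf(1 / sqrt(2))) = 0.5 * (1 + 0.6827) = 0.8413.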
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPULog(TestXPUActivation):
def setUp(self):
self.op_type = "log"
self.init_dtype()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.log(x)
self.attrs = {'use_xpu': True}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUSquare(TestXPUActivation):
def setUp(self):
self.op_type = "square"
self.init_dtype()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.square(x)
self.attrs = {'use_xpu': True}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUPow(TestXPUActivation):
def setUp(self):
self.op_type = "pow"
self.init_dtype()
x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
out = np.power(x, 3)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.attrs = {'factor': 3.0, 'use_xpu': True}
self.outputs = {'Out': out}
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
sys.path.append("..")
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
from op_test import OpTest, skip_check_grad_ci
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
class TestElementwiseAddOp(OpTest):
def init_kernel_type(self):
self.use_mkldnn = False
def setUp(self):
self.op_type = "elementwise_add"
self.init_dtype()
self.init_input_output()
self.init_kernel_type()
self.init_axis()
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
}
self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
self.outputs = {'Out': self.out}
def test_check_output(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode
self.check_output(check_dygraph=(self.use_mkldnn == False))
def test_check_grad_normal(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode
if self.dtype == np.float16:
return
self.check_grad(
['X', 'Y'], 'Out', check_dygraph=(self.use_mkldnn == False))
def test_check_grad_ingore_x(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode
if self.dtype == np.float16:
return
self.check_grad(
['Y'],
'Out',
no_grad_set=set("X"),
check_dygraph=(self.use_mkldnn == False))
def test_check_grad_ingore_y(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode
if self.dtype == np.float16:
return
self.check_grad(
['X'],
'Out',
no_grad_set=set('Y'),
check_dygraph=(self.use_mkldnn == False))
def init_input_output(self):
self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.out = np.add(self.x, self.y)
def init_dtype(self):
self.dtype = np.float64
def init_axis(self):
self.axis = -1
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUElementwiseAddOp(OpTest):
def setUp(self):
self.op_type = "elementwise_add"
self.init_dtype()
self.init_input_output()
self.init_axis()
self.inputs = {'X': self.x, 'Y': self.y}
self.attrs = {'axis': self.axis, 'use_mkldnn': False, 'use_xpu': True}
self.outputs = {'Out': self.out}
def test_check_output(self):
if self.dtype == np.float32 and paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_output_with_place(place)
def test_check_grad_normal(self):
if self.dtype == np.float32 and paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(place, ['X', 'Y'], 'Out')
def test_check_grad_ingore_x(self):
if self.dtype == np.float32 and paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(place, ['Y'], 'Out')
def test_check_grad_ingore_y(self):
if self.dtype == np.float32 and paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(place, ['X'], 'Out')
def init_input_output(self):
self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
self.out = np.add(self.x, self.y)
def init_dtype(self):
self.dtype = np.float32
def init_axis(self):
self.axis = -1
@skip_check_grad_ci(
reason="[skip shape check] Use y_shape(1) to test broadcast.")
class TestElementwiseAddOp_scalar(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4).astype(self.dtype)
self.y = np.random.rand(1).astype(self.dtype)
self.out = self.x + self.y
@skip_check_grad_ci(
reason="[skip shape check] Use y_shape(1,1) to test broadcast.")
class TestElementwiseAddOp_scalar2(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 4).astype(self.dtype)
self.y = np.random.rand(1, 1).astype(self.dtype)
self.out = self.x + self.y
class TestElementwiseAddOp_Vector(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.random((100, )).astype(self.dtype)
self.y = np.random.random((100, )).astype(self.dtype)
self.out = np.add(self.x, self.y)
class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(100, 2, 3).astype(self.dtype)
self.y = np.random.rand(100).astype(self.dtype)
self.out = self.x + self.y.reshape(100, 1, 1)
def init_axis(self):
self.axis = 0
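# Note (added for clarity): with a 1-D y and axis=0, elementwise_add aligns y's
# single dimension with dimension 0 of x, which is why the expected output above
# broadcasts y as y.reshape(100, 1, 1).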
class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 100, 3).astype(self.dtype)
self.y = np.random.rand(100).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 100, 1)
def init_axis(self):
self.axis = 1
class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 100).astype(self.dtype)
self.y = np.random.rand(100).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 1, 100)
class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
self.y = np.random.rand(10, 12).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 10, 12, 1)
def init_axis(self):
self.axis = 1
class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(100, 2, 3, 4).astype(self.dtype)
self.y = np.random.rand(100, 1).astype(self.dtype)
self.out = self.x + self.y.reshape(100, 1, 1, 1)
def init_axis(self):
self.axis = 0
class TestElementwiseAddOp_broadcast_5(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(10, 3, 12).astype(self.dtype)
self.y = np.random.rand(10, 1, 12).astype(self.dtype)
self.out = self.x + self.y
class TestElementwiseAddOp_broadcast_6(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype)
self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype)
self.out = self.x + self.y
class TestElementwiseAddOp_broadcast_7(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(1, 1, 20, 5).astype(self.dtype)
self.y = np.random.rand(20, 5, 1, 1).astype(self.dtype)
self.out = self.x + self.y
class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 10, 12).astype(self.dtype)
self.y = np.random.rand(10, 12).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 10, 12)
def init_axis(self):
self.axis = 1
@skip_check_grad_ci(
reason="[skip shape check] Use y_shape(1) to test broadcast.")
class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(100, 1).astype(self.dtype)
self.y = np.random.rand(1).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 1)
def init_axis(self):
self.axis = 1
class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(100, 2, 3).astype(self.dtype)
self.y = np.random.rand(100, 1, 1).astype(self.dtype)
self.out = self.x + self.y
def init_axis(self):
self.axis = -1
class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 3, 100).astype(self.dtype)
self.y = np.random.rand(1, 1, 100).astype(self.dtype)
self.out = self.x + self.y
def init_axis(self):
self.axis = -1
class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(10, 3, 1, 4).astype(self.dtype)
self.y = np.random.rand(10, 1, 12, 1).astype(self.dtype)
self.out = self.x + self.y
def init_axis(self):
self.axis = -1
class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(10, 12).astype(self.dtype)
self.y = np.random.rand(2, 3, 10, 12).astype(self.dtype)
self.out = self.x + self.y
def init_axis(self):
self.axis = 2
class TestElementwiseAddOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# the input of elementwise_add must be Variable.
x1 = fluid.create_lod_tensor(
np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
y1 = fluid.create_lod_tensor(
np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace())
self.assertRaises(TypeError, fluid.layers.elementwise_add, x1, y1)
# the input dtype of elementwise_add must be float16, float32, float64, int32 or int64
# float16 can only be used on a GPU place
x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="uint8")
y2 = fluid.layers.data(name='y2', shape=[3, 4, 5, 6], dtype="uint8")
self.assertRaises(TypeError, fluid.layers.elementwise_add, x2, y2)
class TestAddOp(unittest.TestCase):
def test_name(self):
with fluid.program_guard(fluid.Program()):
x = fluid.data(name="x", shape=[2, 3], dtype="float32")
y = fluid.data(name='y', shape=[2, 3], dtype='float32')
y_1 = paddle.add(x, y, name='add_res')
self.assertEqual(('add_res' in y_1.name), True)
def test_declarative(self):
with fluid.program_guard(fluid.Program()):
def gen_data():
return {
"x": np.array([2, 3, 4]).astype('float32'),
"y": np.array([1, 5, 2]).astype('float32')
}
x = fluid.data(name="x", shape=[3], dtype='float32')
y = fluid.data(name="y", shape=[3], dtype='float32')
z = paddle.add(x, y)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
z_value = exe.run(feed=gen_data(), fetch_list=[z.name])
z_expected = np.array([3., 8., 6.])
self.assertEqual((z_value == z_expected).all(), True)
def test_dygraph(self):
with fluid.dygraph.guard():
np_x = np.array([2, 3, 4]).astype('float64')
np_y = np.array([1, 5, 2]).astype('float64')
x = fluid.dygraph.to_variable(np_x)
y = fluid.dygraph.to_variable(np_y)
z = paddle.add(x, y)
np_z = z.numpy()
z_expected = np.array([3., 8., 6.])
self.assertEqual((np_z == z_expected).all(), True)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
sys.path.append("..")
import paddle.fluid.core as core
import unittest
import numpy as np
from op_test import OpTest
import paddle
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y):
BATCH_SIZE = 2
M = 3
N = 4
K = 5
if (dim_X == 1 and transpose_X) or (dim_Y == 1 and transpose_Y):
K = 1
if dim_X == 1:
if transpose_X:
shape_X = [M]
else:
shape_X = [K]
if dim_Y == 1:
if transpose_Y:
shape_Y = [N]
else:
shape_Y = [K]
if dim_X >= 2:
if transpose_X:
shape_X = [K, M]
else:
shape_X = [M, K]
if dim_X == 3:
shape_X = [BATCH_SIZE] + shape_X
if dim_Y >= 2:
if transpose_Y:
shape_Y = [N, K]
else:
shape_Y = [K, N]
if dim_Y == 3:
shape_Y = [BATCH_SIZE] + shape_Y
return shape_X, shape_Y
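# Worked example (illustrative only): generate_compatible_shapes(2, 3, False, True)
# returns ([3, 5], [2, 4, 5]), i.e. X is [M, K] and Y is [BATCH_SIZE, N, K],
# which matmul with transpose_Y=True treats as [BATCH_SIZE, K, N].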
def reference_matmul(X, Y, transpose_X=False, transpose_Y=False):
"""Reference forward implementation using np.matmul."""
# np.matmul does not support the transpose flags, so we manually
# transpose X and Y appropriately.
if transpose_X:
if X.ndim == 1:
X = X.reshape((X.size, 1))
elif X.ndim == 2:
X = X.T
else:
dim = [i for i in range(len(X.shape))]
dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1]
X = np.transpose(X, tuple(dim))
if transpose_Y:
if Y.ndim == 1:
Y = Y.reshape((1, Y.size))
else:
dim = [i for i in range(len(Y.shape))]
dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1]
Y = np.transpose(Y, tuple(dim))
Out = np.matmul(X, Y)
if not Out.shape:
# We do not support 0-dimensional Tensors (scalars). So where
# np.matmul outputs a scalar, we must convert to a Tensor of
# shape (1, ) instead.
# Everywhere else, we are compatible with np.matmul.
Out = np.array([Out], dtype="float32")
return Out
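# Minimal sanity check of the reference (illustrative values only): for
# X = np.ones((2, 3)) and Y = np.ones((3, 4)), reference_matmul(X, Y) has shape
# (2, 4) with every entry equal to 3.0; passing X.T with transpose_X=True gives
# the same result.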
class Generator(object):
def setUp(self):
self.op_type = "matmul"
X = np.random.random(self.shape_X).astype("float32")
Y = np.random.random(self.shape_Y).astype("float32")
Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y)
self.inputs = {'X': X, 'Y': Y}
self.attrs = {
'transpose_X': self.transpose_X,
'transpose_Y': self.transpose_Y
}
self.outputs = {'Out': Out}
def test_check_output(self):
self.check_output()
if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
self.inputs['Y'].shape) and self.inputs['X'].shape[
0] == self.inputs['Y'].shape[0]:
place = paddle.XPUPlace(0)
self.check_output_with_place(place, atol=1e-3)
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-3)
if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
self.inputs['Y'].shape) and self.inputs['X'].shape[
0] == self.inputs['Y'].shape[0]:
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=5e-2)
def test_check_grad_ignore_x(self):
self.check_grad(
['Y'], 'Out', max_relative_error=1e-3, no_grad_set=set("X"))
if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
self.inputs['Y'].shape) and self.inputs['X'].shape[
0] == self.inputs['Y'].shape[0]:
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'],
'Out',
max_relative_error=5e-2,
no_grad_set=set("X"))
def test_check_grad_ignore_y(self):
self.check_grad(
['X'], 'Out', max_relative_error=1e-3, no_grad_set=set('Y'))
if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len(
self.inputs['Y'].shape) and self.inputs['X'].shape[
0] == self.inputs['Y'].shape[0]:
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X'],
'Out',
max_relative_error=5e-2,
no_grad_set=set('Y'))
class TestMatmulOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# The input types of matmul_op must be Variable.
input1 = 12
self.assertRaises(TypeError, fluid.layers.matmul, input1, input1)
# The input dtype of matmul_op must be float32 or float64.
input2 = fluid.layers.data(
name='input2', shape=[10, 10], dtype="int32")
self.assertRaises(TypeError, fluid.layers.matmul, input2, input2)
input3 = fluid.layers.data(
name='input3', shape=[2, 2], dtype="float16")
fluid.layers.matmul(input3, input3)
# Negative dimension generation
def generate_negative_dims(in_shape):
from itertools import combinations
size = len(in_shape)
indexs = list()
shapes = list()
for i in range(size):
indexs.extend(list(combinations([j for j in range(size)], i + 1)))
for idx in indexs:
shapes.append(
[in_shape[i] if i not in idx else -1 for i in range(size)])
return shapes
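# For example (illustrative): generate_negative_dims([2, 3]) returns
# [[-1, 3], [2, -1], [-1, -1]], i.e. every non-empty subset of dimensions is
# replaced by -1, the "unknown at compile time" size accepted by fluid.data.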
# Build programs whose input shapes contain negative (unknown) dimensions
def test_negative_dims_program(obj):
for shape_x in generate_negative_dims(obj.shape_X):
for shape_y in generate_negative_dims(obj.shape_Y):
X = np.random.random(obj.shape_X).astype("float32")
Y = np.random.random(obj.shape_Y).astype("float32")
Ref = reference_matmul(X, Y, obj.transpose_X, obj.transpose_Y)
with program_guard(Program(), Program()):
x = fluid.data(name='x', shape=shape_x, dtype='float32')
y = fluid.data(name='y', shape=shape_y, dtype='float32')
output = fluid.layers.matmul(x, y, obj.transpose_X,
obj.transpose_Y)
obj.assertEqual(len(Ref.shape), len(output.shape))
for idx in range(len(Ref.shape)):
if output.shape[idx] != -1:
obj.assertEqual(Ref.shape[idx], output.shape[idx])
exe = fluid.Executor(fluid.CPUPlace())
res, = exe.run(fluid.default_main_program(),
feed={'x': X,
'y': Y},
fetch_list=[output])
# Assert the numeric result; a bare np.allclose call would silently discard it.
obj.assertTrue(np.allclose(res, Ref, atol=1e-5))
# Generate program API test cases for all negative-dimension possibilities
def api_test(dim_x, dim_y, trans_x, trans_y):
test_name = ('TestMatMulAPI_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
dim_x, dim_y, trans_x, trans_y))
shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x,
trans_y)
globals()[test_name] = type(test_name, (unittest.TestCase, ), {
'shape_X': shape_x,
'shape_Y': shape_y,
'transpose_X': trans_x,
'transpose_Y': trans_y,
'test_program': test_negative_dims_program,
})
# Generate operator test cases for all possibilities
def inject_test(dim_x, dim_y, trans_x, trans_y):
test_name = ('TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
dim_x, dim_y, trans_x, trans_y))
shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x,
trans_y)
globals()[test_name] = type(test_name, (Generator, OpTest), {
'shape_X': shape_x,
'shape_Y': shape_y,
'transpose_X': trans_x,
'transpose_Y': trans_y,
})
for dim_X in (1, 2, 3):
for dim_Y in (1, 2, 3):
for transpose_x in (False, True):
for transpose_y in (False, True):
inject_test(dim_X, dim_Y, transpose_x, transpose_y)
api_test(dim_X, dim_Y, transpose_x, transpose_y)
# Test case n-dim
def generate_compatible_shapes(dim, transpose_X, transpose_Y):
M = 2
N = 4
K = 3
shape_X = [2 for _ in range(dim - 2)]
shape_Y = [2 for _ in range(dim - 2)]
if transpose_X:
shape_X += [K, M]
else:
shape_X += [M, K]
if transpose_Y:
shape_Y += [N, K]
else:
shape_Y += [K, N]
return shape_X, shape_Y
# Test case n-dim
for dim in [4]:
for transpose_X in [False, True]:
for transpose_Y in [False, True]:
test_name = (
'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
dim, dim, transpose_X, transpose_Y))
shape_X, shape_Y = generate_compatible_shapes(dim, transpose_X,
transpose_Y)
globals()[test_name] = type(test_name, (Generator, OpTest), {
'shape_X': shape_X,
'shape_Y': shape_Y,
'transpose_X': transpose_X,
'transpose_Y': transpose_Y,
})
class API_TestMm(unittest.TestCase):
def test_out(self):
with fluid.program_guard(fluid.Program()):
x = fluid.data(name="x", shape=[2], dtype="float64")
y = fluid.data(name='y', shape=[2], dtype='float64')
res = fluid.data(name="output", shape=[1], dtype="float64")
result = paddle.mm(x, y)
exe = fluid.Executor(fluid.CPUPlace())
data1 = np.random.rand(2)
data2 = np.random.rand(2)
np_res = exe.run(feed={'x': data1, 'y': data2}, fetch_list=[result])
expected_result = np.matmul(
data1.reshape(1, 2), data2.reshape(2, 1))
self.assertTrue(
np.allclose(
np_res, expected_result, atol=1e-5),
"two value is\
{}\n{}, check diff!".format(np_res, expected_result))
def test_dygraph_without_out(self):
device = fluid.CPUPlace()
with fluid.dygraph.guard(device):
input_array1 = np.random.rand(3, 4).astype("float64")
input_array2 = np.random.rand(4, 3).astype("float64")
data1 = fluid.dygraph.to_variable(input_array1)
data2 = fluid.dygraph.to_variable(input_array2)
out = paddle.mm(data1, data2)
expected_result = np.matmul(input_array1, input_array2)
self.assertTrue(np.allclose(expected_result, out.numpy()))
class Test_API_Matmul(unittest.TestCase):
def test_dygraph_without_out(self):
device = fluid.CPUPlace()
with fluid.dygraph.guard(device):
input_array1 = np.random.rand(3, 4).astype("float64")
input_array2 = np.random.rand(4, 3).astype("float64")
data1 = fluid.dygraph.to_variable(input_array1)
data2 = fluid.dygraph.to_variable(input_array2)
out = paddle.matmul(data1, data2)
expected_result = np.matmul(input_array1, input_array2)
self.assertTrue(np.allclose(expected_result, out.numpy()))
class API_TestMmError(unittest.TestCase):
def test_errors(self):
def test_error1():
with fluid.program_guard(fluid.Program(), fluid.Program()):
data1 = fluid.data(name="data1", shape=[10, 2], dtype="float32")
data2 = fluid.data(name="data2", shape=[3, 10], dtype="float32")
paddle.mm(data1, data2)
self.assertRaises(ValueError, test_error1)
def test_error2():
with fluid.program_guard(fluid.Program(), fluid.Program()):
data1 = fluid.data(
name="data1", shape=[-1, 10, 2], dtype="float32")
data2 = fluid.data(
name="data2", shape=[-1, 2, 10], dtype="float32")
paddle.mm(data1, data2)
test_error2()
def test_error3():
with fluid.program_guard(fluid.Program(), fluid.Program()):
data1 = fluid.data(
name="data1", shape=[10, 10, 2], dtype="float32")
data2 = fluid.data(
name="data2", shape=[3, 2, 10], dtype="float32")
paddle.mm(data1, data2)
self.assertRaises(ValueError, test_error3)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
import sys
sys.path.append("..")
from op_test import OpTest
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
class TestMulOp(OpTest):
def setUp(self):
self.op_type = "mul"
self.dtype = np.float64
self.init_dtype_type()
self.inputs = {
'X': np.random.random((20, 5)).astype(self.dtype),
'Y': np.random.random((5, 21)).astype(self.dtype)
}
self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
def init_dtype_type(self):
pass
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out')
def test_check_grad_ingore_x(self):
self.check_grad(
['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
self.check_grad(
['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
class TestMulOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# The input type of mul_op must be Variable.
x1 = fluid.create_lod_tensor(
np.array([[-1]]), [[1]], fluid.CPUPlace())
x2 = fluid.create_lod_tensor(
np.array([[-1]]), [[1]], fluid.CPUPlace())
self.assertRaises(TypeError, fluid.layers.mul, x1, x2)
# The input dtype of mul_op must be float32 or float64.
x3 = fluid.layers.data(name='x3', shape=[4], dtype="int32")
x4 = fluid.layers.data(name='x4', shape=[4], dtype="int32")
self.assertRaises(TypeError, fluid.layers.mul, x3, x4)
class TestMulOp2(OpTest):
def setUp(self):
self.op_type = "mul"
self.dtype = np.float64
self.init_dtype_type()
self.inputs = {
'X': np.random.random((3, 4, 2, 9)).astype(self.dtype),
'Y': np.random.random((3, 6, 1, 2, 3)).astype(self.dtype)
}
self.attrs = {
'x_num_col_dims': 2,
'y_num_col_dims': 2,
}
result = np.dot(self.inputs['X'].reshape(3 * 4, 2 * 9),
self.inputs['Y'].reshape(3 * 6, 1 * 2 * 3))
result = result.reshape(3, 4, 1, 2, 3)
self.outputs = {'Out': result}
def init_dtype_type(self):
pass
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out')
def test_check_grad_ingore_x(self):
self.check_grad(
['Y'], 'Out', max_relative_error=0.5, no_grad_set=set('X'))
def test_check_grad_ignore_y(self):
self.check_grad(
['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUMulOp1(TestMulOp):
def init_dtype_type(self):
self.dtype = np.float32
def test_check_output(self):
place = paddle.XPUPlace(0)
self.check_output_with_place(place, atol=1e-1)
def test_check_grad_normal(self):
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=0.5)
def test_check_grad_ingore_x(self):
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestXPUMulOp2(TestMulOp2):
def init_dtype_type(self):
self.dtype = np.float32
def test_check_output(self):
place = paddle.XPUPlace(0)
self.check_output_with_place(place, atol=2e-1)
def test_check_grad_normal(self):
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X', 'Y'], 'Out', max_relative_error=0.9)
def test_check_grad_ingore_x(self):
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
def test_check_grad_ingore_y(self):
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y'))
if __name__ == "__main__":
unittest.main()
......@@ -16,12 +16,11 @@ from .metrics import *
from . import metrics
from ..fluid.layers.metric_op import accuracy, auc
from ..fluid.layers.nn import chunk_eval, cos_sim, mean_iou
from ..fluid.layers.nn import chunk_eval, mean_iou
__all__ = metrics.__all__ + [
'accuracy',
'auc',
'chunk_eval',
'cos_sim',
'mean_iou',
]
......@@ -38,11 +38,9 @@ from .clip import GradientClipByValue #DEFINE_ALIAS
# from .clip import set_gradient_clip #DEFINE_ALIAS
from .clip import clip #DEFINE_ALIAS
from .clip import clip_by_norm #DEFINE_ALIAS
from .control_flow import case #DEFINE_ALIAS
from .control_flow import cond #DEFINE_ALIAS
# from .control_flow import DynamicRNN #DEFINE_ALIAS
# from .control_flow import StaticRNN #DEFINE_ALIAS
from .control_flow import switch_case #DEFINE_ALIAS
from .control_flow import while_loop #DEFINE_ALIAS
# from .control_flow import rnn #DEFINE_ALIAS
# from .decode import BeamSearchDecoder #DEFINE_ALIAS
......
......@@ -13,18 +13,13 @@
# limitations under the License.
# TODO: define the control flow api
from ..fluid.layers import case #DEFINE_ALIAS
from ..fluid.layers import cond #DEFINE_ALIAS
from ..fluid.layers import while_loop #DEFINE_ALIAS
from ..fluid.layers import switch_case #DEFINE_ALIAS
__all__ = [
'case',
'cond',
# 'DynamicRNN',
# 'StaticRNN',
'switch_case',
'while_loop',
# 'rnn'
]
......@@ -933,7 +933,7 @@ def ctc_loss(log_probs,
is integrated into the Warp-CTC library to normalize values for each row of the input tensor.
Parameters:
log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type must be float32.
log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type should be float32 or float64.
labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32.
input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64.
label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64.
......
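For reference, a minimal ctc_loss call consistent with the docstring above could look like the sketch below; the shapes and values are invented for illustration and assume the paddle.nn.functional.ctc_loss signature documented here.
import numpy as np
import paddle
import paddle.nn.functional as F
paddle.disable_static()
# log_probs: [max_logit_length, batch_size, num_classes + 1], float32 or float64
log_probs = paddle.to_tensor(np.random.rand(5, 2, 4).astype('float32'))
# labels: [batch_size, max_label_length], int32, padded with the blank index 0
labels = paddle.to_tensor(np.array([[1, 2, 2], [1, 3, 0]], dtype='int32'))
input_lengths = paddle.to_tensor(np.array([5, 5], dtype='int64'))
label_lengths = paddle.to_tensor(np.array([3, 2], dtype='int64'))
loss = F.ctc_loss(log_probs, labels, input_lengths, label_lengths, blank=0, reduction='mean')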
......@@ -571,15 +571,26 @@ def max_pool1d(x,
padding = _expand_low_nd_padding(padding)
if in_dygraph_mode():
pool_out = core.ops.max_pool2d_with_index(
x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride,
'paddings', padding, 'padding_algorithm', padding_algorithm,
'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False,
'exclusive', True, 'data_format', data_format)
return (squeeze(pool_out[0], [2]), squeeze(
pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2])
if return_indices:
pool_out = core.ops.max_pool2d_with_index(
x, 'ksize', kernel_size, 'global_pooling', False, 'strides',
stride, 'paddings', padding, 'padding_algorithm',
padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
'use_mkldnn', False, 'exclusive', True, 'data_format',
data_format)
return (squeeze(pool_out[0], [2]), squeeze(
pool_out[1],
[2])) if return_indices else squeeze(pool_out[0], [2])
else:
pool_out = core.ops.pool2d(
x, 'pooling_type', 'max', 'ksize', kernel_size,
'global_pooling', False, 'padding_algorithm', padding_algorithm,
'strides', stride, 'paddings', padding, 'use_cudnn', True,
'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True,
'data_format', data_format)
return squeeze(pool_out, [2])
op_type = 'max_pool2d_with_index'
op_type = 'max_pool2d_with_index' if return_indices else "pool2d"
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
......@@ -696,7 +707,7 @@ def max_pool2d(x,
)
if in_dygraph_mode():
if data_format == "NCHW":
if return_indices:
output = core.ops.max_pool2d_with_index(
x, 'ksize', kernel_size, 'global_pooling', False, 'strides',
stride, 'paddings', padding, 'padding_algorithm',
......@@ -704,7 +715,7 @@ def max_pool2d(x,
'use_mkldnn', False, 'exclusive', True, 'data_format',
data_format)
return output if return_indices else output[0]
elif data_format == "NHWC" and not return_indices:
else:
output = core.ops.pool2d(
x, 'pooling_type', 'max', 'ksize', kernel_size,
'global_pooling', False, 'padding_algorithm', padding_algorithm,
......@@ -713,7 +724,7 @@ def max_pool2d(x,
'data_format', data_format)
return output
op_type = 'max_pool2d_with_index' if data_format == "NCHW" else "pool2d"
op_type = 'max_pool2d_with_index' if return_indices else "pool2d"
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
......@@ -822,7 +833,7 @@ def max_pool3d(x,
)
if in_dygraph_mode():
if data_format == "NCDHW":
if return_indices:
output = core.ops.max_pool3d_with_index(
x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides',
stride, 'paddings', padding, 'global_pooling', False,
......@@ -830,7 +841,7 @@ def max_pool3d(x,
'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True,
'data_format', data_format)
return output if return_indices else output[0]
elif data_format == "NDHWC" and not return_indices:
else:
output = core.ops.pool3d(
x, 'pooling_type', 'max', 'ksize', kernel_size,
'global_pooling', False, 'padding_algorithm', padding_algorithm,
......@@ -839,7 +850,7 @@ def max_pool3d(x,
'data_format', data_format)
return output
op_type = "max_pool3d_with_index" if data_format == "NCDHW" else "pool3d"
op_type = "max_pool3d_with_index" if return_indices else "pool3d"
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype()
pool_out = helper.create_variable_for_type_inference(dtype)
......
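With the dispatch fix above, the kernel is chosen by whether indices are requested rather than by data layout; a hedged usage sketch follows (argument names taken from this file, input values invented).
import numpy as np
import paddle
import paddle.nn.functional as F
paddle.disable_static()
x = paddle.to_tensor(np.random.rand(1, 3, 8).astype('float32'))  # NCL input
out = F.max_pool1d(x, kernel_size=2, stride=2)  # dispatches to the pool2d kernel
out, indices = F.max_pool1d(x, kernel_size=2, stride=2, return_indices=True)  # max_pool2d_with_index kernel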
......@@ -248,7 +248,7 @@ class Conv1d(_ConvNd):
padding = 0
if self._padding_mode != "zeros":
x = F.pad(x,
self._padding,
self._reversed_padding_repeated_twice,
mode=self._padding_mode,
data_format=self._data_format)
else:
......
......@@ -773,7 +773,7 @@ class CTCLoss(fluid.dygraph.Layer):
reduction (string, optional): Indicates how to average the loss; the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss is divided by label_lengths and the mean of the quotient is returned; if ``'sum'``, the sum of the loss is returned; if ``'none'``, no reduction is applied. Default is ``'mean'``.
Shape:
log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type must be float32.
log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type should be float32 or float64.
labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32.
input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64.
label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64.
......
......@@ -17,6 +17,7 @@ __all__ = [
'batch_norm',
'embedding',
'bilinear_tensor_product',
'case',
'conv2d',
'conv2d_transpose',
'conv3d',
......@@ -34,11 +35,13 @@ __all__ = [
'prelu',
'row_conv',
'spectral_norm',
'switch_case',
]
from ...fluid.layers import fc #DEFINE_ALIAS
from ...fluid.layers import batch_norm #DEFINE_ALIAS
from ...fluid.layers import bilinear_tensor_product #DEFINE_ALIAS
from ...fluid.layers import case #DEFINE_ALIAS
from ...fluid.layers import conv2d #DEFINE_ALIAS
from ...fluid.layers import conv2d_transpose #DEFINE_ALIAS
from ...fluid.layers import conv3d #DEFINE_ALIAS
......@@ -56,5 +59,6 @@ from ...fluid.layers import nce #DEFINE_ALIAS
from ...fluid.layers import prelu #DEFINE_ALIAS
from ...fluid.layers import row_conv #DEFINE_ALIAS
from ...fluid.layers import spectral_norm #DEFINE_ALIAS
from ...fluid.layers import switch_case #DEFINE_ALIAS
from ...fluid.input import embedding #DEFINE_ALIAS
......@@ -707,20 +707,14 @@ def cross(x, y, axis=None, name=None):
Examples:
.. code-block:: python
import paddle
from paddle import to_variable
import numpy as np
paddle.disable_static()
data_x = np.array([[1.0, 1.0, 1.0],
[2.0, 2.0, 2.0],
[3.0, 3.0, 3.0]])
data_y = np.array([[1.0, 1.0, 1.0],
[1.0, 1.0, 1.0],
[1.0, 1.0, 1.0]])
x = to_variable(data_x)
y = to_variable(data_y)
x = paddle.to_tensor([[1.0, 1.0, 1.0],
[2.0, 2.0, 2.0],
[3.0, 3.0, 3.0]])
y = paddle.to_tensor([[1.0, 1.0, 1.0],
[1.0, 1.0, 1.0],
[1.0, 1.0, 1.0]])
z1 = paddle.cross(x, y)
print(z1.numpy())
# [[-1. -1. -1.]
......
......@@ -1650,12 +1650,11 @@ def cumsum(x, axis=None, dtype=None, name=None):
.. code-block:: python
import paddle
from paddle import to_variable
import numpy as np
paddle.disable_static()
data_np = np.arange(12).reshape(3, 4)
data = to_variable(data_np)
data = paddle.to_tensor(data_np)
y = paddle.cumsum(data)
print(y.numpy())
......
......@@ -286,7 +286,7 @@ fi
# Get the list of PR authors with unresolved unit test issues
pip install PyGithub
# For getting PR related data
wget https://paddle-ci.gz.bcebos.com/blk/block.txt --no-check-certificate
wget https://sys-p0.bj.bcebos.com/blk/block.txt --no-check-certificate
wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt --no-check-certificate
HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true`
if [ "${HASUTFIXED}" != "" ]; then
......
......@@ -251,9 +251,10 @@
"BilinearTensorProduct",
"GroupNorm",
"SpectralNorm",
"TreeConv",
"TreeConv"
],
"wlist_temp":[
"prroi_pool",
"to_tensor",
"ChunkEvaluator",
"EditDistance",
"ErrorClipByValue",
......@@ -406,7 +407,9 @@
"TransformerDecoder.prepare_incremental_cache",
"LinearChainCRF.forward",
"CRFDecoding.forward",
"SequenceTagging.forward"
"SequenceTagging.forward",
"XPUPlace",
"is_compiled_with_xpu"
],
"gpu_not_white":[
"deformable_conv",
......