Commit d0c9f7e8 authored by S smallv0221

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into yxp0925

Merge branch 'develop'.
@@ -721,6 +721,7 @@ function(proto_library TARGET_NAME)
   set(proto_hdrs)
   paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
   cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
+  add_dependencies(extern_xxhash ${TARGET_NAME})
 endfunction()
 function(py_proto_compile TARGET_NAME)
......
@@ -39,6 +39,7 @@ set(third_party_deps)
 #    REPOSITORY   ${TARGET_REPOSITORY}
 #    TAG          ${TARGET_TAG}
 #    DIR          ${TARGET_SOURCE_DIR})
 FUNCTION(cache_third_party TARGET)
     SET(options "")
     SET(oneValueArgs URL REPOSITORY TAG DIR)
@@ -269,6 +270,10 @@ if(WITH_PSLIB)
     endif()
 endif(WITH_PSLIB)
+if(NOT WIN32 AND NOT APPLE)
+    include(external/gloo)
+    list(APPEND third_party_deps extern_gloo)
+endif()
 if(WITH_BOX_PS)
     include(external/box_ps)
@@ -276,10 +281,6 @@ if(WITH_BOX_PS)
 endif(WITH_BOX_PS)
 if(WITH_DISTRIBUTE)
-    if(WITH_GLOO)
-        include(external/gloo)
-        list(APPEND third_party_deps extern_gloo)
-    endif()
     if(WITH_GRPC)
         list(APPEND third_party_deps extern_grpc)
......
@@ -74,7 +74,9 @@ class PullDenseWorker {
   virtual void Initialize(const TrainerDesc& param);
 #ifdef PADDLE_WITH_CUDA
   void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
+#endif
+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
   void AddPlace(const paddle::platform::Place place) {
     places_.push_back(place);
   }
@@ -135,9 +137,9 @@ class PullDenseWorker {
 #ifdef PADDLE_WITH_CUDA
   std::vector<cudaStream_t> copy_streams_;
+#endif
   std::vector<paddle::platform::Place> places_;
   std::vector<Scope*> thread_scopes_;
-#endif
 };
 // should incorporate different type of device
@@ -161,6 +163,7 @@ class DeviceWorker {
   virtual void SetDataFeed(DataFeed* data_feed);
   virtual void SetWorkerNum(int num) {}
   virtual void CacheProgram(const ProgramDesc& main_program) {}
+  virtual void GetXpuOpIndex() {}
   virtual void SetNeedDumpField(bool need_dump_field) {
     need_dump_field_ = need_dump_field;
   }
......
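The new GetXpuOpIndex() virtual added above defaults to a no-op, so existing DeviceWorker subclasses keep compiling while XPU-aware workers can override it. A self-contained sketch of that base-hook pattern (class names here are illustrative, not from this commit):

#include <iostream>

// Sketch of the no-op virtual hook pattern used by DeviceWorker above:
// the base class supplies an empty default, so only device-specific
// subclasses need to opt in.
class WorkerBase {
 public:
  virtual ~WorkerBase() = default;
  virtual void GetXpuOpIndex() {}  // default: do nothing
};

class XpuAwareWorker : public WorkerBase {
 public:
  void GetXpuOpIndex() override {
    std::cout << "collecting XPU op indices\n";  // stand-in for real logic
  }
};

int main() {
  XpuAwareWorker worker;
  WorkerBase* base = &worker;
  base->GetXpuOpIndex();  // virtual dispatch reaches the XPU override
}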
@@ -127,6 +127,7 @@ message DistributedStrategy {
   optional int32 conv_workspace_size_limit = 22 [ default = 4000 ];
   optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];
   optional bool adaptive_localsgd = 24 [ default = false ];
+  optional bool fp16_allreduce = 25 [ default = false ];
   optional RecomputeConfig recompute_configs = 101;
   optional AMPConfig amp_configs = 102;
......
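Since field 25 is optional with an explicit default, readers see false until a caller opts in. A hedged sketch of the generated C++ accessors (the include path and namespace are assumptions based on where the proto lives, not confirmed by this diff):

#include "paddle/fluid/framework/distributed_strategy.pb.h"  // assumed path

// Sketch: toggling the new flag through the protobuf-generated API.
// The namespace paddle::fleet is an assumption from the proto package.
void EnableFp16AllReduce(paddle::fleet::DistributedStrategy* strategy) {
  // Reading before any set returns the declared default (false).
  if (!strategy->fp16_allreduce()) {
    strategy->set_fp16_allreduce(true);  // opt in to FP16 gradient allreduce
  }
}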
@@ -745,7 +745,57 @@ void FleetWrapper::PushDenseVarsAsync(
     push_sparse_status->push_back(std::move(status));
   }
 }
+#endif
+#ifdef PADDLE_WITH_XPU
+void FleetWrapper::PushDenseVarsAsync(
+    const Scope& scope, const uint64_t table_id,
+    const std::vector<std::string>& var_names,
+    std::vector<::std::future<int32_t>>* push_sparse_status,
+    float scale_datanorm, int batch_size,
+    const paddle::platform::Place& place) {
+#ifdef PADDLE_WITH_PSLIB
+  std::vector<paddle::ps::Region> regions;
+  for (auto& t : var_names) {
+    Variable* var = scope.FindVar(t);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int count = tensor->numel();
+    float* g_data = tensor->data<float>();
+    Variable* pin_var = scope.FindVar(t + "pin");
+    LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
+    float* pin_g =
+        pin_tensor->mutable_data<float>(tensor->dims(), platform::CPUPlace());
+    memory::Copy(platform::CPUPlace(), pin_g,
+                 BOOST_GET_CONST(platform::XPUPlace, place), g_data,
+                 sizeof(float) * count);
+    float* g = pin_g;
+    if (scale_datanorm >= 0) {
+      if (t.find(".batch_size@GRAD") != std::string::npos ||
+          t.find(".batch_sum@GRAD") != std::string::npos) {
+        Eigen::Map<Eigen::MatrixXf> mat(g, 1, count);
+        float scale = 1.0 / batch_size;
+        mat *= scale;
+      } else if (t.find(".batch_square_sum@GRAD") != std::string::npos) {
+        VLOG(3) << "epsilon: " << scale_datanorm;
+        for (int i = 0; i < count; ++i) {
+          g[i] = (g[i] - batch_size * scale_datanorm) / batch_size +
+                 batch_size * scale_datanorm;
+        }
+      }
+    }
+    paddle::ps::Region reg(g, count);
+    regions.emplace_back(std::move(reg));
+  }
+  auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(),
+                                                    regions.size(), table_id);
+  if (push_sparse_status) {
+    push_sparse_status->push_back(std::move(status));
+  }
+#endif
+}
 #endif
 void FleetWrapper::PushDenseVarsAsync(
     const Scope& scope, const uint64_t table_id,
......
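The XPU overload mirrors the CUDA one but drops the stream and event parameters: the gradient is staged through the host-side "<var>pin" tensor with a synchronous memory::Copy, so there is nothing stream-ordered to track. A self-contained sketch of that overload split, with framework types reduced to stand-ins:

#include <cstdio>

// Self-contained sketch of the split above: the CUDA variant carries
// stream/event parameters, the XPU variant does not. Types are stand-ins
// for the framework's stream and event types.
struct Stream {};
struct Event {};

void PushDenseVarsAsync(int table_id, Stream, Event) {
  std::printf("CUDA path: async push on table %d\n", table_id);
}
void PushDenseVarsAsync(int table_id) {
  std::printf("XPU path: host-staged push on table %d\n", table_id);
}

int main() {
#ifdef PADDLE_WITH_CUDA
  PushDenseVarsAsync(1, Stream{}, Event{});
#else
  PushDenseVarsAsync(1);  // XPU/CPU builds take the stream-free overload
#endif
}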
@@ -160,6 +160,14 @@ class FleetWrapper {
                           float scale_datanorm, int batch_size,
                           const paddle::platform::Place& place, cudaStream_t stream,
                           cudaEvent_t event);
+#endif
+#ifdef PADDLE_WITH_XPU
+  void PushDenseVarsAsync(
+      const Scope& scope, const uint64_t table_id,
+      const std::vector<std::string>& var_names,
+      std::vector<::std::future<int32_t>>* push_sparse_status,
+      float scale_datanorm, int batch_size,
+      const paddle::platform::Place& place);
 #endif
   void PushDenseVarsAsync(
       const Scope& scope, const uint64_t table_id,
......
@@ -113,30 +113,66 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
   if (platform::is_cpu_place(tensor->place())) {
     memcpy(data_ptr, tensor->data<void>(),
            tensor->numel() * SizeOfType(tensor->type()));
-#ifdef PADDLE_WITH_CUDA
   } else {
+#ifdef PADDLE_WITH_CUDA
     memory::Copy(platform::CPUPlace(), data_ptr,
                  BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
                  tensor->data<void>(),
                  tensor->numel() * SizeOfType(tensor->type()), nullptr);
-  }
-#else
-  }
 #endif
+#ifdef PADDLE_WITH_XPU
+    memory::Copy(platform::CPUPlace(), data_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, tensor->place()),
+                 tensor->data<void>(),
+                 tensor->numel() * SizeOfType(tensor->type()));
+#endif
+  }
 }
-// void HeterWrapper::DeSerializeToTensor(Scope* scope,
-// const HeterRequest* request) {
 #ifdef PADDLE_WITH_CUDA
 void HeterWrapper::DeSerializeToTensor(Scope* scope,
                                        const VariableMessage& req_var,
                                        platform::Place place,
                                        cudaStream_t stream) {
+  // const VariableMessage& req_var = request->vars();
+  auto* var = scope->FindVar(req_var.varname());
+  auto* tensor = var->GetMutable<LoDTensor>();
+  std::vector<int> vec_dim;
+  for (auto& x : req_var.dims()) {
+    vec_dim.push_back(x);
+  }
+  tensor->Resize(make_ddim(vec_dim));
+  LoD lod;
+  for (int i = 0; i < req_var.lod_level(); ++i) {
+    framework::Vector<size_t> v;
+    for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) {
+      v.push_back(req_var.lod(i).lod_data(j));
+    }
+    lod.push_back(v);
+  }
+  tensor->set_lod(lod);
+  void* tensor_data =
+      tensor->mutable_data(place, ToVarType(req_var.data_type()));
+#ifdef PADDLE_WITH_CUDA
+  memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
+               platform::CPUPlace(), req_var.data().data(),
+               tensor->numel() * SizeOfType(tensor->type()), stream);
 #else
+  memcpy(tensor_data, req_var.data().data(),
+         tensor->numel() * SizeOfType(tensor->type()));
+#endif
+}
+#endif
+// void HeterWrapper::DeSerializeToTensor(Scope* scope,
+// const HeterRequest* request) {
 void HeterWrapper::DeSerializeToTensor(Scope* scope,
                                        const VariableMessage& req_var,
                                        platform::Place place) {
-#endif
   // const VariableMessage& req_var = request->vars();
   auto* var = scope->FindVar(req_var.varname());
   auto* tensor = var->GetMutable<LoDTensor>();
@@ -160,10 +196,10 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
   void* tensor_data =
       tensor->mutable_data(place, ToVarType(req_var.data_type()));
-#ifdef PADDLE_WITH_CUDA
-  memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
+#ifdef PADDLE_WITH_XPU
+  memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place), tensor_data,
                platform::CPUPlace(), req_var.data().data(),
-               tensor->numel() * SizeOfType(tensor->type()), stream);
+               tensor->numel() * SizeOfType(tensor->type()));
 #else
   memcpy(tensor_data, req_var.data().data(),
          tensor->numel() * SizeOfType(tensor->type()));
@@ -184,7 +220,8 @@ framework::proto::VarType::Type HeterWrapper::ToVarType(
     case VariableMessage::BOOL:
       return framework::proto::VarType::BOOL;  // NOLINT
     default:
-      VLOG(0) << "Not support type " << type;
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "ToVarType:Unsupported type %d", type));
   }
 }
......
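The default-case change above is more than cosmetic: with only a VLOG(0), ToVarType could fall off the end of a non-void function on an unknown enum value, which is undefined behavior; throwing makes the failure loud and keeps every path either returning or raising. A standalone sketch of the fixed pattern:

#include <stdexcept>
#include <string>

enum class WireType { FP32, INT64, BOOL, UNKNOWN };
enum class VarType { FP32, INT64, BOOL };

// Every case returns and the default throws, so control can never fall
// off the end of this non-void function.
VarType ToVarType(WireType type) {
  switch (type) {
    case WireType::FP32:
      return VarType::FP32;
    case WireType::INT64:
      return VarType::INT64;
    case WireType::BOOL:
      return VarType::BOOL;
    default:
      throw std::invalid_argument("ToVarType: unsupported type " +
                                  std::to_string(static_cast<int>(type)));
  }
}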
@@ -12,9 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+#include <cstdlib>
+#include <ctime>
+#include <string>
+#include <vector>
+#include "io/fs.h"
+#include "paddle/fluid/framework/data_feed_factory.h"
+#include "paddle/fluid/framework/data_set.h"
+#include "paddle/fluid/framework/device_worker_factory.h"
+#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
+#include "paddle/fluid/framework/trainer.h"
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+    (defined PADDLE_WITH_PSLIB)
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
 namespace paddle {
 namespace framework {
@@ -34,6 +46,7 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
   int place_num = trainer_desc.worker_places_size();
   for (int i = 0; i < place_num; ++i) {
     int num = trainer_desc.worker_places(i);
+#ifdef PADDLE_WITH_CUDA
     platform::CUDAPlace place = platform::CUDAPlace(num);
     platform::CUDADeviceGuard guard(place.device);
     cudaStream_t stream;
@@ -44,6 +57,11 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
     events_.push_back(event);
+#endif
+#ifdef PADDLE_WITH_XPU
+    platform::XPUPlace place = platform::XPUPlace(num);
+    places_.push_back(place);
+#endif
   }
   // thread_num_ = trainer_desc.thread_num();
   // SetDataset(dataset);
@@ -95,11 +113,17 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
 void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
   auto place = places_[num];
   Scope* scope = place_scopes_[num];
+#ifdef PADDLE_WITH_CUDA
   auto stream = copy_streams_[num];
   auto event = events_[num];
   auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
   platform::CUDADeviceGuard guard(dev_id);
+#endif
+#ifdef PADDLE_WITH_XPU
+  xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
+#endif
   auto& block = program.Block(0);
   for (auto& var : block.AllVars()) {
     if (var->Persistable()) {
@@ -116,13 +140,28 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
       HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place, stream); \
     }                                                                   \
   } while (0)
+#define HeterMemcpyXpuFunc(cpp_type, proto_type)                  \
+  do {                                                            \
+    if (root_tensor->type() == proto_type) {                      \
+      HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place);   \
+    }                                                             \
+  } while (0)
+#ifdef PADDLE_WITH_CUDA
       _ForEachDataType_(HeterMemcpyFunc);
+#endif
+#ifdef PADDLE_WITH_XPU
+      _ForEachDataType_(HeterMemcpyXpuFunc);
+#endif
     }
   }
+#ifdef PADDLE_WITH_CUDA
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
   cudaEventSynchronize(event);
+#endif
 }
+#ifdef PADDLE_WITH_CUDA
 template <typename T>
 void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
                                   LoDTensor* root_tensor,
@@ -141,6 +180,27 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
                  root_ptr, sizeof(T) * root_tensor->numel(), stream);
   }
 }
+#endif
+#ifdef PADDLE_WITH_XPU
+template <typename T>
+void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
+                                  LoDTensor* root_tensor,
+                                  const paddle::platform::Place& thread_place) {
+  T* thread_ptr =
+      thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
+  T* root_ptr = root_tensor->data<T>();
+  if (platform::is_cpu_place(root_tensor->place())) {
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr,
+                 platform::CPUPlace(), root_ptr,
+                 sizeof(T) * root_tensor->numel());
+  } else {
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr,
+                 BOOST_GET_CONST(platform::XPUPlace, root_tensor->place()),
+                 root_ptr, sizeof(T) * root_tensor->numel());
+  }
+}
+#endif
 void HeterXpuTrainer::DumpWork(int tid) {}
@@ -171,13 +231,16 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
     CreateThreadParam(main_program, i);
     pull_dense_worker_->AddThreadScope(scope);
     pull_dense_worker_->AddPlace(places_[i]);
+#ifdef PADDLE_WITH_CUDA
     pull_dense_worker_->AddStream(copy_streams_[i]);
+#endif
   }
   pull_dense_worker_->Start();
+#ifdef PADDLE_WITH_CUDA
   for (auto& stream : copy_streams_) {
     cudaStreamSynchronize(stream);
   }
+#endif
   op_names_.clear();
   for (auto& op_desc : block.AllOps()) {
     std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
@@ -220,10 +283,12 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
       OperatorBase* local_op_ptr = local_op.release();
       (context->ops_).push_back(local_op_ptr);
     }
+#ifdef PADDLE_WITH_CUDA
     auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
     platform::CUDADeviceGuard guard(dev_id);
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
+#endif
     object_pool_.Push(context);
   }
 }
@@ -267,12 +332,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
       }                                                                 \
     } while (0)
     _ForEachDataType_(MergeCallback);
-    if (platform::is_gpu_place(thread_tensor->place())) {
+    if (!platform::is_cpu_place(thread_tensor->place())) {
+#ifdef PADDLE_WITH_CUDA
       auto dev_id =
           BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device;
       platform::CUDADeviceGuard guard(dev_id);
      cudaMemset(thread_tensor->data<void>(), 0,
                 thread_tensor->numel() * SizeOfType(thread_tensor->type()));
+#endif
+#ifdef PADDLE_WITH_XPU
+      auto place = thread_tensor->place();
+      xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      platform::DeviceContext* dev_ctx = pool.Get(place);
+      const platform::XPUDeviceContext* xpu_ctx =
+          reinterpret_cast<const platform::XPUDeviceContext*>(dev_ctx);
+      xpu::memset(xpu_ctx->x_context(), thread_tensor->data<void>(), 0,
+                  thread_tensor->numel() * SizeOfType(thread_tensor->type()));
+#endif
     } else {
       memset(thread_tensor->data<void>(), 0,
              thread_tensor->numel() * SizeOfType(thread_tensor->type()));
@@ -281,12 +359,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
     auto* merge_var = response->add_vars();
     heter_ptr_->SerializeToReq(need_merge_var_names_[i], root_scope_,
                                merge_var);
-    if (platform::is_gpu_place(root_tensor->place())) {
+    if (!platform::is_cpu_place(root_tensor->place())) {
+#ifdef PADDLE_WITH_CUDA
      auto dev_id =
          BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device;
      platform::CUDADeviceGuard guard(dev_id);
      cudaMemset(root_tensor->data<void>(), 0,
                 root_tensor->numel() * SizeOfType(root_tensor->type()));
+#endif
+#ifdef PADDLE_WITH_XPU
+      auto place = root_tensor->place();
+      xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device);
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      platform::DeviceContext* dev_ctx = pool.Get(place);
+      const platform::XPUDeviceContext* xpu_ctx =
+          reinterpret_cast<const platform::XPUDeviceContext*>(dev_ctx);
+      xpu::memset(xpu_ctx->x_context(), root_tensor->data<void>(), 0,
+                  root_tensor->numel() * SizeOfType(root_tensor->type()));
+#endif
     } else {
      memset(root_tensor->data<void>(), 0,
             root_tensor->numel() * SizeOfType(root_tensor->type()));
@@ -346,11 +437,12 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
       OperatorBase* local_op_ptr = local_op.release();
       (context->ops_).push_back(local_op_ptr);
     }
+#ifdef PADDLE_WITH_CUDA
     auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
     platform::CUDADeviceGuard guard(dev_id);
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
+#endif
   }
   context->Reset();
@@ -359,15 +451,22 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
     auto deserial_timer =
         std::make_shared<paddle::ps::CostTimer>("xpu_service_deserial");
     for (int i = 0; i < request->vars_size(); ++i) {
+#ifdef PADDLE_WITH_CUDA
       heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place,
                                       copy_streams_[context->place_num_]);
+#endif
+#ifdef PADDLE_WITH_XPU
+      heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place);
+#endif
     }
+#ifdef PADDLE_WITH_CUDA
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventRecord(context->event_, copy_streams_[context->place_num_]));
     while (cudaEventQuery(context->event_) != cudaSuccess) {
       VLOG(3) << "wait for kernel";
       bthread_yield();
     }
+#endif
   }
   {
@@ -378,6 +477,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
       op->Run(*(context->scope_), place);
     }
   }
+#ifdef PADDLE_WITH_CUDA
   auto* dev_ctx = static_cast<platform::CUDADeviceContext*>(
       platform::DeviceContextPool::Instance().Get(place));
   PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -391,6 +491,10 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
       bthread_yield();
     }
   }
+#endif
+#ifdef PADDLE_WITH_XPU
+  xpu_wait();
+#endif
   for (int i = 0; i < trainer_desc_.xpu_send_list_size(); ++i) {
     const std::string& varname = trainer_desc_.xpu_send_list(i);
@@ -407,11 +511,19 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
        ++i) {
     uint64_t tid =
         static_cast<uint64_t>(param_.program_config(0).push_dense_table_id(i));
+#ifdef PADDLE_WITH_CUDA
     fleet_ptr_->PushDenseVarsAsync(
         *(context->scope_), tid, dense_grad_names_[tid],
         &(context->push_dense_status_), scale_datanorm_, request->cur_batch(),
         places_[context->place_num_], copy_streams_[context->place_num_],
         context->event_);
+#endif
+#ifdef PADDLE_WITH_XPU
+    fleet_ptr_->PushDenseVarsAsync(
+        *(context->scope_), tid, dense_grad_names_[tid],
+        &(context->push_dense_status_), scale_datanorm_, request->cur_batch(),
+        places_[context->place_num_]);
+#endif
   }
   for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
        ++i) {
@@ -453,7 +565,6 @@ void HeterXpuTrainer::Finalize() {
   pull_dense_worker_->Stop();
   root_scope_->DropKids();
 }
 }  // namespace framework
 }  // namespace paddle
 #endif
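Two recurring moves in this file are worth noting: the guard is_gpu_place(...) becomes !is_cpu_place(...) so one branch serves both device families, and the device-specific body inside that branch is then chosen by preprocessor guards. A self-contained sketch of that dispatch shape (the device calls are left as comments since they need the CUDA/XPU runtimes):

#include <cstring>

enum class Place { kCPU, kDevice };  // stand-in for platform::Place

// Sketch of the EndPass zeroing dispatch: a single "not CPU" branch whose
// body is selected at compile time, with plain memset as the host path.
void ZeroBuffer(Place place, void* data, size_t bytes) {
  if (place != Place::kCPU) {
#if defined(PADDLE_WITH_CUDA)
    // cudaMemset(data, 0, bytes);        // GPU build zeroes on device
#elif defined(PADDLE_WITH_XPU)
    // xpu::memset(ctx, data, 0, bytes);  // XPU build needs a device context
#endif
  } else {
    std::memset(data, 0, bytes);
  }
}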
@@ -14,6 +14,7 @@
 #include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h"
 #include <vector>
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
@@ -103,12 +104,32 @@ REGISTER_PASS(conv_activation_mkldnn_fuse_pass,
 REGISTER_PASS(conv_relu_mkldnn_fuse_pass,
               paddle::framework::ir::ConvActivationFusePass);
+REGISTER_PASS_CAPABILITY(conv_relu_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("relu", 0));
 REGISTER_PASS(conv_leaky_relu_mkldnn_fuse_pass,
               paddle::framework::ir::Conv2DLeakyReLUFusePass);
+REGISTER_PASS_CAPABILITY(conv_leaky_relu_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .LE("leaky_relu", 1));
 REGISTER_PASS(conv_relu6_mkldnn_fuse_pass,
               paddle::framework::ir::Conv2DReLU6FusePass);
+REGISTER_PASS_CAPABILITY(conv_relu6_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("relu6", 0));
 REGISTER_PASS(conv_swish_mkldnn_fuse_pass,
               paddle::framework::ir::Conv2DSwishFusePass);
+REGISTER_PASS_CAPABILITY(conv_swish_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("swish", 0));
@@ -14,6 +14,7 @@
 #include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h"
 #include <vector>
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
@@ -123,3 +124,10 @@ void ConvConcatReLUFusePass::ApplyImpl(ir::Graph* graph) const {
 REGISTER_PASS(conv_concat_relu_mkldnn_fuse_pass,
               paddle::framework::ir::ConvConcatReLUFusePass);
+REGISTER_PASS_CAPABILITY(conv_concat_relu_mkldnn_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("conv2d", 0)
+            .EQ("concat", 0)
+            .EQ("relu", 0));
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h"
 #include <paddle/fluid/string/pretty_log.h>
 #include <vector>
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
@@ -98,3 +99,10 @@ void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const {
 REGISTER_PASS(matmul_transpose_reshape_fuse_pass,
               paddle::framework::ir::MatmulTransposeReshapeMKLDNNPass);
+REGISTER_PASS_CAPABILITY(matmul_transpose_reshape_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("matmul", 0)
+            .EQ("transpose", 0)
+            .EQ("reshape", 0));
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/string/pretty_log.h"
 namespace paddle {
@@ -90,3 +91,9 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const {
 REGISTER_PASS(scale_matmul_fuse_pass,
               paddle::framework::ir::ScaleMatmulFusePass);
+REGISTER_PASS_CAPABILITY(scale_matmul_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("scale", 0)
+            .EQ("matmul", 0));
@@ -17,6 +17,7 @@
 #include <string>
 #include <unordered_set>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 namespace paddle {
 namespace framework {
@@ -255,3 +256,15 @@ void SeqConcatFcFusePass::ApplyImpl(ir::Graph* graph) const {
 REGISTER_PASS(seq_concat_fc_fuse_pass,
               paddle::framework::ir::SeqConcatFcFusePass);
+REGISTER_PASS_CAPABILITY(seq_concat_fc_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("sequence_expand", 0)
+            .EQ("concat", 0)
+            .EQ("mul", 0)
+            .EQ("elementwise_add", 0)
+            .EQ("sigmoid", 0)
+            .EQ("tanh", 0)
+            .EQ("relu", 0)
+            .EQ("identity", 0)
+            .EQ("fusion_seqexpand_concat_fc", 0));
@@ -62,13 +62,15 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
   fleet_ptr_ = FleetWrapper::GetInstance();
 #ifdef PADDLE_WITH_CUDA
   copy_streams_.clear();
+#endif
+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
   places_.clear();
   thread_scopes_.clear();
 #endif
 }
 void PullDenseWorker::CreatePinVar() {
-#ifdef PADDLE_WITH_CUDA
+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_PSLIB)
   // for (auto& v : dense_value_names_) {
   //  for (auto& name : v.second) {
   for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size();
@@ -83,8 +85,13 @@ void PullDenseWorker::CreatePinVar() {
       auto* ptr = root_scope_->Var(name + "pin");
       InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
       LoDTensor* pin_tensor = ptr->GetMutable<LoDTensor>();
+#ifdef PADDLE_WITH_CUDA
       pin_tensor->mutable_data<float>(tensor->dims(),
                                       platform::CUDAPinnedPlace());
+#endif
+#ifdef PADDLE_WITH_XPU
+      pin_tensor->mutable_data<float>(tensor->dims(), platform::CPUPlace());
+#endif
     }
   }
 #endif
@@ -107,7 +114,7 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
     exit(-1);
   }
   status_vec->resize(0);
-#ifdef PADDLE_WITH_CUDA
+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
   for (size_t i = 0; i < places_.size(); ++i) {
     // for (auto& v : dense_value_names_) {
@@ -125,9 +132,16 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
       Variable* var = thread_scopes_[i]->FindVar(name);
       LoDTensor* tensor = var->GetMutable<LoDTensor>();
      float* w = tensor->data<float>();
+#ifdef PADDLE_WITH_CUDA
       memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w,
                    platform::CUDAPinnedPlace(), pin_w,
                    sizeof(float) * tensor->numel(), copy_streams_[i]);
+#endif
+#ifdef PADDLE_WITH_XPU
+      memory::Copy(BOOST_GET_CONST(platform::XPUPlace, places_[i]), w,
+                   platform::CPUPlace(), pin_w,
+                   sizeof(float) * tensor->numel());
+#endif
     }
   }
 }
@@ -148,7 +162,7 @@ void PullDenseWorker::PullDense(bool force_update) {
     uint64_t tid = static_cast<uint64_t>(
         dwp_param_.program_config(0).pull_dense_table_id(i));
     if (force_update || CheckUpdateParam(tid)) {
-#ifdef PADDLE_WITH_CUDA
+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
       VLOG(3) << "pull dense " << force_update << " " << tid;
       fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
                                      &pull_dense_status_, false);
......
@@ -138,7 +138,8 @@ class DistMultiTrainer : public MultiTrainer {
   std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
 };
-#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+    (defined PADDLE_WITH_PSLIB)
 class HeterServiceContext {
  public:
   HeterServiceContext() {}
@@ -151,7 +152,9 @@ class HeterServiceContext {
   void Reset() { push_dense_status_.clear(); }
   int place_num_;
   Scope* scope_{nullptr};
+#ifdef PADDLE_WITH_CUDA
   cudaEvent_t event_;
+#endif
   std::vector<OperatorBase*> ops_;
   std::vector<::std::future<int32_t>> push_dense_status_;
 };
@@ -178,10 +181,18 @@ class HeterXpuTrainer : public TrainerBase {
   virtual void CacheProgram(const ProgramDesc& main_program) {
     new (&program_) ProgramDesc(main_program);
   }
+  virtual std::string GetDumpPath(int tid) { return ""; }
+  virtual void InitDumpEnv() {}
   template <typename T>
+#ifdef PADDLE_WITH_CUDA
   void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                    const paddle::platform::Place& thread_place,
                    cudaStream_t stream);
+#endif
+#ifdef PADDLE_WITH_XPU
+  void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor,
+                   const paddle::platform::Place& thread_place);
+#endif
   void CreateThreadParam(const ProgramDesc& program, int num);
   template <typename T>
   void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
@@ -207,9 +218,11 @@ class HeterXpuTrainer : public TrainerBase {
   std::vector<std::string> op_names_;
   std::vector<Scope*> place_scopes_;
   BtObjectPool<HeterServiceContext> object_pool_;
-  std::vector<cudaStream_t> copy_streams_;
   std::vector<platform::Place> places_;
+#ifdef PADDLE_WITH_CUDA
+  std::vector<cudaStream_t> copy_streams_;
   std::vector<cudaEvent_t> events_;
+#endif
 };
 #endif
......
@@ -63,7 +63,8 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
 REGISTER_TRAINER_CLASS(MultiTrainer);
 REGISTER_TRAINER_CLASS(DistMultiTrainer);
-#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+    (defined PADDLE_WITH_PSLIB)
 REGISTER_TRAINER_CLASS(HeterXpuTrainer);
 #endif
 #if defined(PADDLE_WITH_NCCL)
......
@@ -132,9 +132,17 @@ if(NOT APPLE AND WITH_MKLML)
     # seq_pool1
     set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool")
     download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz")
-    inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc)
+    inference_analysis_api_test(test_analyzer_seq_pool1_compare_determine ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_compare_determine_tester.cc)
+    inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_compare_tester.cc)
+    inference_analysis_api_test(test_analyzer_seq_pool1_fuse_compare_zero_copy ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc)
+    inference_analysis_api_test(test_analyzer_seq_pool1_fuse_statis ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_statis_tester.cc)
+    inference_analysis_api_test(test_analyzer_seq_pool1_profile ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_profile_tester.cc)
     if(NOT WIN32)
-        set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150)
+        set_tests_properties(test_analyzer_seq_pool1_compare_determine PROPERTIES TIMEOUT 120)
+        set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 120)
+        set_tests_properties(test_analyzer_seq_pool1_fuse_compare_zero_copy PROPERTIES TIMEOUT 120)
+        set_tests_properties(test_analyzer_seq_pool1_fuse_statis PROPERTIES TIMEOUT 120)
+        set_tests_properties(test_analyzer_seq_pool1_profile PROPERTIES TIMEOUT 120)
     endif()
 else()
     # TODO: fix this test on MACOS and OPENBLAS, the reason is that
@@ -215,7 +223,15 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
 # transformer, the dataset only works on batch_size=8 now
 set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer")
 download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp/transformer_model.tar.gz" "temp/transformer_data.txt.tar.gz")
-inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc
+inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_compare_tester.cc
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8
+        --cpu_num_threads=${CPU_NUM_THREADS_ON_CI})
+inference_analysis_test(test_analyzer_transformer_fuse SRCS analyzer_transformer_fuse_tester.cc
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8
+        --cpu_num_threads=${CPU_NUM_THREADS_ON_CI})
+inference_analysis_test(test_analyzer_transformer_profile SRCS analyzer_transformer_profile_tester.cc
         EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
         ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8
         --cpu_num_threads=${CPU_NUM_THREADS_ON_CI})
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace seq_pool1_tester {
// Compare Deterministic result
TEST(Analyzer_seq_pool1_compare_determine, compare_determine) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
}
} // namespace seq_pool1_tester
} // namespace analysis
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace seq_pool1_tester {
TEST(Analyzer_seq_pool1_compare, compare) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
} // namespace seq_pool1_tester
} // namespace analysis
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace seq_pool1_tester {
// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
TEST(Analyzer_seq_pool1_compare_zero_copy, compare_zero_copy) {
AnalysisConfig cfg;
SetConfig(&cfg);
AnalysisConfig cfg1;
SetConfig(&cfg1);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
std::vector<std::string> outputs_name;
outputs_name.emplace_back(out_var_name);
CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
reinterpret_cast<PaddlePredictor::Config *>(&cfg1),
input_slots_all, outputs_name);
}
} // namespace seq_pool1_tester
} // namespace analysis
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace seq_pool1_tester {
// Check the fuse status
TEST(Analyzer_seq_pool1_fuse_statis, fuse_statis) {
AnalysisConfig cfg;
SetConfig(&cfg);
int num_ops;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse"));
ASSERT_TRUE(fuse_statis.count("squared_mat_sub_fuse"));
ASSERT_TRUE(fuse_statis.count("repeated_fc_relu_fuse"));
ASSERT_EQ(fuse_statis.at("fc_fuse"), 10);
EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2);
EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 2);
EXPECT_EQ(fuse_statis.at("repeated_fc_relu_fuse"), 2);
LOG(INFO) << "num_ops: " << num_ops;
EXPECT_EQ(num_ops, 171);
}
} // namespace seq_pool1_tester
} // namespace analysis
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester_helper.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace seq_pool1_tester {
void profile(bool use_mkldnn = false) {
AnalysisConfig cfg;
SetConfig(&cfg, use_mkldnn);
std::vector<std::vector<PaddleTensor>> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
}
TEST(Analyzer_seq_pool1_profile, profile) { profile(); }
} // namespace seq_pool1_tester
} // namespace analysis
} // namespace inference
} // namespace paddle
@@ -11,15 +11,20 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#pragma once
 #include <algorithm>
 #include <fstream>
 #include <iostream>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 namespace paddle {
 namespace inference {
 namespace analysis {
+namespace seq_pool1_tester {
 // diff: similarity_norm.tmp_0, for speed: fc_4.tmp_1
 static const char out_var_name[] = "reduce_sum_0.tmp_0";
@@ -164,77 +169,7 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
     cfg->pass_builder()->InsertPass(2, "seqpool_concat_fuse_pass");
 }
-void profile(bool use_mkldnn = false) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg, use_mkldnn);
-  std::vector<std::vector<PaddleTensor>> outputs;
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, FLAGS_num_threads);
-}
-TEST(Analyzer_seq_pool1, profile) { profile(); }
-// Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_seq_pool1, compare) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
-// Compare Deterministic result
-TEST(Analyzer_seq_pool1, compare_determine) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                       input_slots_all);
-}
-// Check the fuse status
-TEST(Analyzer_seq_pool1, fuse_statis) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  int num_ops;
-  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops);
-  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse"));
-  ASSERT_TRUE(fuse_statis.count("squared_mat_sub_fuse"));
-  ASSERT_TRUE(fuse_statis.count("repeated_fc_relu_fuse"));
-  ASSERT_EQ(fuse_statis.at("fc_fuse"), 10);
-  EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2);
-  EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 2);
-  EXPECT_EQ(fuse_statis.at("repeated_fc_relu_fuse"), 2);
-  LOG(INFO) << "num_ops: " << num_ops;
-  EXPECT_EQ(num_ops, 171);
-}
-// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy
-TEST(Analyzer_seq_pool1, compare_zero_copy) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  AnalysisConfig cfg1;
-  SetConfig(&cfg1);
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  std::vector<std::string> outputs_name;
-  outputs_name.emplace_back(out_var_name);
-  CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
-                             reinterpret_cast<PaddlePredictor::Config *>(&cfg1),
-                             input_slots_all, outputs_name);
-}
+}  // namespace seq_pool1_tester
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
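The helper now carries #pragma once, its own namespace, and only the shared SetConfig/SetInput utilities, while each TEST moves into its own translation unit; several single-test binaries can then include the header without symbol clashes. A minimal sketch of the resulting layout (file names illustrative):

// analyzer_seq_pool1_tester_helper.h (sketch)
#pragma once
#include <string>

namespace seq_pool1_tester {
// static: each including test binary gets its own copy, matching how the
// helper's out_var_name constant is declared above.
static const char out_var_name[] = "reduce_sum_0.tmp_0";

inline std::string OutputVarName() { return out_var_name; }
}  // namespace seq_pool1_tester

// A per-check tester (e.g. analyzer_seq_pool1_profile_tester.cc) then just
// includes this helper and defines a single TEST that calls the shared
// SetConfig/SetInput functions.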
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace transformer_tester {
void compare(bool use_mkldnn = false) {
AnalysisConfig cfg;
SetConfig(&cfg);
if (use_mkldnn) {
cfg.EnableMKLDNN();
cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
}
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
TEST(Analyzer_Transformer, compare) { compare(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); }
#endif
} // namespace transformer_tester
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace transformer_tester {
// Check the fuse status
TEST(Analyzer_Transformer, fuse_statis) {
AnalysisConfig cfg;
SetConfig(&cfg);
int num_ops;
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
}
} // namespace transformer_tester
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/analyzer_transformer_tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace transformer_tester {
void profile(bool use_mkldnn = false) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> outputs;
if (use_mkldnn) {
cfg.EnableMKLDNN();
cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
}
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
}
TEST(Analyzer_Transformer, profile) { profile(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_Transformer, profile_mkldnn) { profile(true); }
#endif
} // namespace transformer_tester
} // namespace analysis
} // namespace inference
} // namespace paddle
...@@ -11,11 +11,16 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
namespace transformer_tester {
struct DataRecord {
  std::vector<std::vector<int64_t>> src_word, src_pos, trg_word, init_idx;
...@@ -182,57 +187,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  }
}
}  // namespace transformer_tester
}  // namespace analysis
}  // namespace inference
}  // namespace paddle
...@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include <algorithm>
#include <chrono>              // NOLINT
#include <condition_variable>  // NOLINT
...@@ -20,6 +21,7 @@
#include <string>
#include <thread>  // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
...@@ -45,7 +47,7 @@ TEST(RetryAllocator, RetryAllocator) {
  size_t thread_num = 4;
  size_t sleep_time = 40;
  size_t extra_time = 20;
  // Reserve to perform more tests in the future
  std::vector<std::shared_ptr<Allocator>> allocators;
......
...@@ -763,10 +763,28 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel {
  }
};
// AbsGrad: dx=dy if x >=0 else -dy
// AbsDoubleGrad: ddy = ddx if x >=0 else -ddx
template <typename T>
class AbsDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
public:
using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("abs_grad_grad");
// input1: x
op->SetInput("X", this->Input("X"));
// input2: ddx
op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
op->SetAttrMap(this->Attrs());
// output: ddy
op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out")));
}
};
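The maker above only wires inputs and outputs; the underlying math is the chain rule applied to y = |x| twice. A minimal scalar sketch of the mapping it sets up (illustrative only, with a hypothetical helper name, not part of the patch):

// d|x|/dx = sign(x), so the second-order pass forwards the incoming ddx
// with the sign of the original forward input x (the "X" wired above).
template <typename T>
T AbsDoubleGradRef(T x, T ddx) {
  return x >= static_cast<T>(0) ? ddx : -ddx;
}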
// ReluGrad: dx = dy if y >= 0 else 0
// ReluGradGrad: ddy = ddx if y >= 0 else 0
template <typename T>
class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
 public:
...@@ -1214,7 +1232,13 @@ REGISTER_OPERATOR(
    std::conditional<ops::CanInplaceAct<ops::AbsGradFunctor<float>>(),
                     ops::ActFwdInplaceInferer, void>::type);
REGISTER_OPERATOR(abs_grad, ops::ActivationOpGrad,
                  ops::ActivationGradOpInplaceInferer,
ops::AbsDoubleGradMaker<paddle::framework::OpDesc>,
ops::AbsDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(
abs_grad_grad,
ops::ActivationOpDoubleGrad<ops::AbsGradGradFunctor<float>::FwdDeps()>,
ops::ActivationDoubleGradOpInplaceInferer);
REGISTER_OP_CPU_KERNEL(abs,
                       ops::ActivationKernel<paddle::platform::CPUDeviceContext,
...@@ -1234,6 +1258,18 @@ REGISTER_OP_CPU_KERNEL(
                              ops::AbsGradFunctor<int>>,
    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
                              ops::AbsGradFunctor<int64_t>>);
REGISTER_OP_CPU_KERNEL(
abs_grad_grad,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::AbsGradGradFunctor<float>>,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::AbsGradGradFunctor<double>>,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::AbsGradGradFunctor<plat::float16>>,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::AbsGradGradFunctor<int>>,
ops::ActivationDoubleGradKernel<plat::CPUDeviceContext,
ops::AbsGradGradFunctor<int64_t>>);
/* ========================================================================== */
/* ========================== register checkpoint ===========================*/
......
...@@ -160,7 +160,7 @@ REGISTER_OP_CUDA_KERNEL(
        ops::ExpGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ==========================    abs register   ============================ */
REGISTER_OP_CUDA_KERNEL(
    abs, ops::ActivationKernel<plat::CUDADeviceContext, ops::AbsFunctor<float>>,
...@@ -180,4 +180,16 @@ REGISTER_OP_CUDA_KERNEL(
                              ops::AbsGradFunctor<int64_t>>,
    ops::ActivationGradKernel<plat::CUDADeviceContext,
                              ops::AbsGradFunctor<plat::float16>>);
REGISTER_OP_CUDA_KERNEL(
abs_grad_grad,
ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::AbsGradGradFunctor<float>>,
ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::AbsGradGradFunctor<double>>,
ops::ActivationDoubleGradKernel<plat::CUDADeviceContext,
ops::AbsGradGradFunctor<plat::float16>>,
ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::AbsGradGradFunctor<int>>,
ops::ActivationDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::AbsGradGradFunctor<int64_t>>);
/* ========================================================================== */
...@@ -1430,6 +1430,27 @@ class ActivationDoubleGradKernel
  }
};
template <typename T>
struct AbsGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* X,
const framework::Tensor* Out, const framework::Tensor* ddX,
framework::Tensor* ddOut, framework::Tensor* dOut,
framework::Tensor* dX) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "AbsGradGrad"));
auto x = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "AbsGradGrad"));
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "AbsGradGrad"));
ddout.device(*d) = ddx * x.sign();
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
};
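As a quick sanity check of the functor's rule ddout = ddx * x.sign() (a hand-worked example, not code from the patch): for x = {-2, 0, 3} and ddx = {1, 1, 1}, x.sign() evaluates to {-1, 0, 1}, so ddout comes out as {-1, 0, 1}. Note that sign(0) = 0, so the double grad vanishes at the non-differentiable point rather than following the "x >= 0" branch of the comment above.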
template <typename T>
struct ReluGradGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device>
......
...@@ -839,6 +839,7 @@ void BatchNormDoubleGradMaker<T>::Apply(GradOpPtr<T> op) const {
  op->SetInput("SavedMean", this->Input("SavedMean"));
  op->SetInput("SavedVariance", this->Input("SavedVariance"));
  if (BOOST_GET_CONST(bool, this->GetAttr("use_global_stats"))) {
    op->SetInput("Mean", this->Input("Mean"));
    op->SetInput("Variance", this->Input("Variance"));
  }
  op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
...@@ -868,14 +869,19 @@ void BatchNormDoubleGradOp::InferShape(
                 "BatchNormDoubleGrad");
  }
  OP_INOUT_CHECK(ctx->HasInput("DDX"), "Input", "DDX", "BatchNormDoubleGrad");
  OP_INOUT_CHECK(ctx->HasInput("DY"), "Input", "DY", "BatchNormDoubleGrad");
  // check output
  OP_INOUT_CHECK(ctx->HasOutput("DX"), "Output", "DX", "BatchNormDoubleGrad");
  const auto x_dims = ctx->GetInputDim("X");
  const DataLayout data_layout = framework::StringToDataLayout(
      ctx->Attrs().Get<std::string>("data_layout"));
  const int C =
      ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW)
           ? x_dims[1]
           : x_dims[x_dims.size() - 1]);
  if (ctx->HasOutput("DX")) {
    ctx->SetOutputDim("DX", x_dims);
  }
...@@ -957,7 +963,9 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
    Tensor inv_var_tensor;
    if (use_global_stats) {
      const auto *running_mean = ctx.Input<Tensor>("Mean");
      const auto *running_variance = ctx.Input<Tensor>("Variance");
      mean_data = running_mean->data<T>();
      inv_var_tensor.Resize({C});
      T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace());
...@@ -1077,12 +1085,12 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
    //          (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
    //          np.sum(dy,
    //          axis=(n,h,w)) * (x - mean) *
    //          (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var -
    //          inv_var
    //          *
    //          np.mean(dy, axis=(n,h,w)) -
    //          inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
    //          axis=(n,h,w)))
    if (ddX) {
      dx_arr +=
...@@ -1176,7 +1184,8 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
          C, sample_size);
      ddy_arr.setZero();
      if (use_global_stats) {
        // math: ddy = r * ddx * inv_var + ddbias +
        //             ddscale * (x - mean) * inv_var
        if (ddX) {
          ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data;
        }
...@@ -1196,25 +1205,29 @@ class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
                        .replicate(1, sample_size) /
                    sample_size);
        }
      }
      if (ddScale) {
        ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
        Tensor ddscale_tile;
        ddscale_tile.Resize({C, sample_size});
        EigenArrayMap<T> ddscale_tile_data(
            ddscale_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
        ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
        ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data;
      }
      if (ddBias) {
        ConstEigenVectorArrayMap<T> ddbias_arr(ddBias->data<T>(), C);
        Tensor ddbias_tile;
        ddbias_tile.Resize({C, sample_size});
        EigenArrayMap<T> ddbias_tile_data(
            ddbias_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
        ddbias_tile_data = ddbias_arr.replicate(1, sample_size);
        ddy_arr += ddbias_tile_data;
      }
    }
    if (data_layout == DataLayout::kNCHW) {
      VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
      TransToChannelFirst<paddle::platform::CPUDeviceContext, T>(
......
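In the use_global_stats branch above, y = scale * (x - mean) * inv_var + bias is affine in x, scale, and bias, so ddy is linear in the three second-order inputs. A per-element sketch of the formula from the comment (a hypothetical helper, assuming inv_var = 1/sqrt(var + eps); not part of the patch):

// ddy = r * ddx * inv_var + ddbias + ddscale * (x - mean) * inv_var
float BatchNormGlobalStatsDdyRef(float ddx, float ddscale, float ddbias,
                                 float x, float mean, float inv_var, float r) {
  return r * ddx * inv_var + ddbias + ddscale * (x - mean) * inv_var;
}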
...@@ -47,8 +47,8 @@ void OpTester::Init(const OpTesterConfig &config) {
    CreateInputVarDesc();
    CreateOutputVarDesc();
  } else {
    PADDLE_THROW(platform::errors::NotFound(
        "Operator '%s' is not registered in OpTester.", config_.op_type));
  }
  if (config_.device_id >= 0) {
...@@ -81,7 +81,8 @@ void OpTester::Run() {
    platform::EnableProfiler(platform::ProfilerState::kAll);
    platform::SetDeviceId(config_.device_id);
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "'CUDAPlace' is not supported in CPU only device."));
#endif
  }
...@@ -162,7 +163,8 @@ framework::proto::VarType::Type OpTester::TransToVarType(std::string str) {
  } else if (str == "fp64") {
    return framework::proto::VarType::FP64;
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Unsupported dtype %s in OpTester.", str.c_str()));
  }
}
...@@ -233,8 +235,8 @@ void OpTester::CreateOpDesc() {
      case framework::proto::AttrType::INTS:
      case framework::proto::AttrType::FLOATS:
      case framework::proto::AttrType::STRINGS:
        PADDLE_THROW(platform::errors::Unimplemented(
            "Unsupported STRINGS type in OpTester yet."));
        break;
      case framework::proto::AttrType::LONG: {
        int64_t value = StringTo<int64_t>(value_str);
...@@ -242,7 +244,8 @@ void OpTester::CreateOpDesc() {
      } break;
      case framework::proto::AttrType::LONGS:
      default:
        PADDLE_THROW(platform::errors::Unimplemented(
            "Unsupported attr type %d in OpTester.", type));
    }
  }
}
...@@ -299,7 +302,8 @@ void OpTester::SetupTensor(framework::LoDTensor *tensor,
    }
    is.close();
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Unsupported initializer %s in OpTester.", initializer.c_str()));
  }
  if (!platform::is_cpu_place(place_)) {
...@@ -351,7 +355,8 @@ void OpTester::CreateVariables(framework::Scope *scope) {
          static_cast<double>(1.0), item.second.initializer,
          item.second.filename);
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "Unsupported dtype %d in OpTester.", data_type));
    }
    VLOG(3) << "Set lod for tensor " << var_name;
...@@ -473,7 +478,8 @@ std::string OpTester::DebugString() {
         << "\n";
    } break;
    default:
      PADDLE_THROW(platform::errors::Unimplemented(
          "Unsupported attr type %d in OpTester.", attr_type));
  }
  ss << GenSpaces(--count) << "}\n";
}
...@@ -484,8 +490,10 @@ std::string OpTester::DebugString() {
TEST(op_tester, base) {
  if (!FLAGS_op_config_list.empty()) {
    std::ifstream fin(FLAGS_op_config_list, std::ios::in | std::ios::binary);
    PADDLE_ENFORCE_EQ(
        static_cast<bool>(fin), true,
        platform::errors::InvalidArgument("OpTester cannot open file %s.",
                                          FLAGS_op_config_list.c_str()));
    std::vector<OpTesterConfig> op_configs;
    while (!fin.eof()) {
      VLOG(4) << "Reading config " << op_configs.size() << "...";
......
...@@ -78,7 +78,8 @@ void OpInputConfig::ParseDType(std::istream& is) {
  } else if (dtype_str == "fp64" || dtype_str == "double") {
    dtype = "fp64";
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Unsupported dtype %s in OpInputConfig.", dtype_str.c_str()));
  }
  VLOG(4) << "dtype of input " << name << " is: " << dtype;
}
...@@ -91,7 +92,9 @@ void OpInputConfig::ParseInitializer(std::istream& is) {
  const std::vector<std::string> supported_initializers = {"random", "natural",
                                                           "zeros", "file"};
  if (!Has(supported_initializers, initializer_str)) {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Unsupported initializer %s in OpInputConfig.",
        initializer_str.c_str()));
  }
  initializer = initializer_str;
...@@ -126,7 +129,12 @@ void OpInputConfig::ParseLoD(std::istream& is) {
    }
  }
  EraseEndSep(&lod_str);
  PADDLE_ENFORCE_GE(
      lod_str.length(), 4U,
      platform::errors::InvalidArgument(
          "The length of lod string should be "
          "equal to or larger than 4. But the length of lod string is %zu.",
          lod_str.length()));
  VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length();
  // Parse the lod_str
...@@ -153,8 +161,10 @@ void OpInputConfig::ParseLoD(std::istream& is) {
OpTesterConfig::OpTesterConfig(const std::string& filename) {
  std::ifstream fin(filename, std::ios::in | std::ios::binary);
  PADDLE_ENFORCE_EQ(
      static_cast<bool>(fin), true,
      platform::errors::InvalidArgument("OpTesterConfig cannot open file %s.",
                                        filename.c_str()));
  Init(fin);
}
......
...@@ -54,8 +54,10 @@ class CSyncCommStreamOp : public framework::OperatorBase {
class CSyncCommStreamOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() {
    AddInput("X", "(Tensor) Dependency of the variable need to sync")
        .AsDuplicable();
    AddOutput("Out", "(Tensor) Dependency of the variable need to sync")
        .AsDuplicable();
    AddAttr<int>("ring_id", "(int default 0) ring id.").SetDefault(0);
    AddComment(R"DOC(
CSyncCommStream Operator
......
...@@ -520,11 +520,11 @@ class InstanceNormDoubleGradKernel<platform::CPUDeviceContext, T>
    //          (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW *
    //          np.sum(dy,
    //          axis=(h,w)) * (x - mean) *
    //          (np.mean(ddx, axis=(h,w)) - ddx)) + ddr * (dy * inv_var -
    //          inv_var *
    //          np.mean(dy, axis=(h,w)) -
    //          inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
    //          axis=(h,w)))
    Tensor x_sub_mean_mul_invstd;
    x_sub_mean_mul_invstd.Resize({sample_size, NxC});
......
...@@ -136,7 +136,6 @@ void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) {
}
using Tensor = paddle::framework::Tensor;
template <typename KernelTuple, typename PlaceType>
void BenchKernelXYZN() {
  using T = typename KernelTuple::data_type;
...@@ -320,8 +319,15 @@ void BenchKernelSgd() {
  const T lr = 0.1;
  auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
                                  const int64_t upper) -> std::vector<int64_t> {
    PADDLE_ENFORCE_LE(
        static_cast<size_t>(upper - lower), n - 1,
        paddle::platform::errors::InvalidArgument(
            "The range of Sgd (upper - lower) should be equal to or lower "
            "than n-1 (Sgd size -1). But upper - lower is %d and n-1 is %d.",
            static_cast<size_t>(upper - lower), (n - 1)));
    PADDLE_ENFORCE_GT(
        n, 0, paddle::platform::errors::InvalidArgument(
                  "The Sgd size should be larger than 0. But the n is %d.", n));
    std::vector<int64_t> all, out;
    for (int i = 0; i < n; ++i) {
      all.push_back(i);
......
...@@ -132,11 +132,31 @@ class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
  }
  std::unique_ptr<GenBase> CreateJitCode(
      const emb_seq_pool_attr_t& attr) const override {
    PADDLE_ENFORCE_GT(attr.table_height, 0,
                      platform::errors::InvalidArgument(
                          "The attribute table_height of EmbSeqPool should "
                          "be larger than 0. But it is %d.",
                          attr.table_height));
    PADDLE_ENFORCE_GT(attr.table_width, 0,
                      platform::errors::InvalidArgument(
                          "The attribute table_width of EmbSeqPool should "
                          "be larger than 0. But it is %d.",
                          attr.table_width));
    PADDLE_ENFORCE_GT(attr.index_height, 0,
                      platform::errors::InvalidArgument(
                          "The attribute index_height of EmbSeqPool should "
                          "be larger than 0. But it is %d.",
                          attr.index_height));
    PADDLE_ENFORCE_GT(attr.index_width, 0,
                      platform::errors::InvalidArgument(
                          "The attribute index_width of EmbSeqPool should "
                          "be larger than 0. But it is %d.",
                          attr.index_width));
    PADDLE_ENFORCE_GT(attr.out_width, 0,
                      platform::errors::InvalidArgument(
                          "The attribute out_width of EmbSeqPool should be "
                          "larger than 0. But it is %d.",
                          attr.out_width));
    return make_unique<EmbSeqPoolJitCode>(attr, CodeSize(attr));
  }
};
......
...@@ -29,7 +29,11 @@ void MatMulJitCode::genCode() {
  preCode();
  int block, rest;
  const auto groups = packed_groups(n_, k_, &block, &rest);
  PADDLE_ENFORCE_GT(
      groups.front(), 0,
      platform::errors::InvalidArgument("The number of rest registers should "
                                        "be larger than 0. But it is %d.",
                                        groups.front()));
  const int block_len = sizeof(float) * block;
  const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1;
...@@ -118,9 +122,21 @@ class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
  }
  std::unique_ptr<GenBase> CreateJitCode(
      const matmul_attr_t& attr) const override {
    PADDLE_ENFORCE_GT(
        attr.m, 0, platform::errors::InvalidArgument(
                       "The attribute m (first matrix's rows) of MatMul "
                       "should be larger than 0. But it is %d.",
                       attr.m));
    PADDLE_ENFORCE_GT(
        attr.n, 0, platform::errors::InvalidArgument(
                       "The attribute n (second matrix's columns) of MatMul "
                       "should be larger than 0. But it is %d.",
                       attr.n));
    PADDLE_ENFORCE_GT(
        attr.k, 0, platform::errors::InvalidArgument(
                       "The attribute k (first matrix's columns, equal to "
                       "the second matrix's rows) of MatMul should be larger "
                       "than 0. But it is %d.",
                       attr.k));
    return make_unique<MatMulJitCode>(attr, CodeSize(attr));
  }
};
......
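The m/n/k checks above follow the usual GEMM convention; a naive reference makes the roles of the three extents explicit (an orientation sketch with a hypothetical helper name, not the generated jit kernel):

// C (m x n) = A (m x k) * B (k x n); each extent must be positive.
void NaiveMatMulRef(const float* A, const float* B, float* C, int m, int n,
                    int k) {
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += A[i * k + p] * B[p * n + j];
      C[i * n + j] = acc;
    }
}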
...@@ -33,7 +33,10 @@ class MatMulJitCode : public JitCode {
              size_t code_size = 256 * 1024,
              void* code_ptr = nullptr)
      : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) {
    PADDLE_ENFORCE_EQ(m_, 1, platform::errors::Unimplemented(
                                 "Jitcode of matmul only supports m==1 (first "
                                 "matrix's rows) now. But m is %d.",
                                 m_));
    this->genCode();
  }
......
...@@ -70,8 +70,14 @@ class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
  }
  std::unique_ptr<GenBase> CreateJitCode(
      const seq_pool_attr_t& attr) const override {
    PADDLE_ENFORCE_GT(attr.w, 0, platform::errors::InvalidArgument(
                                     "The attribute width of SeqPool should "
                                     "be larger than 0. But it is %d.",
                                     attr.w));
    PADDLE_ENFORCE_GT(attr.h, 0, platform::errors::InvalidArgument(
                                     "The attribute height of SeqPool should "
                                     "be larger than 0. But it is %d.",
                                     attr.h));
    return make_unique<SeqPoolJitCode>(attr, CodeSize(attr));
  }
};
......
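The w and h validated above are the row width and the number of rows pooled into a single output row; a minimal sum-pooling reference (an assumption-level sketch, not the vectorized jitcode):

// y[j] = sum over i of x[i * w + j]; the avg/sqrt variants rescale this sum.
void SeqPoolSumRef(const float* x, float* y, int h, int w) {
  for (int j = 0; j < w; ++j) y[j] = 0.f;
  for (int i = 0; i < h; ++i)
    for (int j = 0; j < w; ++j) y[j] += x[i * w + j];
}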
...@@ -127,8 +127,13 @@ class SeqPoolJitCode : public JitCode {
      vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]);
      reg_idx++;
    }
    PADDLE_ENFORCE_EQ(
        reg_idx, rest_used_num_regs,
        platform::errors::InvalidArgument(
            "All heights of SeqPool should use the same number of registers, "
            "which equals the number of rest registers. But %d registers are "
            "used and the number of rest registers is %d.",
            reg_idx, rest_used_num_regs));
    for (int i = 0; i < reg_idx; ++i) {
      vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs));
    }
......
...@@ -116,9 +116,24 @@ class SgdCreator : public JitCodeCreator<sgd_attr_t> {
  size_t CodeSize(const sgd_attr_t& attr) const override { return 96 + 32 * 8; }
  std::unique_ptr<GenBase> CreateJitCode(
      const sgd_attr_t& attr) const override {
    PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width,
                      platform::errors::InvalidArgument(
                          "The attribute param_width of Sgd should be "
                          "equal to the attribute grad_width. But param_width "
                          "is %d and grad_width is %d.",
                          attr.param_width, attr.grad_width));
    PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height,
                      platform::errors::InvalidArgument(
                          "The attribute selected_rows_size of Sgd should be "
                          "equal to or less than the attribute grad_height. "
                          "But selected_rows_size is %d and grad_height is %d.",
                          attr.selected_rows_size, attr.grad_height));
    PADDLE_ENFORCE_GE(
        attr.selected_rows_size, 0,
        platform::errors::InvalidArgument(
            "The attribute selected_rows_size of Sgd should be "
            "equal to or larger than 0. But selected_rows_size is %d.",
            attr.selected_rows_size));
    return make_unique<SgdJitCode>(attr, CodeSize(attr));
  }
};
......
...@@ -76,7 +76,11 @@ class VBroadcastCreator : public JitCodeCreator<int64_t> {
    return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
  }
  std::unique_ptr<GenBase> CreateJitCode(const int64_t& w) const override {
    PADDLE_ENFORCE_GT(
        w, 0,
        platform::errors::InvalidArgument(
            "The width of VBroadcast should be larger than 0. But w is %d.",
            w));
    return make_unique<VBroadcastJitCode>(w, CodeSize(w));
  }
};
......
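The w checked above is the width of the row the kernel replicates; as a plain reference loop (illustrative sketch only, with a hypothetical helper name):

#include <cstdint>
// y receives h copies of the w-element vector x, one copy per output row.
void VBroadcastRef(const float* x, float* y, int64_t h, int64_t w) {
  for (int64_t i = 0; i < h; ++i)
    for (int64_t j = 0; j < w; ++j) y[i * w + j] = x[j];
}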
...@@ -49,9 +49,14 @@ void GenBase::dumpCode(const unsigned char* code) const {
void* GenBase::operator new(size_t size) {
  void* ptr;
  constexpr size_t alignment = 32ul;
  PADDLE_ENFORCE_EQ(
      posix_memalign(&ptr, alignment, size), 0,
      platform::errors::InvalidArgument(
          "Jitcode generator (GenBase) failed to allocate %ld bytes of "
          "aligned memory!",
          size));
  PADDLE_ENFORCE_NOT_NULL(ptr, platform::errors::InvalidArgument(
                                   "Failed to allocate jitcode generator "
                                   "(GenBase) CPU memory: size = %d .",
                                   size));
  return ptr;
}
......
...@@ -66,7 +66,8 @@ const char* to_string(KernelType kt) {
    ONE_CASE(kEmbSeqPool);
    ONE_CASE(kSgd);
    default:
      PADDLE_THROW(platform::errors::Unimplemented(
          "JIT kernel does not support type: %d.", kt));
      return "NOT JITKernel";
  }
  return nullptr;
...@@ -79,7 +80,8 @@ const char* to_string(SeqPoolType tp) {
    ONE_CASE(kAvg);
    ONE_CASE(kSqrt);
    default:
      PADDLE_THROW(platform::errors::Unimplemented(
          "SeqPool JIT kernel does not support type: %d.", tp));
      return "NOT PoolType";
  }
  return nullptr;
...@@ -100,7 +102,8 @@ KernelType to_kerneltype(const std::string& act) {
  } else if (lower == "tanh" || lower == "vtanh") {
    return kVTanh;
  }
  PADDLE_THROW(platform::errors::Unimplemented(
      "Act JIT kernel does not support type: %s.", act));
  return kNone;
}
...@@ -109,12 +112,19 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
  int block, rest;
  const auto groups = packed_groups(n, k, &block, &rest);
  std::for_each(groups.begin(), groups.end(), [&](int i) {
    PADDLE_ENFORCE_GT(i, 0, platform::errors::InvalidArgument(
                                "Each element of groups should be larger than "
                                "0. However the element %d doesn't satisfy.",
                                i));
  });
  int sum = std::accumulate(groups.begin(), groups.end(), 0);
  std::memset(dst, 0, k * sum * block * sizeof(float));
  PADDLE_ENFORCE_GE(sum * block, n,
                    platform::errors::InvalidArgument(
                        "The packed n (sum * block) should be equal to or "
                        "larger than n (matmul row size). "
                        "However, the packed n is %d and n is %d.",
                        sum * block, n));
  const int block_len = sizeof(float) * block;
  int n_offset = 0;
...@@ -136,7 +146,8 @@ void pack_weights<float>(const float* src, float* dst, int n, int k) {
template <typename T>
typename std::enable_if<!std::is_same<T, float>::value>::type pack_weights(
    const T* src, T* dst, int n, int k) {
  PADDLE_THROW(platform::errors::Unimplemented(
      "Only supports pack weights with float type."));
}
}  // namespace jit
......
...@@ -85,8 +85,10 @@ inline const Kernel* GetReferKernel() {
  auto& ref_pool = ReferKernelPool::Instance().AllKernels();
  KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace());
  auto ref_iter = ref_pool.find(kkey);
  PADDLE_ENFORCE_NE(
      ref_iter, ref_pool.end(),
      platform::errors::PreconditionNotMet(
          "Every Refer Kernel of jitcode should have a reference function."));
  auto& ref_impls = ref_iter->second;
  for (auto& impl : ref_impls) {
    auto i = dynamic_cast<const ReferKernel<KernelTuple>*>(impl.get());
...@@ -101,7 +103,9 @@ template <typename KernelTuple>
inline typename KernelTuple::func_type GetReferFunc() {
  auto ker = GetReferKernel<KernelTuple>();
  auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker);
  PADDLE_ENFORCE_NOT_NULL(p, platform::errors::InvalidArgument(
                                 "Getting the reference kernel function on "
                                 "CPU failed. The Refer kernel should exist."));
  return p->GetFunc();
}
...@@ -132,7 +136,9 @@ std::vector<const Kernel*> GetAllCandidateKernels(
  // The last implementation should be reference function on CPUPlace.
  auto ref = GetReferKernel<KernelTuple>();
  PADDLE_ENFORCE_NOT_NULL(ref, platform::errors::InvalidArgument(
                                   "Getting all candidate kernels on CPU "
                                   "failed. The Refer Kernel can not be "
                                   "empty."));
  res.emplace_back(ref);
  return res;
}
...@@ -147,11 +153,14 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
    std::string name = k->ImplType();
    if (name == "JitCode") {
      auto i = dynamic_cast<const GenBase*>(k);
      PADDLE_ENFORCE_NOT_NULL(i,
                              platform::errors::InvalidArgument(
                                  "Casting to the jitcode kernel (GenBase) "
                                  "failed."));
      res.emplace_back(std::make_pair(name, i->template getCode<Func>()));
    } else {
      auto i = dynamic_cast<const KernelMore<KernelTuple>*>(k);
      PADDLE_ENFORCE_NOT_NULL(i, platform::errors::InvalidArgument(
                                     "Kernel cast (KernelMore) failed."));
      res.emplace_back(std::make_pair(name, i->GetFunc()));
    }
  }
...@@ -173,7 +182,9 @@ template <typename KernelTuple, typename PlaceType = platform::CPUPlace>
typename KernelTuple::func_type GetDefaultBestFunc(
    const typename KernelTuple::attr_type& attr) {
  auto funcs = GetAllCandidateFuncs<KernelTuple, PlaceType>(attr);
  PADDLE_ENFORCE_GE(funcs.size(), 1UL,
                    platform::errors::InvalidArgument(
                        "There should be at least one candidate jit kernel "
                        "on CPU."));
  // Here could do some runtime benchmark of this attr and return the best one.
  // But yet just get the first one as the default best one,
  // which is searched in order and tuned by offline.
......
...@@ -95,7 +95,8 @@ void (*getActFunc(KernelType type, int d))(const T*, T*, int) {  // NOLINT
  } else if (type == kVIdentity) {
    return KernelFuncs<VIdentityTuple<T>, CPUPlace>::Cache().At(d);
  }
  PADDLE_THROW(platform::errors::Unimplemented(
      "Act JIT kernel does not support type: %s.", type));
  return nullptr;
}
......
...@@ -103,11 +103,24 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
template <typename T>
void EmbSeqPool(const T* table, const int64_t* idx, T* out,
                const emb_seq_pool_attr_t* attr) {
  PADDLE_ENFORCE_EQ(
      attr->table_width * attr->index_width, attr->out_width,
      platform::errors::InvalidArgument(
          "The attribute table_width * index_width of EmbSeqPool should "
          "be equal to out_width. But table_width * index_width is %d, "
          "out_width is %d.",
          attr->table_width * attr->index_width, attr->out_width));
  auto check_idx_value_valid = [&](int64_t i) {
    PADDLE_ENFORCE_LT(
        idx[i], attr->table_height,
        platform::errors::InvalidArgument(
            "The idx should be lower than the attribute table_height of "
            "EmbSeqPool. But %dth of idx is %d and table_height is %d.",
            i, idx[i], attr->table_height));
    PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument(
                                     "The idx should be equal to or larger "
                                     "than 0. But %dth of idx is %d.",
                                     i, idx[i]));
  };
  for (int64_t w = 0; w != attr->index_width; ++w) {
...@@ -168,22 +181,50 @@ void Softmax(const T* x, T* y, int n, int bs, int remain = 1) {
template <typename T>
void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
         T* out, const sgd_attr_t* attr) {
  PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width,
                    platform::errors::InvalidArgument(
                        "The attribute param_width of Sgd should be "
                        "equal to the attribute grad_width. But param_width "
                        "is %d and grad_width is %d.",
                        attr->param_width, attr->grad_width));
  PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height,
                    platform::errors::InvalidArgument(
                        "The attribute selected_rows_size of Sgd should be "
                        "equal to or less than the attribute grad_height. "
                        "But selected_rows_size is %d and grad_height is %d.",
                        attr->selected_rows_size, attr->grad_height));
  T scalar = -lr[0];
  int width = attr->grad_width;
  if (out == param) {
    for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
      auto h_idx = rows[i];
      PADDLE_ENFORCE_LT(h_idx, attr->param_height,
                        platform::errors::InvalidArgument(
                            "The rows of Sgd should be less than the "
                            "attribute param_height. But %dth of rows "
                            "is %d and param_height is %d.",
                            i, h_idx, attr->param_height));
      PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument(
                                      "The rows of Sgd should be equal to "
                                      "or larger than 0. But %dth of rows "
                                      "is %d.",
                                      i, h_idx));
      VAXPY(scalar, grad + i * width, out + h_idx * width, width);
    }
  } else {
    for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
      auto h_idx = rows[i];
      PADDLE_ENFORCE_LT(h_idx, attr->param_height,
                        platform::errors::InvalidArgument(
                            "The rows of Sgd should be less than the "
                            "attribute param_height. But %dth of rows "
                            "is %d and param_height is %d.",
                            i, h_idx, attr->param_height));
      PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument(
                                      "The rows of Sgd should be equal to "
                                      "or larger than 0. But %dth of rows "
                                      "is %d.",
                                      i, h_idx));
      VScal(&scalar, grad + i * width, out + h_idx * width, width);
      VAdd(param + h_idx * width, out + h_idx * width, out + h_idx * width,
           width);
......
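The bounds enforced above guard the sparse update this kernel performs; written densely (a plain reference sketch reusing the attribute names, not the optimized VAXPY/VScal path):

#include <cstdint>
// For each selected row r = rows[i]: out[r, :] = param[r, :] - lr * grad[i, :],
// which is why every rows[i] must lie in [0, param_height) and widths match.
void SparseSgdRef(float lr, const float* param, const float* grad,
                  const int64_t* rows, float* out, int64_t selected_rows_size,
                  int64_t width) {
  for (int64_t i = 0; i < selected_rows_size; ++i) {
    const int64_t r = rows[i];
    for (int64_t j = 0; j < width; ++j)
      out[r * width + j] = param[r * width + j] - lr * grad[i * width + j];
  }
}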
...@@ -147,7 +147,8 @@ void (*getActFunc(KernelType type))(const T*, T*, int) {  // NOLINT
  } else if (type == kVIdentity) {
    return VIdentity<T>;
  }
  PADDLE_THROW(platform::errors::Unimplemented(
      "Act JIT kernel does not support type: %s.", type));
  return nullptr;
}
...@@ -465,12 +466,25 @@ void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) {
template <typename T>
void EmbSeqPool(const T* table, const int64_t* idx, T* out,
                const emb_seq_pool_attr_t* attr) {
  PADDLE_ENFORCE_EQ(
      attr->table_width * attr->index_width, attr->out_width,
      platform::errors::InvalidArgument(
          "The attribute table_width * index_width of EmbSeqPool should "
          "be equal to out_width. But table_width * index_width is %d and "
          "out_width is %d.",
          attr->table_width * attr->index_width, attr->out_width));
  auto check_idx_value_valid = [&](int64_t i) {
    PADDLE_ENFORCE_LT(
        idx[i], attr->table_height,
        platform::errors::InvalidArgument(
            "The idx should be lower than the attribute table_height of "
            "EmbSeqPool. But %dth of idx is %d and table_height is %d.",
            i, idx[i], attr->table_height));
    PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument(
                                     "The idx should be equal to or larger "
                                     "than 0. But %dth of idx is %d.",
                                     i, idx[i]));
  };
  for (int64_t w = 0; w != attr->index_width; ++w) {
...@@ -505,12 +519,31 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out,
template <typename T>
void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
         T* out, const sgd_attr_t* attr) {
  PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width,
                    platform::errors::InvalidArgument(
                        "The attribute param_width of Sgd should be "
                        "equal to the attribute grad_width. But param_width "
                        "is %d and grad_width is %d.",
                        attr->param_width, attr->grad_width));
  PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height,
                    platform::errors::InvalidArgument(
                        "The attribute selected_rows_size of Sgd should be "
                        "equal to or less than the attribute grad_height. "
                        "But selected_rows_size is %d and grad_height is %d.",
                        attr->selected_rows_size, attr->grad_height));
  for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
    auto h_idx = rows[i];
    PADDLE_ENFORCE_LT(h_idx, attr->param_height,
                      platform::errors::InvalidArgument(
                          "The rows of Sgd should be less than the "
                          "attribute param_height. But %dth of rows is %d "
                          "and param_height is %d.",
                          i, h_idx, attr->param_height));
    PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument(
                                    "The rows of Sgd should be equal to or "
                                    "larger than 0. But %dth of rows is %d.",
                                    i, h_idx));
    for (int64_t j = 0; j < attr->grad_width; ++j) {
      out[h_idx * attr->grad_width + j] =
          param[h_idx * attr->grad_width + j] -
......
...@@ -850,8 +850,15 @@ void TestKernelSgd() {
  const T lr = 0.1;
  auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
                                  const int64_t upper) -> std::vector<int64_t> {
    PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1,
                      paddle::platform::errors::InvalidArgument(
                          "The range of Sgd (upper - lower) should be equal "
                          "to or lower than n-1 (Sgd size -1). But the upper "
                          "- lower is %d and n-1 is %d.",
                          static_cast<size_t>(upper - lower), n - 1));
    PADDLE_ENFORCE_GT(
        n, 0, paddle::platform::errors::InvalidArgument(
                  "The Sgd size should be larger than 0. But the n is %d.", n));
    std::vector<int64_t> all, out;
    for (int i = 0; i < n; ++i) {
      all.push_back(i);
......
...@@ -420,6 +420,22 @@ void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
  });
}
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMV(
bool trans_a, int M, int N, platform::float16 alpha,
const platform::float16 *A, const platform::float16 *B,
platform::float16 beta, platform::float16 *C) const {
// Because cublas doesn't support half gemv, we use cublasHgemm to achieve it.
if (trans_a) {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, 1, N, M,
alpha, B, A, beta, C);
} else {
this->template GEMM<platform::float16>(CblasNoTrans, CblasNoTrans, M, 1, N,
alpha, A, B, beta, C);
}
}
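Since cuBLAS lacks a half GEMV, the specialization above reshapes the matrix-vector product into a GEMM with one unit dimension. A NumPy sketch of the equivalence (shapes are assumptions):

import numpy as np

M, N = 4, 6
A = np.random.rand(M, N).astype(np.float16)
x = np.random.rand(N).astype(np.float16)

gemv = A @ x                             # y = A * x, an (M x N) * (N,) matvec
gemm = (A @ x.reshape(N, 1)).ravel()     # the same product as an (M x N) * (N x 1) GEMM
# (the trans_a branch swaps the operand order instead: (1 x M) * (M x N))
assert np.allclose(gemv, gemm)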
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::BatchedGEMM( void Blas<platform::CUDADeviceContext>::BatchedGEMM(
...@@ -479,6 +495,19 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM( ...@@ -479,6 +495,19 @@ void Blas<platform::CUDADeviceContext>::BatchedGEMM(
} }
} }
template <>
template <>
inline void Blas<platform::CUDADeviceContext>::BatchedGEMM(
CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
platform::float16 alpha, const platform::float16 **A,
const platform::float16 **B, platform::float16 beta, platform::float16 **C,
int batchCount) const {
for (int k = 0; k < batchCount; ++k) {
this->template GEMM<platform::float16>(transA, transB, M, N, K, alpha, A[k],
B[k], beta, C[k]);
}
}
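Likewise, the pointer-array BatchedGEMM specialization for float16 above simply loops over single GEMMs. A NumPy sketch showing the loop fallback matches one batched call (shapes are assumptions):

import numpy as np

batch, M, K, N = 3, 2, 4, 5
A = [np.random.rand(M, K).astype(np.float16) for _ in range(batch)]
B = [np.random.rand(K, N).astype(np.float16) for _ in range(batch)]

C_loop = [a @ b for a, b in zip(A, B)]   # one GEMM per batch item, as in the fallback
C_batched = np.stack(A) @ np.stack(B)    # what a native batched GEMM would return
assert np.allclose(np.stack(C_loop), C_batched)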
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, void Blas<platform::CUDADeviceContext>::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo,
......
...@@ -17,10 +17,12 @@ limitations under the License. */ ...@@ -17,10 +17,12 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plf = paddle::platform; namespace plf = paddle::platform;
REGISTER_OP_CUDA_KERNEL(matmul_v2, REGISTER_OP_CUDA_KERNEL(
ops::MatMulV2Kernel<plf::CUDADeviceContext, float>, matmul_v2, ops::MatMulV2Kernel<plf::CUDADeviceContext, float>,
ops::MatMulV2Kernel<plf::CUDADeviceContext, double>); ops::MatMulV2Kernel<plf::CUDADeviceContext, double>,
ops::MatMulV2Kernel<plf::CUDADeviceContext, plf::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
matmul_v2_grad, ops::MatMulV2GradKernel<plf::CUDADeviceContext, float>, matmul_v2_grad, ops::MatMulV2GradKernel<plf::CUDADeviceContext, float>,
ops::MatMulV2GradKernel<plf::CUDADeviceContext, double>); ops::MatMulV2GradKernel<plf::CUDADeviceContext, double>,
ops::MatMulV2GradKernel<plf::CUDADeviceContext, plf::float16>);
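With these registrations, matmul_v2 and its gradient accept float16 inputs on CUDA. A minimal usage sketch, assuming a CUDA build and the Paddle 2.x imperative API:

import numpy as np
import paddle

paddle.set_device('gpu')  # requires a CUDA build of Paddle
x = paddle.to_tensor(np.random.rand(2, 3).astype('float16'))
y = paddle.to_tensor(np.random.rand(3, 4).astype('float16'))
out = paddle.matmul(x, y)  # dispatches to the float16 kernel registered above
print(out.dtype)           # paddle.float16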
...@@ -163,17 +163,20 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, ...@@ -163,17 +163,20 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
if (trans_y) { if (trans_y) {
const int M = Y->numel() / N; const int M = Y->numel() / N;
VLOG(3) << "MatMul's case 2"; VLOG(3) << "MatMul's case 2";
blas.GEMV(false, M, N, 1., y_data, x_data, 0., Out->data<T>()); blas.GEMV(false, M, N, static_cast<T>(1), y_data, x_data,
static_cast<T>(0), Out->data<T>());
} else { } else {
const int M = y_dims[y_ndim - 1]; const int M = y_dims[y_ndim - 1];
const int batch_size = Y->numel() / (M * N); const int batch_size = Y->numel() / (M * N);
if (batch_size == 1) { if (batch_size == 1) {
VLOG(3) << "MatMul's case 3"; VLOG(3) << "MatMul's case 3";
blas.GEMV(true, N, M, 1., y_data, x_data, 0., Out->data<T>()); blas.GEMV(true, N, M, static_cast<T>(1), y_data, x_data,
static_cast<T>(0), Out->data<T>());
} else { } else {
VLOG(3) << "MatMul's case 4"; VLOG(3) << "MatMul's case 4";
blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, y_data, blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, static_cast<T>(1),
x_data, 0, Out->data<T>(), batch_size, M * N, 0); y_data, x_data, static_cast<T>(0), Out->data<T>(),
batch_size, M * N, 0);
} }
} }
return; return;
...@@ -205,16 +208,19 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, ...@@ -205,16 +208,19 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
const int batch_size = X->numel() / (M * N); const int batch_size = X->numel() / (M * N);
if (batch_size == 1) { if (batch_size == 1) {
VLOG(3) << "MatMul's case 5"; VLOG(3) << "MatMul's case 5";
blas.GEMV(true, N, M, 1.0f, x_data, y_data, 0.0f, Out->data<T>()); blas.GEMV(true, N, M, static_cast<T>(1), x_data, y_data,
static_cast<T>(0), Out->data<T>());
} else { } else {
VLOG(3) << "MatMul's case 6"; VLOG(3) << "MatMul's case 6";
blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, x_data, blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, static_cast<T>(1),
y_data, 0, Out->data<T>(), batch_size, M * N, 0); x_data, y_data, static_cast<T>(0), Out->data<T>(),
batch_size, M * N, 0);
} }
} else { } else {
const int M = X->numel() / N; const int M = X->numel() / N;
VLOG(3) << "MatMul's case 7"; VLOG(3) << "MatMul's case 7";
blas.GEMV(false, M, N, 1.0f, x_data, y_data, 0.0f, Out->data<T>()); blas.GEMV(false, M, N, static_cast<T>(1), x_data, y_data,
static_cast<T>(0), Out->data<T>());
} }
return; return;
} }
...@@ -263,37 +269,38 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, ...@@ -263,37 +269,38 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
if (x_batch_size == 1 && y_batch_size == 1) { if (x_batch_size == 1 && y_batch_size == 1) {
VLOG(3) << "MatMul's case 8"; VLOG(3) << "MatMul's case 8";
blas.GEMM(trans_x ? CblasTrans : CblasNoTrans, blas.GEMM(trans_x ? CblasTrans : CblasNoTrans,
trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data, trans_y ? CblasTrans : CblasNoTrans, M, N, K, static_cast<T>(1),
y_data, 0.0f, Out->data<T>()); x_data, y_data, static_cast<T>(0), Out->data<T>());
} else if (x_batch_size == 1) { } else if (x_batch_size == 1) {
if (M == 1 && trans_y) { if (M == 1 && trans_y) {
VLOG(3) << "MatMul's case 9"; VLOG(3) << "MatMul's case 9";
blas.GEMV(false, y_batch_size * N, K, 1.0f, y_data, x_data, 0.0f, blas.GEMV(false, y_batch_size * N, K, static_cast<T>(1), y_data, x_data,
Out->data<T>()); static_cast<T>(0), Out->data<T>());
} else { } else {
VLOG(3) << "MatMul's case 10"; VLOG(3) << "MatMul's case 10";
blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, trans_y ? CblasTrans : CblasNoTrans, M, N, K,
x_data, y_data, 0, Out->data<T>(), out_batch_size, 0, static_cast<T>(1), x_data, y_data, static_cast<T>(0),
K * N); Out->data<T>(), out_batch_size, 0, K * N);
} }
} else if (y_batch_size == 1) { } else if (y_batch_size == 1) {
if (!trans_x) { if (!trans_x) {
VLOG(3) << "MatMul's case 11"; VLOG(3) << "MatMul's case 11";
blas.GEMM(CblasNoTrans, trans_y ? CblasTrans : CblasNoTrans, blas.GEMM(CblasNoTrans, trans_y ? CblasTrans : CblasNoTrans,
x_batch_size * M, N, K, 1.0f, x_data, y_data, 0.0f, x_batch_size * M, N, K, static_cast<T>(1), x_data, y_data,
Out->data<T>()); static_cast<T>(0), Out->data<T>());
} else { } else {
VLOG(3) << "MatMul's case 12"; VLOG(3) << "MatMul's case 12";
blas.BatchedGEMM(CblasTrans, trans_y ? CblasTrans : CblasNoTrans, M, N, K, blas.BatchedGEMM(CblasTrans, trans_y ? CblasTrans : CblasNoTrans, M, N, K,
1.0f, x_data, y_data, 0, Out->data<T>(), out_batch_size, static_cast<T>(1), x_data, y_data, static_cast<T>(0),
M * K, 0); Out->data<T>(), out_batch_size, M * K, 0);
} }
} else if (!is_broadcast_dims) { } else if (!is_broadcast_dims) {
VLOG(3) << "MatMul's case 13"; VLOG(3) << "MatMul's case 13";
blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data, trans_y ? CblasTrans : CblasNoTrans, M, N, K,
y_data, 0, Out->data<T>(), out_batch_size, M * K, K * N); static_cast<T>(1), x_data, y_data, static_cast<T>(0),
Out->data<T>(), out_batch_size, M * K, K * N);
} else { } else {
// In this case strided GEMM can't be used // In this case strided GEMM can't be used
std::vector<const T*> x_ptr(out_batch_size); std::vector<const T*> x_ptr(out_batch_size);
...@@ -314,9 +321,9 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, ...@@ -314,9 +321,9 @@ void MatMulFunction(const Tensor* X, const Tensor* Y,
} }
VLOG(3) << "MatMul's case 14"; VLOG(3) << "MatMul's case 14";
blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans,
trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, trans_y ? CblasTrans : CblasNoTrans, M, N, K,
x_ptr.data(), y_ptr.data(), 0.0f, out_ptr.data(), static_cast<T>(1), x_ptr.data(), y_ptr.data(),
out_batch_size); static_cast<T>(0), out_ptr.data(), out_batch_size);
} }
} }
......
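Of the fourteen dispatch cases above, case 9 is the least obvious: when x is a single vector (M == 1) and y is a batch of transposed matrices, the whole batch collapses into one GEMV over y_batch_size * N rows. A NumPy check of that collapse (shapes are assumptions):

import numpy as np

batch, N, K = 3, 4, 5
x = np.random.rand(K)               # M == 1: x is a plain vector
y = np.random.rand(batch, N, K)     # trans_y: each out[b] = y[b] @ x

per_batch = np.stack([y[b] @ x for b in range(batch)])      # [batch, N]
one_gemv = (y.reshape(batch * N, K) @ x).reshape(batch, N)  # single GEMV, case 9
assert np.allclose(per_batch, one_gemv)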
...@@ -40,12 +40,12 @@ using DataLayout = framework::DataLayout; ...@@ -40,12 +40,12 @@ using DataLayout = framework::DataLayout;
// (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW *
// np.sum(dy, // np.sum(dy,
// axis=(n,h,w)) * (x - mean) * // axis=(n,h,w)) * (x - mean) *
// (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var - // (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var -
// inv_var // inv_var
// * // *
// np.mean(dy, axis=(n,h,w)) - // np.mean(dy, axis=(n,h,w)) -
// inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
// axis=(n,h,w)))) // axis=(n,h,w)))
template <typename T, int BlockDim, framework::DataLayout layout> template <typename T, int BlockDim, framework::DataLayout layout>
__global__ void DoubleGradComputeDX(const T *x, const T *mean, __global__ void DoubleGradComputeDX(const T *x, const T *mean,
...@@ -138,7 +138,7 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean, ...@@ -138,7 +138,7 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean,
? (j / sample_size * C + i) * sample_size + j % sample_size ? (j / sample_size * C + i) * sample_size + j % sample_size
: j * outer_size + i; : j * outer_size + i;
dx[index] += (dy[index] * var_val - dy_sum_val / inner_size * var_val - dx[index] += (dy[index] * var_val - dy_sum_val / inner_size * var_val -
(x[index] - mean_val) * var_val * (x[index] - mean_val) * var_val * var_val *
dy_mul_x_sub_mean_sum_val * var_val / inner_size) * dy_mul_x_sub_mean_sum_val * var_val / inner_size) *
ddscale[i]; ddscale[i];
} }
...@@ -326,19 +326,57 @@ __global__ void DoubleGradComputeDScaleWithGlobal( ...@@ -326,19 +326,57 @@ __global__ void DoubleGradComputeDScaleWithGlobal(
} }
// math: dx = ddscale * dy * inv_var // math: dx = ddscale * dy * inv_var
// math: ddy = scale * ddx * inv_var
template <typename T, framework::DataLayout layout> template <typename T, framework::DataLayout layout>
__global__ void DoubleGradComputeDataWithGlobal( __global__ void DoubleGradComputeDXWithGlobal(const T *dy, const T *ddscale,
const T *dy, const T *scale, const T *variance, const double epsilon, const T *variance,
const int C, const int sample_size, const int num, T *dx) { const double epsilon, const int C,
const int sample_size,
const int num, T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x; int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x; int stride = blockDim.x * gridDim.x;
if (scale != nullptr) { if (ddscale != nullptr) {
for (int i = gid; i < num; i += stride) { for (int i = gid; i < num; i += stride) {
const int c = const int c =
layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C; layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;
T inv_var = 1.0 / sqrt(variance[c] + epsilon); T inv_var = 1.0 / sqrt(variance[c] + epsilon);
dx[i] = dy[i] * scale[c] * inv_var; dx[i] = dy[i] * ddscale[c] * inv_var;
}
}
}
// math: ddy = scale * ddx * inv_var + ddbias +
// ddscale * (x - mean) * inv_var
template <typename T, framework::DataLayout layout>
__global__ void DoubleGradComputeDDYWithGlobal(
const T *ddx, const T *scale, const T *mean, const T *variance, const T *x,
const T *ddbias, const T *ddscale, const double epsilon, const int C,
const int sample_size, const int num, T *ddy) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
if (ddx != nullptr) {
for (int i = gid; i < num; i += stride) {
const int c =
layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;
T inv_var = 1.0 / sqrt(variance[c] + epsilon);
ddy[i] += ddx[i] * scale[c] * inv_var;
}
}
__syncthreads();
if (ddscale != nullptr) {
for (int i = gid; i < num; i += stride) {
const int c =
layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;
T inv_var = 1.0 / sqrt(variance[c] + epsilon);
ddy[i] += (x[i] - mean[c]) * inv_var * ddscale[c];
}
}
__syncthreads();
if (ddbias != nullptr) {
for (int i = gid; i < num; i += stride) {
const int c =
layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C;
ddy[i] += ddbias[c];
} }
} }
} }
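The three guarded loops above accumulate ddy = scale * ddx * inv_var + ddbias + ddscale * (x - mean) * inv_var term by term, so each of ddx/ddscale/ddbias may independently be absent. A per-channel NumPy sketch of the same accumulation (epsilon and shapes are assumptions; mean/var stand in for the running statistics used in global mode):

import numpy as np

eps = 1e-5
x = np.random.rand(10).astype(np.float32)
ddx = np.random.rand(10).astype(np.float32)
scale, ddscale, ddbias = 1.5, 0.3, 0.1   # one channel's parameters
mean, var = x.mean(), x.var()            # running statistics in global mode
inv_var = 1.0 / np.sqrt(var + eps)

ddy = np.zeros_like(x)
ddy += ddx * scale * inv_var             # ddx term (first loop)
ddy += (x - mean) * inv_var * ddscale    # ddscale term (second loop)
ddy += ddbias                            # ddbias term (third loop)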
...@@ -383,8 +421,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -383,8 +421,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
const T *mean_data, *variance_data; const T *mean_data, *variance_data;
if (use_global_stats) { if (use_global_stats) {
const auto *running_mean = ctx.Input<Tensor>("Mean");
const auto *running_var = ctx.Input<Tensor>("Variance"); const auto *running_var = ctx.Input<Tensor>("Variance");
const auto *running_mean_data = running_mean->template data<T>();
const auto *running_var_data = running_var->template data<T>(); const auto *running_var_data = running_var->template data<T>();
mean_data = running_mean_data;
variance_data = running_var_data; variance_data = running_var_data;
} else { } else {
const T *smean_data = Saved_mean->data<T>(); const T *smean_data = Saved_mean->data<T>();
...@@ -398,12 +439,12 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -398,12 +439,12 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
set_constant(dev_ctx, dX, static_cast<T>(0)); set_constant(dev_ctx, dX, static_cast<T>(0));
if (use_global_stats) { if (use_global_stats) {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDataWithGlobal< DoubleGradComputeDXWithGlobal<
T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>(
dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
dx_data); dx_data);
} else { } else {
DoubleGradComputeDataWithGlobal< DoubleGradComputeDXWithGlobal<
T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>(
dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num,
dx_data); dx_data);
...@@ -456,15 +497,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, ...@@ -456,15 +497,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
set_constant(dev_ctx, ddY, static_cast<T>(0)); set_constant(dev_ctx, ddY, static_cast<T>(0));
if (use_global_stats) { if (use_global_stats) {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
DoubleGradComputeDataWithGlobal< DoubleGradComputeDDYWithGlobal<
T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNHWC><<<grid1, block, 0, dev_ctx.stream()>>>(
ddx_data, scale_data, variance_data, epsilon, C, sample_size, num, ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
ddy_data); ddscale_data, epsilon, C, sample_size, num, ddy_data);
} else { } else {
DoubleGradComputeDataWithGlobal< DoubleGradComputeDDYWithGlobal<
T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>( T, DataLayout::kNCHW><<<grid1, block, 0, dev_ctx.stream()>>>(
ddx_data, scale_data, variance_data, epsilon, C, sample_size, num, ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data,
ddy_data); ddscale_data, epsilon, C, sample_size, num, ddy_data);
} }
} else { } else {
if (data_layout == DataLayout::kNHWC) { if (data_layout == DataLayout::kNHWC) {
......
...@@ -41,7 +41,9 @@ inline std::vector<T> GetDataFromTensor(const framework::Tensor* x) { ...@@ -41,7 +41,9 @@ inline std::vector<T> GetDataFromTensor(const framework::Tensor* x) {
// NOTE: Converting int64 to int32 may cause data overflow. // NOTE: Converting int64 to int32 may cause data overflow.
vec_new_data = std::vector<T>(data, data + x->numel()); vec_new_data = std::vector<T>(data, data + x->numel());
} else { } else {
PADDLE_THROW("The dtype of Tensor must be int32 or int64."); PADDLE_THROW(platform::errors::InvalidArgument(
"The dtype of Tensor must be int32 or int64, but received: %s",
x->type()));
} }
return vec_new_data; return vec_new_data;
} }
...@@ -53,10 +55,11 @@ inline std::vector<T> GetDataFromTensorList( ...@@ -53,10 +55,11 @@ inline std::vector<T> GetDataFromTensorList(
for (size_t i = 0; i < list_tensor.size(); ++i) { for (size_t i = 0; i < list_tensor.size(); ++i) {
auto tensor = list_tensor[i]; auto tensor = list_tensor[i];
PADDLE_ENFORCE_EQ(tensor->dims(), framework::make_ddim({1}), PADDLE_ENFORCE_EQ(tensor->dims(), framework::make_ddim({1}),
"ShapeError: The shape of Tensor in list must be [1]. " platform::errors::InvalidArgument(
"But received the shape " "The shape of Tensor in list must be [1]. "
"is [%s]", "But received its shape "
tensor->dims()); "is [%s]",
tensor->dims()));
if (tensor->type() == framework::proto::VarType::INT32) { if (tensor->type() == framework::proto::VarType::INT32) {
if (platform::is_gpu_place(tensor->place())) { if (platform::is_gpu_place(tensor->place())) {
...@@ -76,7 +79,10 @@ inline std::vector<T> GetDataFromTensorList( ...@@ -76,7 +79,10 @@ inline std::vector<T> GetDataFromTensorList(
vec_new_data.push_back(static_cast<T>(*tensor->data<int64_t>())); vec_new_data.push_back(static_cast<T>(*tensor->data<int64_t>()));
} }
} else { } else {
PADDLE_THROW("The dtype of Tensor in list must be int32 or int64."); PADDLE_THROW(platform::errors::InvalidArgument(
"The dtype of Tensor in list must be int32 or int64, but received: "
"%s",
tensor->type()));
} }
} }
return vec_new_data; return vec_new_data;
......
...@@ -134,7 +134,26 @@ USE_CUDA_ATOMIC(Max, int); ...@@ -134,7 +134,26 @@ USE_CUDA_ATOMIC(Max, int);
USE_CUDA_ATOMIC(Max, unsigned int); USE_CUDA_ATOMIC(Max, unsigned int);
// The CUDA API uses unsigned long long int; we cannot use uint64_t here. // The CUDA API uses unsigned long long int; we cannot use uint64_t here.
// That is because unsigned long long int is not necessarily uint64_t. // That is because unsigned long long int is not necessarily uint64_t.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT
#else
CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) {
  if (*address >= val) {
    return *address;
  }
  unsigned long long int old = *address, assumed;
  do {
    assumed = old;
    if (assumed >= val) {
      break;
    }
    old = atomicCAS(address, assumed, val);
  } while (assumed != old);
  return old;
}
#endif
CUDA_ATOMIC_WRAPPER(Max, int64_t) { CUDA_ATOMIC_WRAPPER(Max, int64_t) {
// Here, we check long long int must be int64_t. // Here, we check long long int must be int64_t.
...@@ -187,7 +206,26 @@ USE_CUDA_ATOMIC(Min, int); ...@@ -187,7 +206,26 @@ USE_CUDA_ATOMIC(Min, int);
USE_CUDA_ATOMIC(Min, unsigned int); USE_CUDA_ATOMIC(Min, unsigned int);
// The CUDA API uses unsigned long long int; we cannot use uint64_t here. // The CUDA API uses unsigned long long int; we cannot use uint64_t here.
// That is because unsigned long long int is not necessarily uint64_t. // That is because unsigned long long int is not necessarily uint64_t.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT
#else
CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) {
  if (*address <= val) {
    return *address;
  }
  unsigned long long int old = *address, assumed;
  do {
    assumed = old;
    if (assumed <= val) {
      break;
    }
    old = atomicCAS(address, assumed, val);
  } while (assumed != old);
  return old;
}
#endif
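Both fallbacks follow the same compare-and-swap pattern: early-out when the stored value already wins, otherwise retry the swap until no other thread has intervened. A Python simulation of the Max variant, with a hypothetical atomic_cas standing in for CUDA's atomicCAS:

def atomic_cas(cell, expected, new):
    # Hypothetical stand-in for atomicCAS: swap only if the cell still
    # holds `expected`; always return the value the cell held.
    old = cell[0]
    if old == expected:
        cell[0] = new
    return old

def atomic_max(cell, val):
    if cell[0] >= val:            # early out: nothing to update
        return cell[0]
    old = cell[0]
    while True:                   # the do/while CAS loop from the kernel
        assumed = old
        if assumed >= val:        # another "thread" stored a larger value
            break
        old = atomic_cas(cell, assumed, val)
        if assumed == old:        # our swap took effect
            break
    return old

cell = [3]
atomic_max(cell, 7)
assert cell[0] == 7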
CUDA_ATOMIC_WRAPPER(Min, int64_t) { CUDA_ATOMIC_WRAPPER(Min, int64_t) {
// Here, we check long long int must be int64_t. // Here, we check long long int must be int64_t.
......
...@@ -15,4 +15,5 @@ ...@@ -15,4 +15,5 @@
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle as pd import paddle as pd
fluid.install_check.run_check()
print(pd.__version__) print(pd.__version__)
...@@ -40,6 +40,7 @@ if not defined WITH_TPCACHE set WITH_TPCACHE=ON ...@@ -40,6 +40,7 @@ if not defined WITH_TPCACHE set WITH_TPCACHE=ON
rem -------set cache build work directory----------- rem -------set cache build work directory-----------
rmdir build\python /s/q
if "%WITH_CACHE%"=="OFF" ( if "%WITH_CACHE%"=="OFF" (
rmdir build /s/q rmdir build /s/q
goto :mkbuild goto :mkbuild
...@@ -48,10 +49,10 @@ if "%WITH_CACHE%"=="OFF" ( ...@@ -48,10 +49,10 @@ if "%WITH_CACHE%"=="OFF" (
for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%#
set day_now=%datetime:~6,2% set day_now=%datetime:~6,2%
set day_before=-1 set day_before=-1
set /p day_before=<day.txt set /p day_before=< %work_dir%\..\day.txt
if %day_now% NEQ %day_before% ( if %day_now% NEQ %day_before% (
echo %day_now% > day.txt echo %day_now% > %work_dir%\..\day.txt
type day.txt type %work_dir%\..\day.txt
rmdir build /s/q rmdir build /s/q
) )
git diff origin/develop --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat" git diff origin/develop --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat"
...@@ -208,7 +209,7 @@ echo Build third_party the %build_times% time: ...@@ -208,7 +209,7 @@ echo Build third_party the %build_times% time:
msbuild /m /p:Configuration=Release /verbosity:quiet third_party.vcxproj msbuild /m /p:Configuration=Release /verbosity:quiet third_party.vcxproj
if %ERRORLEVEL% NEQ 0 ( if %ERRORLEVEL% NEQ 0 (
set /a build_times=%build_times%+1 set /a build_times=%build_times%+1
if %build_times% GTR 3 ( if %build_times% GTR 2 (
exit /b 7 exit /b 7
) else ( ) else (
echo Build third_party failed, will retry! echo Build third_party failed, will retry!
...@@ -223,7 +224,7 @@ echo Build Paddle the %build_times% time: ...@@ -223,7 +224,7 @@ echo Build Paddle the %build_times% time:
msbuild /m:%PARALLEL_PROJECT_COUNT% /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:minimal paddle.sln msbuild /m:%PARALLEL_PROJECT_COUNT% /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:minimal paddle.sln
if %ERRORLEVEL% NEQ 0 ( if %ERRORLEVEL% NEQ 0 (
set /a build_times=%build_times%+1 set /a build_times=%build_times%+1
if %build_times% GTR 2 ( if %build_times% GTR 1 (
exit /b 7 exit /b 7
) else ( ) else (
echo Build Paddle failed, will retry! echo Build Paddle failed, will retry!
...@@ -301,6 +302,7 @@ goto:eof ...@@ -301,6 +302,7 @@ goto:eof
call paddle_winci\Scripts\deactivate.bat 2>NUL call paddle_winci\Scripts\deactivate.bat 2>NUL
for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%# for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%#
set end=%end:~4,10% set end=%end:~4,10%
call :timestamp "%start%" "%end%" "1 card TestCases Total"
call :timestamp "%start%" "%end%" "TestCases Total" call :timestamp "%start%" "%end%" "TestCases Total"
echo Running unit tests failed, will exit! echo Running unit tests failed, will exit!
exit /b 8 exit /b 8
...@@ -313,6 +315,7 @@ echo ======================================== ...@@ -313,6 +315,7 @@ echo ========================================
for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%# for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%#
set end=%end:~4,10% set end=%end:~4,10%
call :timestamp "%start%" "%end%" "1 card TestCases Total"
call :timestamp "%start%" "%end%" "TestCases Total" call :timestamp "%start%" "%end%" "TestCases Total"
cd %work_dir%\paddle\fluid\inference\api\demo_ci cd %work_dir%\paddle\fluid\inference\api\demo_ci
...@@ -345,6 +348,8 @@ echo ============================================ >> check_change_of_unitte ...@@ -345,6 +348,8 @@ echo ============================================ >> check_change_of_unitte
echo EOF>> check_change_of_unittest.sh echo EOF>> check_change_of_unittest.sh
echo spec_path=$(pwd)/UNITTEST_PR.spec>> check_change_of_unittest.sh echo spec_path=$(pwd)/UNITTEST_PR.spec>> check_change_of_unittest.sh
echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>> check_change_of_unittest.sh echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>> check_change_of_unittest.sh
echo num=$(awk 'END{print NR}' ${spec_path})>> check_change_of_unittest.sh
echo echo "Windows 1 card TestCases count is $num">> check_change_of_unittest.sh
echo UPSTREAM_URL='https://github.com/PaddlePaddle/Paddle'>> check_change_of_unittest.sh echo UPSTREAM_URL='https://github.com/PaddlePaddle/Paddle'>> check_change_of_unittest.sh
echo origin_upstream_url=`git remote -v ^| awk '{print $1, $2}' ^| uniq ^| grep upstream ^| awk '{print $2}'`>> check_change_of_unittest.sh echo origin_upstream_url=`git remote -v ^| awk '{print $1, $2}' ^| uniq ^| grep upstream ^| awk '{print $2}'`>> check_change_of_unittest.sh
echo if [ "$origin_upstream_url" == "" ]; then>> check_change_of_unittest.sh echo if [ "$origin_upstream_url" == "" ]; then>> check_change_of_unittest.sh
...@@ -455,8 +460,6 @@ taskkill /f /im cvtres.exe 2>NUL ...@@ -455,8 +460,6 @@ taskkill /f /im cvtres.exe 2>NUL
taskkill /f /im rc.exe 2>NUL taskkill /f /im rc.exe 2>NUL
wmic process where name="op_function_generator.exe" call terminate 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL
taskkill /f /im python.exe 2>NUL taskkill /f /im python.exe 2>NUL
call paddle_winci\Scripts\deactivate.bat 2>NUL
del %PADDLE_WHL_FILE_WIN%
taskkill /f /im python.exe 2>NUL taskkill /f /im python.exe 2>NUL
echo Windows CI run successfully! echo Windows CI run successfully!
exit /b 0 exit /b 0
......
...@@ -988,11 +988,6 @@ set +x ...@@ -988,11 +988,6 @@ set +x
fi fi
read testcase <<< $(echo "$line"|grep -oEi "\w+$") read testcase <<< $(echo "$line"|grep -oEi "\w+$")
if python $PADDLE_ROOT/tools/is_ut_disabled.py $testcase; then
echo $testcase" is disabled."
continue
fi
if [[ "$is_nightly" != "" ]] && [ ${NIGHTLY_MODE:-OFF} == "OFF" ]; then if [[ "$is_nightly" != "" ]] && [ ${NIGHTLY_MODE:-OFF} == "OFF" ]; then
echo $testcase" will only run at night." echo $testcase" will only run at night."
continue continue
......
...@@ -845,6 +845,29 @@ class DistributedStrategy(object): ...@@ -845,6 +845,29 @@ class DistributedStrategy(object):
check_configs_key(self.strategy.dgc_configs, configs, "dgc_configs") check_configs_key(self.strategy.dgc_configs, configs, "dgc_configs")
assign_configs_value(self.strategy.dgc_configs, configs) assign_configs_value(self.strategy.dgc_configs, configs)
@property
def fp16_allreduce(self):
"""
Indicate whether to use float16 (fp16) gradient compression during allreduce
in distributed training.
Default Value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.fp16_allreduce = True # by default this is false
"""
return self.strategy.fp16_allreduce
@fp16_allreduce.setter
@is_strict_auto
def fp16_allreduce(self, flag):
if not isinstance(flag, bool):
raise TypeError('fp16_allreduce must be a bool')
self.strategy.fp16_allreduce = flag
@property @property
def gradient_merge(self): def gradient_merge(self):
""" """
......
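A slightly fuller sketch of wiring the new switch into a collective job (assumes a distributed launch; the optimizer lines are illustrative):

import paddle.distributed.fleet as fleet

fleet.init(is_collective=True)

strategy = fleet.DistributedStrategy()
strategy.fp16_allreduce = True  # cast fp32 gradients to fp16 for allreduce

# optimizer = paddle.optimizer.SGD(learning_rate=1e-3, parameters=model.parameters())
# optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)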
...@@ -23,3 +23,4 @@ from .lars_optimizer import LarsOptimizer ...@@ -23,3 +23,4 @@ from .lars_optimizer import LarsOptimizer
from .parameter_server_graph_optimizer import ParameterServerGraphOptimizer from .parameter_server_graph_optimizer import ParameterServerGraphOptimizer
from .dgc_optimizer import DGCOptimizer from .dgc_optimizer import DGCOptimizer
from .lamb_optimizer import LambOptimizer from .lamb_optimizer import LambOptimizer
from .fp16_allreduce_optimizer import FP16AllReduceOptimizer
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid import core, framework, unique_name
from .meta_optimizer_base import MetaOptimizerBase
class FP16AllReduceOptimizer(MetaOptimizerBase):
def __init__(self, optimizer):
super(FP16AllReduceOptimizer, self).__init__(optimizer)
self.inner_opt = optimizer
# we do not allow meta optimizer to be inner optimizer currently
self.meta_optimizers_white_list = [
"LarsOptimizer",
"LambOptimizer",
"RecomputeOptimizer",
"LocalSGDOptimizer",
"GradientMergeOptimizer",
"GraphExecutionOptimizer",
"AdaptiveLocalSGDOptimizer",
]
self.meta_optimizers_black_list = ["DGCOptimizer"]
def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
user_defined_strategy):
super(FP16AllReduceOptimizer, self)._set_basic_info(
loss, role_maker, user_defined_optimizer, user_defined_strategy)
def _can_apply(self):
if not self.role_maker._is_collective:
return False
if self.user_defined_strategy.fp16_allreduce:
return True
return False
def _disable_strategy(self, dist_strategy):
dist_strategy.fp16_allreduce = False
def _enable_strategy(self, dist_strategy, context=None):
dist_strategy.fp16_allreduce = True
@staticmethod
def fp16_compression(param_and_grads):
"""
Compress fp32 gradients to fp16 during allreduce.
"""
op_maker = core.op_proto_and_checker_maker
new_param_and_grads = [] # param, grad, is_cast
# cast grad from fp32->fp16 before allreduce,
for param, grad in param_and_grads:
if grad is None or grad.dtype != core.VarDesc.VarType.FP32:
new_param_and_grads.append((param, grad, False))
continue
op = grad.op
block = grad.block
var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()]
if param.name not in var_attr:
new_param_and_grads.append((param, grad, False))
continue
# remove (param, grad) from op_role_var
var_attr.remove(param.name)
var_attr.remove(grad.name)
if len(var_attr) > 1:
op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr)
else:
op._remove_attr(op_maker.kOpRoleVarAttrName())
new_grad = block.create_var(
name=unique_name.generate(grad.name + ".cast_fp16"),
dtype=core.VarDesc.VarType.FP16,
persistable=False,
stop_gradient=True)
with block.program._backward_role_guard():
cast_op = block.append_op(
type="cast",
inputs={"X": grad},
outputs={"Out": new_grad},
attrs={
"in_dtype": core.VarDesc.VarType.FP32,
"out_dtype": core.VarDesc.VarType.FP16
},
stop_gradient=True)
backward = op_maker.OpRole.Backward
cast_op._set_attr(op_maker.kOpRoleAttrName(), backward)
cast_op._set_attr(op_maker.kOpRoleVarAttrName(),
[param.name, new_grad.name])
new_grad.op = cast_op
new_param_and_grads.append((param, new_grad, True))
ret_param_and_grads = []
# cast grad from fp16->fp32 after allreduce.
# NOTE: fp16 compression is split into two for-loops on purpose.
# If the fp32->fp16 and fp16->fp32 casts are emitted in a single
# loop, the fuse-allreduce pass produces wrong results; this is
# likely a bug in that pass and needs to be fixed in the future.
for param, grad, cast in new_param_and_grads:
if not cast:
ret_param_and_grads.append((param, grad))
continue
block = grad.block
new_grad = block.create_var(
name=unique_name.generate(grad.name + ".cast_fp32"),
dtype=core.VarDesc.VarType.FP32,
persistable=False,
stop_gradient=True)
with block.program._optimized_guard(
[param, grad]), framework.name_scope('fp16_allreduce'):
cast_op = block.append_op(
type="cast",
inputs={"X": grad},
outputs={"Out": new_grad},
attrs={
"in_dtype": core.VarDesc.VarType.FP16,
"out_dtype": core.VarDesc.VarType.FP32
},
stop_gradient=True)
ret_param_and_grads.append((param, new_grad))
return ret_param_and_grads
def apply_optimize(self, loss, startup_program, params_grads):
new_params_grads = self.fp16_compression(params_grads)
return self.inner_opt.apply_optimize(
loss,
startup_program=startup_program,
params_grads=new_params_grads)
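The practical effect of fp16_compression is a fp32 -> fp16 -> (allreduce) -> fp32 round trip on each gradient, trading bandwidth for precision. A NumPy sketch of the perturbation this introduces (values are illustrative):

import numpy as np

grad = np.array([1e-4, 1.0, 65519.0, 70000.0], dtype=np.float32)
roundtrip = grad.astype(np.float16).astype(np.float32)
print(roundtrip)         # rounding near 1e-4; 65519 clamps toward fp16's
                         # max of 65504; 70000 overflows to inf
print(roundtrip - grad)  # the error the compression injects before averaging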
...@@ -1355,7 +1355,7 @@ class Executor(object): ...@@ -1355,7 +1355,7 @@ class Executor(object):
if not program._fleet_opt is None: if not program._fleet_opt is None:
if program._fleet_opt.get("worker_class", "") == "HeterCpuWorker": if program._fleet_opt.get("worker_class", "") == "HeterCpuWorker":
is_heter = 1 is_heter = 1
if program._fleet_opt("trainer", "") == "HeterXpuTrainer": if program._fleet_opt.get("trainer", "") == "HeterXpuTrainer":
is_heter = 1 is_heter = 1
if scope is None: if scope is None:
scope = global_scope() scope = global_scope()
......
...@@ -167,10 +167,10 @@ class DataLoader(object): ...@@ -167,10 +167,10 @@ class DataLoader(object):
The variables should be created by :code:`fluid.data()`. The variables should be created by :code:`fluid.data()`.
:attr:`feed_list` must be set if :attr:`return_list` is :attr:`feed_list` must be set if :attr:`return_list` is
False. Default None. False. Default None.
places(list(Place)|tuple(Place)): a list of Place, to put data places(list(Place)|tuple(Place)|optional): a list of Place
onto, :attr:`places` must be set in both static graph and to put data onto. :attr:`places` can be None; if it
dynamic graph mode, in dynamic graph mode, place number must is None, the default place (CPUPlace or CUDAPlace(0))
be 1. Default None. will be used. Default None.
return_list (bool): whether the return value on each device is return_list (bool): whether the return value on each device is
presented as a list. If :attr:`return_list=False`, the return presented as a list. If :attr:`return_list=False`, the return
value on each device would be a dict of str -> LoDTensor, where value on each device would be a dict of str -> LoDTensor, where
...@@ -222,6 +222,8 @@ class DataLoader(object): ...@@ -222,6 +222,8 @@ class DataLoader(object):
.. code-block:: python .. code-block:: python
import numpy as np import numpy as np
import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.io import Dataset, BatchSampler, DataLoader from paddle.io import Dataset, BatchSampler, DataLoader
...@@ -247,11 +249,48 @@ class DataLoader(object): ...@@ -247,11 +249,48 @@ class DataLoader(object):
def __len__(self): def __len__(self):
return self.num_samples return self.num_samples
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
# get places # get places
places = fluid.cuda_places() if USE_GPU else fluid.cpu_places() places = fluid.cuda_places() if USE_GPU else fluid.cpu_places()
# --------------------- dygraph mode --------------------
class SimpleNet(fluid.dygraph.Layer):
def __init__(self):
super(SimpleNet, self).__init__()
self.fc = fluid.dygraph.nn.Linear(IMAGE_SIZE, CLASS_NUM, act='softmax')
def forward(self, image, label=None):
return self.fc(image)
with fluid.dygraph.guard(places[0]):
simple_net = SimpleNet()
opt = fluid.optimizer.SGD(learning_rate=1e-3,
parameter_list=simple_net.parameters())
loader = DataLoader(dataset,
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True,
num_workers=2)
for e in range(EPOCH_NUM):
for i, (image, label) in enumerate(loader()):
out = simple_net(image)
loss = fluid.layers.cross_entropy(out, label)
avg_loss = fluid.layers.reduce_mean(loss)
avg_loss.backward()
opt.minimize(avg_loss)
simple_net.clear_gradients()
print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
# -------------------------------------------------------
# -------------------- static graph --------------------- # -------------------- static graph ---------------------
paddle.enable_static()
def simple_net(image, label): def simple_net(image, label):
fc_tmp = fluid.layers.fc(image, size=CLASS_NUM, act='softmax') fc_tmp = fluid.layers.fc(image, size=CLASS_NUM, act='softmax')
cross_entropy = fluid.layers.softmax_with_cross_entropy(image, label) cross_entropy = fluid.layers.softmax_with_cross_entropy(image, label)
...@@ -270,11 +309,8 @@ class DataLoader(object): ...@@ -270,11 +309,8 @@ class DataLoader(object):
prog = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name) prog = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
loader = DataLoader(dataset, loader = DataLoader(dataset,
feed_list=[image, label], feed_list=[image, label],
places=places,
batch_size=BATCH_SIZE, batch_size=BATCH_SIZE,
shuffle=True, shuffle=True,
drop_last=True, drop_last=True,
...@@ -287,39 +323,6 @@ class DataLoader(object): ...@@ -287,39 +323,6 @@ class DataLoader(object):
# ------------------------------------------------------- # -------------------------------------------------------
# --------------------- dygraph mode --------------------
class SimpleNet(fluid.dygraph.Layer):
def __init__(self):
super(SimpleNet, self).__init__()
self.fc = fluid.dygraph.nn.Linear(IMAGE_SIZE, CLASS_NUM, act='softmax')
def forward(self, image, label=None):
return self.fc(image)
with fluid.dygraph.guard(places[0]):
simple_net = SimpleNet()
opt = fluid.optimizer.SGD(learning_rate=1e-3,
parameter_list=simple_net.parameters())
loader = DataLoader(dataset,
places=places[0],
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True,
num_workers=2)
for e in range(EPOCH_NUM):
for i, (image, label) in enumerate(loader()):
out = simple_net(image)
loss = fluid.layers.cross_entropy(out, label)
avg_loss = fluid.layers.reduce_mean(loss)
avg_loss.backward()
opt.minimize(avg_loss)
simple_net.clear_gradients()
print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy())))
# -------------------------------------------------------
.. note:: .. note::
For reading iterable dataset with multiprocess Dataloader, For reading iterable dataset with multiprocess Dataloader,
...@@ -356,11 +359,9 @@ class DataLoader(object): ...@@ -356,11 +359,9 @@ class DataLoader(object):
"feed_list should be set when return_list=False" "feed_list should be set when return_list=False"
self.feed_list = feed_list self.feed_list = feed_list
assert places is not None, "places cannot be None" if places is None:
places = _current_expected_place()
self.places = _convert_places(places) self.places = _convert_places(places)
if in_dygraph_mode():
assert len(self.places) == 1, \
"Number of places must be 1 in dygraph mode"
assert num_workers >= 0, "num_workers should be a non-negative value" assert num_workers >= 0, "num_workers should be a non-negative value"
if num_workers > 0 and (sys.platform == 'darwin' or if num_workers > 0 and (sys.platform == 'darwin' or
......
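With places now optional, the common case needs no placement argument at all. A minimal self-contained sketch (the dataset contents are assumptions):

import numpy as np
from paddle.io import Dataset, DataLoader

class RandomDataset(Dataset):
    def __len__(self):
        return 8
    def __getitem__(self, idx):
        return np.random.rand(4).astype('float32')

# No `places` argument: the current expected place is used
# (CPUPlace, or CUDAPlace(0) on a CUDA build).
loader = DataLoader(RandomDataset(), batch_size=2)
for batch in loader():
    print(batch)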
...@@ -45,6 +45,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer) ...@@ -45,6 +45,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_lamb_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_lamb_meta_optimizer)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_fp16_allreduce_meta_optimizer)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function) list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor) list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor)
list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base) list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base)
...@@ -334,9 +335,6 @@ list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op) ...@@ -334,9 +335,6 @@ list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op)
# disable this unittest temporarily # disable this unittest temporarily
list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception) list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception)
list(REMOVE_ITEM TEST_OPS test_sampling_id_op) list(REMOVE_ITEM TEST_OPS test_sampling_id_op)
list(REMOVE_ITEM TEST_OPS test_paddle_save_load)
if (APPLE OR WIN32) if (APPLE OR WIN32)
list(REMOVE_ITEM TEST_OPS test_dataset) list(REMOVE_ITEM TEST_OPS test_dataset)
...@@ -458,6 +456,7 @@ if(WITH_DISTRIBUTE) ...@@ -458,6 +456,7 @@ if(WITH_DISTRIBUTE)
py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS}) py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS})
py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_fp16_allreduce_meta_optimizer MODULES test_fleet_fp16_allreduce_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS})
py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS}) py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS})
py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS}) py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS})
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
from paddle.distributed.fleet.meta_optimizers import FP16AllReduceOptimizer as FP16AllReduce
from test_dist_base import TestDistRunnerBase, runtime_main
from dist_mnist import cnn_model
DTYPE = "float32"
paddle.dataset.mnist.fetch()
# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
class TestDistMnist2x2(TestDistRunnerBase):
def get_model(self, batch_size=2):
# Input data
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Train program
predict = cnn_model(images)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size_tensor)
inference_program = fluid.default_main_program().clone()
# Optimization
opt = fluid.optimizer.MomentumOptimizer(
learning_rate=0.001, momentum=0.9)
opt = FP16AllReduce(opt)
# Reader
train_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=batch_size)
opt.minimize(avg_cost)
return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
if __name__ == "__main__":
runtime_main(TestDistMnist2x2)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import AnalysisConfig
from paddle.fluid.core import PassVersionChecker
class ConvActivationMkldnnFusePassTest(InferencePassTest):
def setUp(self):
self.set_params()
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 100, 100], dtype="float32")
conv_out = fluid.layers.conv2d(
data,
num_filters=self.conv_num_filters,
filter_size=self.conv_filter_size,
bias_attr=self.conv_bias_attr,
act=self.act)
self.feeds = {
"data": np.random.random((1, 3, 100, 100)).astype("float32")
}
self.fetch_list = [conv_out]
self.enable_mkldnn = True
def set_params(self):
self.conv_num_filters = 3
self.conv_filter_size = 3
self.conv_bias_attr = False
self.act = "relu"
self.pass_name = 'conv_relu_mkldnn_fuse_pass'
def test_check_output(self):
use_gpu = False
self.check_output_with_option(use_gpu)
def test_pass_compatible(self):
self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name))
class ConvActivationMkldnnFusePassTest_1(ConvActivationMkldnnFusePassTest):
def set_params(self):
self.conv_num_filters = 5
self.conv_filter_size = 5
self.conv_bias_attr = True
self.act = "relu"
self.pass_name = 'conv_relu_mkldnn_fuse_pass'
class ConvActivationMkldnnFusePassTest_2(ConvActivationMkldnnFusePassTest):
def set_params(self):
self.conv_num_filters = 3
self.conv_filter_size = 3
self.conv_bias_attr = False
self.act = "leaky_relu"
self.pass_name = 'conv_leaky_relu_mkldnn_fuse_pass'
class ConvActivationMkldnnFusePassTest_3(ConvActivationMkldnnFusePassTest):
def set_params(self):
self.conv_num_filters = 5
self.conv_filter_size = 5
self.conv_bias_attr = True
self.act = "leaky_relu"
self.pass_name = 'conv_leaky_relu_mkldnn_fuse_pass'
class ConvActivationMkldnnFusePassTest_4(ConvActivationMkldnnFusePassTest):
def set_params(self):
self.conv_num_filters = 3
self.conv_filter_size = 3
self.conv_bias_attr = False
self.act = "relu6"
self.pass_name = 'conv_relu6_mkldnn_fuse_pass'
class ConvActivationMkldnnFusePassTest_5(ConvActivationMkldnnFusePassTest):
def set_params(self):
self.conv_num_filters = 5
self.conv_filter_size = 5
self.conv_bias_attr = True
self.act = "swish"
self.pass_name = 'conv_swish_mkldnn_fuse_pass'
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import AnalysisConfig
from paddle.fluid.core import PassVersionChecker
class ConvConcatReluMkldnnFusePassTest_0(InferencePassTest):
def setUp(self):
self.set_params()
with fluid.program_guard(self.main_program, self.startup_program):
data_1 = fluid.data(
name="data_1", shape=[-1, 3, 100, 100], dtype="float32")
data_2 = fluid.data(
name="data_2", shape=[-1, 3, 100, 100], dtype="float32")
conv_1 = fluid.layers.conv2d(
data_1,
num_filters=self.conv1_num_filters,
filter_size=self.conv1_filter_size,
padding=self.conv1_padding,
bias_attr=self.conv1_bias_attr)
conv_2 = fluid.layers.conv2d(
data_2,
num_filters=self.conv2_num_filters,
filter_size=self.conv2_filter_size,
padding=self.conv2_padding,
bias_attr=self.conv2_bias_attr)
concat = fluid.layers.concat(
[conv_1, conv_2], axis=self.concat_axis)
out = fluid.layers.relu(concat)
self.feeds = {
"data_1": np.random.random((1, 3, 100, 100)).astype("float32"),
"data_2": np.random.random((1, 3, 100, 100)).astype("float32")
}
self.fetch_list = [out]
self.enable_mkldnn = True
def set_params(self):
self.conv1_num_filters = 3
self.conv1_filter_size = 3
self.conv1_padding = 0
self.conv1_bias_attr = False
self.conv2_num_filters = 3
self.conv2_filter_size = 3
self.conv2_padding = 0
self.conv2_bias_attr = False
self.concat_axis = 0
self.pass_name = "conv_concat_relu_mkldnn_fuse_pass"
def test_check_output(self):
use_gpu = False
self.check_output_with_option(use_gpu)
def test_pass_compatible(self):
self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name))
class ConvConcatReluMkldnnFusePassTest_1(ConvConcatReluMkldnnFusePassTest_0):
def set_params(self):
self.conv1_num_filters = 3
self.conv1_filter_size = 3
self.conv1_padding = 0
self.conv1_bias_attr = False
self.conv2_num_filters = 5
self.conv2_filter_size = 5
self.conv2_padding = 1
self.conv2_bias_attr = True
self.concat_axis = 1
self.pass_name = "conv_concat_relu_mkldnn_fuse_pass"
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import AnalysisConfig
from paddle.fluid.core import PassVersionChecker
class MatmulTransposeReshapeMkldnnFusePassTest(InferencePassTest):
def setUp(self):
self.set_params()
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=self.data_shape, dtype="float32")
weight = fluid.layers.create_parameter(
shape=self.weight_shape, dtype="float32")
matmul = fluid.layers.matmul(
data,
weight,
transpose_x=self.transpose_x,
transpose_y=self.transpose_y)
transpose = fluid.layers.transpose(matmul, self.transpose_perm)
reshape = fluid.layers.reshape(transpose, shape=self.reshape_shape)
self.fetch_list = [reshape]
self.enable_mkldnn = True
def set_params(self):
self.data_shape = [-1, 3, 100, 110]
self.weight_shape = [1, 3, 110, 100]
self.feeds = {
"data": np.random.random((1, 3, 100, 110)).astype("float32")
}
self.transpose_x = False
self.transpose_y = False
self.transpose_perm = [0, 2, 1, 3]
self.reshape_shape = [3, 100, 100]
self.pass_name = 'matmul_transpose_reshape_fuse_pass'
def test_check_output(self):
use_gpu = False
self.check_output_with_option(use_gpu)
def test_pass_compatible(self):
self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name))
class MatmulTransposeReshapeMkldnnFusePassTest_1(
MatmulTransposeReshapeMkldnnFusePassTest):
def set_params(self):
self.data_shape = [-1, 3, 100, 100]
self.weight_shape = [1, 3, 100, 100]
self.feeds = {
"data": np.random.random((1, 3, 100, 100)).astype("float32")
}
self.transpose_x = True
self.transpose_y = True
self.transpose_perm = [0, 2, 1, 3]
self.reshape_shape = [6, 50, 100]
self.pass_name = 'matmul_transpose_reshape_fuse_pass'
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import AnalysisConfig
from paddle.fluid.core import PassVersionChecker
class ScaleMatmulMkldnnFusePassTest(InferencePassTest):
def setUp(self):
self.set_params()
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[1, 3, 100, 100], dtype="float32")
weight = fluid.layers.create_parameter(
shape=[1, 3, 100, 100], dtype="float32")
scale = fluid.layers.scale(data, scale=self.scale_scale)
matmul = fluid.layers.matmul(
scale,
weight,
transpose_x=self.transpose_x,
transpose_y=self.transpose_y)
self.fetch_list = [matmul]
self.enable_mkldnn = True
def set_params(self):
self.feeds = {
"data": np.random.random((1, 3, 100, 100)).astype("float32")
}
self.scale_scale = 2.0
self.transpose_x = False
self.transpose_y = False
self.pass_name = "scale_matmul_fuse_pass"
def test_check_output(self):
use_gpu = False
self.check_output_with_option(use_gpu)
def test_pass_compatible(self):
self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name))
class ScaleMatmulMkldnnFusePassTest_1(ScaleMatmulMkldnnFusePassTest):
def set_params(self):
self.feeds = {
"data": np.random.random((1, 3, 100, 100)).astype("float32")
}
self.scale_scale = 5.0
self.transpose_x = True
self.transpose_y = True
self.pass_name = "scale_matmul_fuse_pass"
if __name__ == "__main__":
unittest.main()
...@@ -20,25 +20,13 @@ from inference_pass_test import InferencePassTest ...@@ -20,25 +20,13 @@ from inference_pass_test import InferencePassTest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.core import AnalysisConfig from paddle.fluid.core import AnalysisConfig
from paddle.fluid.core import PassVersionChecker
class ConvBnFusePassMKLDNNTest(InferencePassTest): class SeqConcatFCFusePassTest(InferencePassTest):
def setUp(self): def test_compatible(self):
with fluid.program_guard(self.main_program, self.startup_program): self.assertTrue(
data = fluid.data( PassVersionChecker.IsCompatible('seq_concat_fc_fuse_pass'))
name="data", shape=[-1, 3, 100, 100], dtype="float32")
conv_out = fluid.layers.conv2d(
data, num_filters=3, filter_size=3, bias_attr=False, act="relu")
self.feeds = {
"data": np.random.random((1, 3, 100, 100)).astype("float32")
}
self.fetch_list = [conv_out]
self.enable_mkldnn = True
def test_check_output(self):
use_gpu = False
self.check_output_with_option(use_gpu)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -147,5 +147,29 @@ class TestSquareDoubleGradCheck(unittest.TestCase): ...@@ -147,5 +147,29 @@ class TestSquareDoubleGradCheck(unittest.TestCase):
self.func(p) self.func(p)
class TestAbsDoubleGradCheck(unittest.TestCase):
@prog_scope()
def func(self, place):
# The shape of the input variable must be fully specified and must not include -1.
shape = [2, 3, 7, 9]
eps = 1e-6
dtype = np.float64
x = layers.data('x', shape, False, dtype)
x.persistable = True
y = layers.abs(x)
x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
gradient_checker.double_grad_check(
[x], y, x_init=x_arr, place=place, eps=eps)
def test_grad(self):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
self.func(p)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
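For intuition on what TestAbsDoubleGradCheck exercises: away from zero, abs(x) is linear, so its second derivative vanishes and the double-grad check reduces to verifying the first-order sign. A finite-difference sketch (eps mirrors the test's value; the points avoid the kink at 0):

import numpy as np

eps = 1e-6
x = np.array([-0.8, -0.3, 0.2, 0.9])  # away from the nondifferentiable point 0

grad = (np.abs(x + eps) - np.abs(x - eps)) / (2 * eps)    # d|x|/dx = sign(x)
assert np.allclose(grad, np.sign(x))

ggrad = (np.abs(x + eps) - 2 * np.abs(x) + np.abs(x - eps)) / eps**2
assert np.allclose(ggrad, 0.0, atol=1e-3)                 # second derivative ~ 0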
...@@ -28,6 +28,7 @@ from paddle.fluid import compiler, Program, program_guard ...@@ -28,6 +28,7 @@ from paddle.fluid import compiler, Program, program_guard
class TestSqrtOpError(unittest.TestCase): class TestSqrtOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program(), Program()): with program_guard(Program(), Program()):
# The input type of sqrt op must be Variable or numpy.ndarray. # The input type of sqrt op must be Variable or numpy.ndarray.
in1 = 1 in1 = 1
...@@ -44,6 +45,7 @@ class TestSqrtOpError(unittest.TestCase): ...@@ -44,6 +45,7 @@ class TestSqrtOpError(unittest.TestCase):
class TestActivation(OpTest): class TestActivation(OpTest):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "exp" self.op_type = "exp"
self.init_dtype() self.init_dtype()
self.init_kernel_type() self.init_kernel_type()
...@@ -71,6 +73,7 @@ class TestActivation(OpTest): ...@@ -71,6 +73,7 @@ class TestActivation(OpTest):
class TestParameter(object): class TestParameter(object):
def test_out_name(self): def test_out_name(self):
paddle.enable_static()
with fluid.program_guard(fluid.Program()): with fluid.program_guard(fluid.Program()):
np_x = np.array([0.1]) np_x = np.array([0.1])
data = fluid.layers.data(name="X", shape=[1]) data = fluid.layers.data(name="X", shape=[1])
...@@ -92,6 +95,7 @@ class TestParameter(object): ...@@ -92,6 +95,7 @@ class TestParameter(object):
class TestSigmoid(TestActivation): class TestSigmoid(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "sigmoid" self.op_type = "sigmoid"
self.init_dtype() self.init_dtype()
...@@ -112,6 +116,7 @@ class TestSigmoid(TestActivation): ...@@ -112,6 +116,7 @@ class TestSigmoid(TestActivation):
class TestLogSigmoid(TestActivation): class TestLogSigmoid(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "logsigmoid" self.op_type = "logsigmoid"
self.init_dtype() self.init_dtype()
...@@ -180,6 +185,7 @@ class TestLogSigmoidAPI(unittest.TestCase): ...@@ -180,6 +185,7 @@ class TestLogSigmoidAPI(unittest.TestCase):
class TestTanh(TestActivation, TestParameter): class TestTanh(TestActivation, TestParameter):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "tanh" self.op_type = "tanh"
self.init_dtype() self.init_dtype()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
...@@ -255,6 +261,7 @@ class TestTanhAPI(unittest.TestCase): ...@@ -255,6 +261,7 @@ class TestTanhAPI(unittest.TestCase):
class TestAtan(TestActivation, TestParameter): class TestAtan(TestActivation, TestParameter):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "atan" self.op_type = "atan"
self.init_dtype() self.init_dtype()
...@@ -291,6 +298,7 @@ class TestAtan(TestActivation, TestParameter): ...@@ -291,6 +298,7 @@ class TestAtan(TestActivation, TestParameter):
class TestSinh(TestActivation): class TestSinh(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "sinh" self.op_type = "sinh"
self.init_dtype() self.init_dtype()
...@@ -349,6 +357,7 @@ class TestSinh(TestActivation): ...@@ -349,6 +357,7 @@ class TestSinh(TestActivation):
class TestSinhOpError(unittest.TestCase): class TestSinhOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.sinh, 1) self.assertRaises(TypeError, fluid.layers.sinh, 1)
...@@ -362,6 +371,7 @@ class TestSinhOpError(unittest.TestCase): ...@@ -362,6 +371,7 @@ class TestSinhOpError(unittest.TestCase):
class TestCosh(TestActivation): class TestCosh(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "cosh" self.op_type = "cosh"
self.init_dtype() self.init_dtype()
...@@ -420,6 +430,7 @@ class TestCosh(TestActivation): ...@@ -420,6 +430,7 @@ class TestCosh(TestActivation):
class TestCoshOpError(unittest.TestCase): class TestCoshOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.cosh, 1) self.assertRaises(TypeError, fluid.layers.cosh, 1)
...@@ -438,6 +449,7 @@ def ref_tanhshrink(x): ...@@ -438,6 +449,7 @@ def ref_tanhshrink(x):
class TestTanhshrink(TestActivation): class TestTanhshrink(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "tanh_shrink" self.op_type = "tanh_shrink"
self.init_dtype() self.init_dtype()
...@@ -512,6 +524,7 @@ def ref_hardshrink(x, threshold): ...@@ -512,6 +524,7 @@ def ref_hardshrink(x, threshold):
class TestHardShrink(TestActivation): class TestHardShrink(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "hard_shrink" self.op_type = "hard_shrink"
self.init_dtype() self.init_dtype()
...@@ -541,6 +554,7 @@ class TestHardShrink_threshold_negative(TestHardShrink): ...@@ -541,6 +554,7 @@ class TestHardShrink_threshold_negative(TestHardShrink):
class TestHardShrinkAPI(unittest.TestCase): class TestHardShrinkAPI(unittest.TestCase):
# test paddle.nn.Hardshrink, paddle.nn.functional.hardshrink # test paddle.nn.Hardshrink, paddle.nn.functional.hardshrink
def setUp(self): def setUp(self):
paddle.enable_static()
self.x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32') self.x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32')
self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \
else paddle.CPUPlace() else paddle.CPUPlace()
...@@ -662,6 +676,7 @@ def ref_softshrink(x, threshold=0.5): ...@@ -662,6 +676,7 @@ def ref_softshrink(x, threshold=0.5):
class TestSoftshrink(TestActivation): class TestSoftshrink(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "softshrink" self.op_type = "softshrink"
self.init_dtype() self.init_dtype()
...@@ -736,6 +751,7 @@ class TestSoftshrinkAPI(unittest.TestCase): ...@@ -736,6 +751,7 @@ class TestSoftshrinkAPI(unittest.TestCase):
class TestSqrt(TestActivation, TestParameter): class TestSqrt(TestActivation, TestParameter):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "sqrt" self.op_type = "sqrt"
self.init_dtype() self.init_dtype()
...@@ -753,6 +769,7 @@ class TestSqrt(TestActivation, TestParameter): ...@@ -753,6 +769,7 @@ class TestSqrt(TestActivation, TestParameter):
class TestRsqrt(TestActivation): class TestRsqrt(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "rsqrt" self.op_type = "rsqrt"
self.init_dtype() self.init_dtype()
...@@ -770,6 +787,7 @@ class TestRsqrt(TestActivation): ...@@ -770,6 +787,7 @@ class TestRsqrt(TestActivation):
class TestAbs(TestActivation): class TestAbs(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "abs" self.op_type = "abs"
self.init_dtype() self.init_dtype()
...@@ -792,6 +810,7 @@ class TestAbs(TestActivation): ...@@ -792,6 +810,7 @@ class TestAbs(TestActivation):
class TestCeil(TestActivation): class TestCeil(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "ceil" self.op_type = "ceil"
self.init_dtype() self.init_dtype()
...@@ -808,6 +827,7 @@ class TestCeil(TestActivation): ...@@ -808,6 +827,7 @@ class TestCeil(TestActivation):
class TestFloor(TestActivation): class TestFloor(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "floor" self.op_type = "floor"
self.init_dtype() self.init_dtype()
...@@ -826,6 +846,7 @@ class TestFloor(TestActivation): ...@@ -826,6 +846,7 @@ class TestFloor(TestActivation):
class TestCos(TestActivation): class TestCos(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "cos" self.op_type = "cos"
self.init_dtype() self.init_dtype()
...@@ -843,6 +864,7 @@ class TestCos(TestActivation): ...@@ -843,6 +864,7 @@ class TestCos(TestActivation):
class TestAcos(TestActivation): class TestAcos(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "acos" self.op_type = "acos"
self.init_dtype() self.init_dtype()
...@@ -860,6 +882,7 @@ class TestAcos(TestActivation): ...@@ -860,6 +882,7 @@ class TestAcos(TestActivation):
class TestSin(TestActivation, TestParameter): class TestSin(TestActivation, TestParameter):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "sin" self.op_type = "sin"
self.init_dtype() self.init_dtype()
...@@ -877,6 +900,7 @@ class TestSin(TestActivation, TestParameter): ...@@ -877,6 +900,7 @@ class TestSin(TestActivation, TestParameter):
class TestAsin(TestActivation): class TestAsin(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "asin" self.op_type = "asin"
self.init_dtype() self.init_dtype()
...@@ -894,6 +918,7 @@ class TestAsin(TestActivation): ...@@ -894,6 +918,7 @@ class TestAsin(TestActivation):
class TestRound(TestActivation): class TestRound(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "round" self.op_type = "round"
self.init_dtype() self.init_dtype()
...@@ -909,6 +934,7 @@ class TestRound(TestActivation): ...@@ -909,6 +934,7 @@ class TestRound(TestActivation):
class TestRelu(TestActivation): class TestRelu(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "relu" self.op_type = "relu"
self.init_dtype() self.init_dtype()
...@@ -979,6 +1005,7 @@ class TestLeakyRelu(TestActivation): ...@@ -979,6 +1005,7 @@ class TestLeakyRelu(TestActivation):
return 0.02 return 0.02
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "leaky_relu" self.op_type = "leaky_relu"
self.init_dtype() self.init_dtype()
alpha = self.get_alpha() alpha = self.get_alpha()
...@@ -1084,6 +1111,7 @@ def gelu(x, approximate): ...@@ -1084,6 +1111,7 @@ def gelu(x, approximate):
class TestGeluApproximate(TestActivation): class TestGeluApproximate(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "gelu" self.op_type = "gelu"
self.init_dtype() self.init_dtype()
approximate = True approximate = True
...@@ -1102,6 +1130,7 @@ class TestGeluApproximate(TestActivation): ...@@ -1102,6 +1130,7 @@ class TestGeluApproximate(TestActivation):
class TestGelu(TestActivation): class TestGelu(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "gelu" self.op_type = "gelu"
self.init_dtype() self.init_dtype()
approximate = False approximate = False
...@@ -1169,6 +1198,7 @@ class TestGELUAPI(unittest.TestCase): ...@@ -1169,6 +1198,7 @@ class TestGELUAPI(unittest.TestCase):
class TestBRelu(TestActivation): class TestBRelu(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "brelu" self.op_type = "brelu"
self.init_dtype() self.init_dtype()
...@@ -1194,6 +1224,7 @@ class TestBRelu(TestActivation): ...@@ -1194,6 +1224,7 @@ class TestBRelu(TestActivation):
class TestBReluOpError(unittest.TestCase): class TestBReluOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.brelu, 1) self.assertRaises(TypeError, fluid.layers.brelu, 1)
...@@ -1215,6 +1246,7 @@ def ref_relu6(x, threshold=6.0): ...@@ -1215,6 +1246,7 @@ def ref_relu6(x, threshold=6.0):
class TestRelu6(TestActivation): class TestRelu6(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "relu6" self.op_type = "relu6"
self.init_dtype() self.init_dtype()
...@@ -1286,6 +1318,7 @@ class TestRelu6API(unittest.TestCase): ...@@ -1286,6 +1318,7 @@ class TestRelu6API(unittest.TestCase):
class TestHardSwish(TestActivation): class TestHardSwish(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = 'hard_swish' self.op_type = 'hard_swish'
self.init_dtype() self.init_dtype()
...@@ -1310,6 +1343,7 @@ class TestHardSwish(TestActivation): ...@@ -1310,6 +1343,7 @@ class TestHardSwish(TestActivation):
class TestHardSwishOpError(unittest.TestCase): class TestHardSwishOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.hard_swish, 1) self.assertRaises(TypeError, fluid.layers.hard_swish, 1)
...@@ -1323,6 +1357,7 @@ class TestHardSwishOpError(unittest.TestCase): ...@@ -1323,6 +1357,7 @@ class TestHardSwishOpError(unittest.TestCase):
class TestSoftRelu(TestActivation): class TestSoftRelu(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "soft_relu" self.op_type = "soft_relu"
self.init_dtype() self.init_dtype()
...@@ -1348,6 +1383,7 @@ class TestSoftRelu(TestActivation): ...@@ -1348,6 +1383,7 @@ class TestSoftRelu(TestActivation):
class TestSoftReluOpError(unittest.TestCase): class TestSoftReluOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.soft_relu, 1) self.assertRaises(TypeError, fluid.layers.soft_relu, 1)
...@@ -1366,6 +1402,7 @@ def elu(x, alpha): ...@@ -1366,6 +1402,7 @@ def elu(x, alpha):
class TestELU(TestActivation): class TestELU(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "elu" self.op_type = "elu"
self.init_dtype() self.init_dtype()
...@@ -1435,6 +1472,7 @@ class TestELUAPI(unittest.TestCase): ...@@ -1435,6 +1472,7 @@ class TestELUAPI(unittest.TestCase):
class TestReciprocal(TestActivation): class TestReciprocal(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "reciprocal" self.op_type = "reciprocal"
self.init_dtype() self.init_dtype()
...@@ -1452,6 +1490,7 @@ class TestReciprocal(TestActivation): ...@@ -1452,6 +1490,7 @@ class TestReciprocal(TestActivation):
class TestLog(TestActivation): class TestLog(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "log" self.op_type = "log"
self.init_dtype() self.init_dtype()
...@@ -1478,6 +1517,7 @@ class TestLog(TestActivation): ...@@ -1478,6 +1517,7 @@ class TestLog(TestActivation):
class TestLog1p(TestActivation): class TestLog1p(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "log1p" self.op_type = "log1p"
self.init_dtype() self.init_dtype()
...@@ -1522,6 +1562,7 @@ class TestLog1p(TestActivation): ...@@ -1522,6 +1562,7 @@ class TestLog1p(TestActivation):
class TestSquare(TestActivation): class TestSquare(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "square" self.op_type = "square"
self.init_dtype() self.init_dtype()
...@@ -1539,6 +1580,7 @@ class TestSquare(TestActivation): ...@@ -1539,6 +1580,7 @@ class TestSquare(TestActivation):
class TestPow(TestActivation): class TestPow(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "pow" self.op_type = "pow"
self.init_dtype() self.init_dtype()
...@@ -1557,6 +1599,7 @@ class TestPow(TestActivation): ...@@ -1557,6 +1599,7 @@ class TestPow(TestActivation):
class TestPow_factor_tensor(TestActivation): class TestPow_factor_tensor(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "pow" self.op_type = "pow"
self.init_dtype() self.init_dtype()
...@@ -1633,6 +1676,7 @@ class TestPow_factor_tensor(TestActivation): ...@@ -1633,6 +1676,7 @@ class TestPow_factor_tensor(TestActivation):
class TestSTanh(TestActivation): class TestSTanh(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "stanh" self.op_type = "stanh"
self.init_dtype() self.init_dtype()
...@@ -1653,6 +1697,7 @@ class TestSTanh(TestActivation): ...@@ -1653,6 +1697,7 @@ class TestSTanh(TestActivation):
class TestSTanhOpError(unittest.TestCase): class TestSTanhOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.stanh, 1) self.assertRaises(TypeError, fluid.layers.stanh, 1)
...@@ -1673,6 +1718,7 @@ def ref_softplus(x, beta=1, threshold=20): ...@@ -1673,6 +1718,7 @@ def ref_softplus(x, beta=1, threshold=20):
class TestSoftplus(TestActivation): class TestSoftplus(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "softplus" self.op_type = "softplus"
self.init_dtype() self.init_dtype()
...@@ -1751,6 +1797,7 @@ def ref_softsign(x): ...@@ -1751,6 +1797,7 @@ def ref_softsign(x):
class TestSoftsign(TestActivation): class TestSoftsign(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "softsign" self.op_type = "softsign"
self.init_dtype() self.init_dtype()
...@@ -1818,6 +1865,7 @@ class TestSoftsignAPI(unittest.TestCase): ...@@ -1818,6 +1865,7 @@ class TestSoftsignAPI(unittest.TestCase):
class TestThresholdedRelu(TestActivation): class TestThresholdedRelu(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "thresholded_relu" self.op_type = "thresholded_relu"
self.init_dtype() self.init_dtype()
...@@ -1841,6 +1889,7 @@ class TestThresholdedRelu(TestActivation): ...@@ -1841,6 +1889,7 @@ class TestThresholdedRelu(TestActivation):
class TestThresholdedReluOpError(unittest.TestCase): class TestThresholdedReluOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.thresholded_relu, 1) self.assertRaises(TypeError, fluid.layers.thresholded_relu, 1)
...@@ -1854,6 +1903,7 @@ class TestThresholdedReluOpError(unittest.TestCase): ...@@ -1854,6 +1903,7 @@ class TestThresholdedReluOpError(unittest.TestCase):
class TestHardSigmoid(TestActivation): class TestHardSigmoid(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "hard_sigmoid" self.op_type = "hard_sigmoid"
self.init_dtype() self.init_dtype()
...@@ -1883,6 +1933,7 @@ class TestHardSigmoid(TestActivation): ...@@ -1883,6 +1933,7 @@ class TestHardSigmoid(TestActivation):
class TestHardSigmoidOpError(unittest.TestCase): class TestHardSigmoidOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.hard_sigmoid, 1) self.assertRaises(TypeError, fluid.layers.hard_sigmoid, 1)
...@@ -1896,6 +1947,7 @@ class TestHardSigmoidOpError(unittest.TestCase): ...@@ -1896,6 +1947,7 @@ class TestHardSigmoidOpError(unittest.TestCase):
class TestSwish(TestActivation): class TestSwish(TestActivation):
def setUp(self): def setUp(self):
paddle.enable_static()
self.op_type = "swish" self.op_type = "swish"
self.init_dtype() self.init_dtype()
...@@ -1915,6 +1967,7 @@ class TestSwish(TestActivation): ...@@ -1915,6 +1967,7 @@ class TestSwish(TestActivation):
class TestSwishOpError(unittest.TestCase): class TestSwishOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
paddle.enable_static()
with program_guard(Program()): with program_guard(Program()):
# The input type must be Variable. # The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.swish, 1) self.assertRaises(TypeError, fluid.layers.swish, 1)
......
...@@ -115,8 +115,15 @@ class InplaceTestBase(unittest.TestCase): ...@@ -115,8 +115,15 @@ class InplaceTestBase(unittest.TestCase):
fetch_val2, = exe.run(compiled_prog, fetch_val2, = exe.run(compiled_prog,
feed=feed_dict, feed=feed_dict,
fetch_list=[fetch_var]) fetch_list=[fetch_var])
#NOTE(zhiqiu): Temporarily changed from array_equal to allclose.
self.assertTrue(np.array_equal(fetch_val1, fetch_val2)) # The root cause is that fuse_all_reduce and fuse_all_optimizer_ops may
# introduce small numerical diffs because of the instruction set on the virtual machine.
# The related unit tests, test_fuse_all_reduce_pass and test_fuse_optimizer_pass, also use "almostEqual" in their checks.
# Related issues:
# https://github.com/PaddlePaddle/Paddle/issues/21270
# https://github.com/PaddlePaddle/Paddle/issues/21046
# https://github.com/PaddlePaddle/Paddle/issues/21045
self.assertTrue(np.allclose(fetch_val1, fetch_val2))
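A small illustration of why allclose is the right check here: fused passes can reorder floating-point reductions, and different summation orders round differently, so bitwise equality is too strict. A hedged numpy sketch:

import numpy as np

a = np.random.rand(1000).astype(np.float32)
s1 = a.sum()                 # numpy's pairwise reduction order
s2 = np.float32(0.0)
for v in a:                  # a naive sequential order instead
    s2 += v
print(s1 == s2)              # often False: different rounding paths
print(np.allclose(s1, s2))   # True: the diff is within default tolerances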
def check_multi_card_fetch_var(self): def check_multi_card_fetch_var(self):
if self.is_invalid_test(): if self.is_invalid_test():
...@@ -160,7 +167,8 @@ class InplaceTestBase(unittest.TestCase): ...@@ -160,7 +167,8 @@ class InplaceTestBase(unittest.TestCase):
fetch_vals.append(fetch_val) fetch_vals.append(fetch_val)
for item in fetch_vals: for item in fetch_vals:
self.assertTrue(np.array_equal(fetch_vals[0], item)) # same reason as above
self.assertTrue(np.allclose(fetch_vals[0], item))
class CUDAInplaceTest(InplaceTestBase): class CUDAInplaceTest(InplaceTestBase):
......
...@@ -11,30 +11,23 @@ ...@@ -11,30 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" Check whether ut is disabled. """
import os from __future__ import print_function
import sys import unittest
from test_dist_base import TestDistBase
def check_ut(): class TestDistMnist2x2FP16AllReduce(TestDistBase):
""" Get disabled unit tests. """ def _setup_config(self):
disable_ut_file = 'disable_ut' self._sync_mode = True
cmd = 'wget -q --no-check-certificate https://sys-p0.bj.bcebos.com/prec/{}'.format( self._use_reduce = False
disable_ut_file) self._nccl2_mode = True
os.system(cmd)
with open(disable_ut_file) as utfile:
for u in utfile:
if u.rstrip('\r\n') == sys.argv[1]:
exit(0)
exit(1)
def test_dist_train(self):
import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda():
self.check_with_place("dist_mnist_fp16_allreduce.py", delta=1e-5)
if __name__ == '__main__':
if len(sys.argv) != 2: if __name__ == "__main__":
exit(1) unittest.main()
try:
check_ut()
except Exception as e:
print(e)
exit(1)
...@@ -102,6 +102,16 @@ class TestStrategyConfig(unittest.TestCase): ...@@ -102,6 +102,16 @@ class TestStrategyConfig(unittest.TestCase):
strategy.dgc = "True" strategy.dgc = "True"
self.assertEqual(strategy.dgc, False) self.assertEqual(strategy.dgc, False)
def test_fp16_allreduce(self):
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.fp16_allreduce = True
self.assertEqual(strategy.fp16_allreduce, True)
strategy.fp16_allreduce = False
self.assertEqual(strategy.fp16_allreduce, False)
with self.assertRaises(TypeError):
strategy.fp16_allreduce = "True"
self.assertEqual(strategy.fp16_allreduce, False)
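The assertions above rely on the strategy property rejecting non-bool assignments while keeping the previous value. A hypothetical sketch of that setter pattern (class and field names illustrative, not DistributedStrategy's actual implementation):

class StrategySketch(object):
    def __init__(self):
        self._fp16_allreduce = False

    @property
    def fp16_allreduce(self):
        return self._fp16_allreduce

    @fp16_allreduce.setter
    def fp16_allreduce(self, flag):
        if not isinstance(flag, bool):
            # reject "True" and other non-bools; the old value survives
            raise TypeError("fp16_allreduce expects a bool value")
        self._fp16_allreduce = flag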
def test_sync_nccl_allreduce(self): def test_sync_nccl_allreduce(self):
strategy = paddle.distributed.fleet.DistributedStrategy() strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.sync_nccl_allreduce = True strategy.sync_nccl_allreduce = True
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
import unittest
import paddle
import paddle.fluid as fluid
import os
paddle.enable_static()
class TestFleetFP16CompressOptimizer(unittest.TestCase):
def setUp(self):
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001"
def net(self, main_prog, startup_prog, dtype='float32'):
with fluid.program_guard(main_prog, startup_prog):
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype=dtype)
input_y = paddle.fluid.layers.data(
name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2],
size=2,
act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.fp16_allreduce = True
return avg_cost, strategy
def test_fp16_allreduce_optimizer(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog)
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
ops = [op.type for op in avg_cost.block.ops]
cast_out = [
op.output('Out')[0] for op in avg_cost.block.ops
if op.type == 'cast'
]
cast_op_count = 0
for name in ops:
if name == 'cast':
cast_op_count += 1
self.assertIn('cast', ops)
self.assertEqual(cast_op_count, 12)  # 6 + 6: one cast_fp16 and one cast_fp32 for each of the 6 parameters
for name in cast_out:
self.assertIn('cast_fp16', name)
def test_fp16_allreduce_not_apply_fp16_net(self):
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
train_prog, startup_prog = fluid.Program(), fluid.Program()
avg_cost, strategy = self.net(train_prog, startup_prog, dtype='float16')
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
ops = [op.type for op in avg_cost.block.ops]
self.assertNotIn('cast', ops)
if __name__ == "__main__":
unittest.main()
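For reference, the 12 casts counted above come from the network's six parameters (weight and bias for each of the three fc layers): the fp16-allreduce strategy casts each gradient to fp16 before communication and back to fp32 before the update. A hedged numpy sketch of that round trip (a stand-in for the pass, not its implementation):

import numpy as np

def fp16_allreduce(worker_grads):
    # cast_fp16: shrink each worker's gradient before communication
    sent = [g.astype(np.float16) for g in worker_grads]
    summed = np.sum(sent, axis=0)        # stand-in for the allreduce sum
    # cast_fp32: restore full precision before the parameter update
    return summed.astype(np.float32)

grads = [np.random.rand(4).astype(np.float32) for _ in range(2)]
print(fp16_allreduce(grads))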
...@@ -56,7 +56,10 @@ class TestDygraphGroupNormv2(unittest.TestCase): ...@@ -56,7 +56,10 @@ class TestDygraphGroupNormv2(unittest.TestCase):
x = np.random.randn(*shape).astype("float32") x = np.random.randn(*shape).astype("float32")
y1 = compute_v1(x) y1 = compute_v1(x)
y2 = compute_v2(x) y2 = compute_v2(x)
self.assertTrue(np.allclose(y1, y2)) result = np.allclose(y1, y2)
if not result:
print("y1:", y1, "\ty2:", y2)
self.assertTrue(result)
test_weight_bias_false() test_weight_bias_false()
def test_static(self): def test_static(self):
......
...@@ -25,6 +25,9 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear ...@@ -25,6 +25,9 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
if fluid.is_compiled_with_cuda():
fluid.set_flags({'FLAGS_cudnn_deterministic': True})
batch_size = 8 batch_size = 8
train_parameters = { train_parameters = {
"input_size": [3, 224, 224], "input_size": [3, 224, 224],
...@@ -340,7 +343,9 @@ class TestImperativeResneXt(unittest.TestCase): ...@@ -340,7 +343,9 @@ class TestImperativeResneXt(unittest.TestCase):
label.stop_gradient = True label.stop_gradient = True
out = se_resnext(img) out = se_resnext(img)
loss = fluid.layers.cross_entropy(input=out, label=label) softmax_out = fluid.layers.softmax(out, use_cudnn=False)
loss = fluid.layers.cross_entropy(
input=softmax_out, label=label)
avg_loss = fluid.layers.mean(x=loss) avg_loss = fluid.layers.mean(x=loss)
dy_out = avg_loss.numpy() dy_out = avg_loss.numpy()
...@@ -386,7 +391,8 @@ class TestImperativeResneXt(unittest.TestCase): ...@@ -386,7 +391,8 @@ class TestImperativeResneXt(unittest.TestCase):
name='pixel', shape=[3, 224, 224], dtype='float32') name='pixel', shape=[3, 224, 224], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
out = se_resnext(img) out = se_resnext(img)
loss = fluid.layers.cross_entropy(input=out, label=label) softmax_out = fluid.layers.softmax(out, use_cudnn=False)
loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
avg_loss = fluid.layers.mean(x=loss) avg_loss = fluid.layers.mean(x=loss)
optimizer.minimize(avg_loss) optimizer.minimize(avg_loss)
...@@ -443,7 +449,9 @@ class TestImperativeResneXt(unittest.TestCase): ...@@ -443,7 +449,9 @@ class TestImperativeResneXt(unittest.TestCase):
static_grad_value[static_grad_name_list[ static_grad_value[static_grad_name_list[
i - grad_start_pos]] = out[i] i - grad_start_pos]] = out[i]
self.assertTrue(np.allclose(static_out, dy_out)) self.assertTrue(
np.allclose(static_out, dy_out),
"\nstatic_out: {}\ndy_out: {}".format(static_out, dy_out))
self.assertEqual(len(dy_param_init_value), len(static_param_init_value)) self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
...@@ -455,16 +463,23 @@ class TestImperativeResneXt(unittest.TestCase): ...@@ -455,16 +463,23 @@ class TestImperativeResneXt(unittest.TestCase):
self.assertEqual(len(dy_grad_value), len(static_grad_value)) self.assertEqual(len(dy_grad_value), len(static_grad_value))
for key, value in six.iteritems(static_grad_value): for key, value in six.iteritems(static_grad_value):
self.assertTrue(np.allclose(value, dy_grad_value[key])) self.assertTrue(
np.allclose(value, dy_grad_value[key]),
"\nstatic_grad_value: {}\ndy_grad_value: {}".format(
value, dy_grad_value[key]))
self.assertTrue(np.isfinite(value.all())) self.assertTrue(np.isfinite(value.all()))
self.assertFalse(np.isnan(value.any())) self.assertFalse(np.isnan(value.any()))
self.assertEqual(len(dy_param_value), len(static_param_value)) self.assertEqual(len(dy_param_value), len(static_param_value))
for key, value in six.iteritems(static_param_value): for key, value in six.iteritems(static_param_value):
self.assertTrue(np.allclose(value, dy_param_value[key])) self.assertTrue(
np.allclose(value, dy_param_value[key]),
"\nstatic_param_value: {}\ndy_param_value: {}".format(
value, dy_param_value[key]))
self.assertTrue(np.isfinite(value.all())) self.assertTrue(np.isfinite(value.all()))
self.assertFalse(np.isnan(value.any())) self.assertFalse(np.isnan(value.any()))
if __name__ == '__main__': if __name__ == '__main__':
paddle.enable_static()
unittest.main() unittest.main()
...@@ -65,15 +65,21 @@ class TestMatMulV2Op(OpTest): ...@@ -65,15 +65,21 @@ class TestMatMulV2Op(OpTest):
self.y_shape = (100, ) self.y_shape = (100, )
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
def init_kernel_type(self):
self.dtype = "float64" self.dtype = "float64"
def setUp(self): def setUp(self):
self.init_kernel_type()
self.config() self.config()
self.op_type = "matmul_v2" self.op_type = "matmul_v2"
x = np.random.random(self.x_shape).astype(self.dtype) x = np.random.random(self.x_shape).astype(self.dtype)
y = np.random.random(self.y_shape).astype(self.dtype) y = np.random.random(self.y_shape).astype(self.dtype)
# map the uniform [0, 1) samples into [-0.1, 0.1]
x = -0.1 + 0.2 * x
y = -0.1 + 0.2 * y
result = reference_matmul(x, y, self.trans_x, self.trans_y) result = reference_matmul(x, y, self.trans_x, self.trans_y)
result = result.astype(self.dtype)
self.inputs = { self.inputs = {
'X': x, 'X': x,
'Y': y, 'Y': y,
...@@ -98,7 +104,6 @@ class TestMatMuklOp2(TestMatMulV2Op): ...@@ -98,7 +104,6 @@ class TestMatMuklOp2(TestMatMulV2Op):
self.y_shape = (1, 3, 2, 100) self.y_shape = (1, 3, 2, 100)
self.trans_x = False self.trans_x = False
self.trans_y = True self.trans_y = True
self.dtype = "float64"
class TestMatMuklOp3(TestMatMulV2Op): class TestMatMuklOp3(TestMatMulV2Op):
...@@ -111,7 +116,6 @@ class TestMatMuklOp3(TestMatMulV2Op): ...@@ -111,7 +116,6 @@ class TestMatMuklOp3(TestMatMulV2Op):
self.y_shape = (1, 1, 100, 2) self.y_shape = (1, 1, 100, 2)
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp4(TestMatMulV2Op): class TestMatMuklOp4(TestMatMulV2Op):
...@@ -124,7 +128,6 @@ class TestMatMuklOp4(TestMatMulV2Op): ...@@ -124,7 +128,6 @@ class TestMatMuklOp4(TestMatMulV2Op):
self.y_shape = (1, 2, 100, 2) self.y_shape = (1, 2, 100, 2)
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp5(TestMatMulV2Op): class TestMatMuklOp5(TestMatMulV2Op):
...@@ -133,11 +136,10 @@ class TestMatMuklOp5(TestMatMulV2Op): ...@@ -133,11 +136,10 @@ class TestMatMuklOp5(TestMatMulV2Op):
""" """
def config(self): def config(self):
self.x_shape = (1, 1, 100, 2) self.x_shape = (1, 1, 100, 1)
self.y_shape = (100, ) self.y_shape = (100, )
self.trans_x = True self.trans_x = True
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp6(TestMatMulV2Op): class TestMatMuklOp6(TestMatMulV2Op):
...@@ -150,7 +152,6 @@ class TestMatMuklOp6(TestMatMulV2Op): ...@@ -150,7 +152,6 @@ class TestMatMuklOp6(TestMatMulV2Op):
self.y_shape = (100, ) self.y_shape = (100, )
self.trans_x = True self.trans_x = True
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp7(TestMatMulV2Op): class TestMatMuklOp7(TestMatMulV2Op):
...@@ -163,7 +164,6 @@ class TestMatMuklOp7(TestMatMulV2Op): ...@@ -163,7 +164,6 @@ class TestMatMuklOp7(TestMatMulV2Op):
self.y_shape = (100, ) self.y_shape = (100, )
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp8(TestMatMulV2Op): class TestMatMuklOp8(TestMatMulV2Op):
...@@ -176,7 +176,6 @@ class TestMatMuklOp8(TestMatMulV2Op): ...@@ -176,7 +176,6 @@ class TestMatMuklOp8(TestMatMulV2Op):
self.y_shape = (1, 1, 100, 2) self.y_shape = (1, 1, 100, 2)
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp9(TestMatMulV2Op): class TestMatMuklOp9(TestMatMulV2Op):
...@@ -189,7 +188,6 @@ class TestMatMuklOp9(TestMatMulV2Op): ...@@ -189,7 +188,6 @@ class TestMatMuklOp9(TestMatMulV2Op):
self.y_shape = (2, 1, 2, 100) self.y_shape = (2, 1, 2, 100)
self.trans_x = False self.trans_x = False
self.trans_y = True self.trans_y = True
self.dtype = "float64"
class TestMatMuklOp10(TestMatMulV2Op): class TestMatMuklOp10(TestMatMulV2Op):
...@@ -198,11 +196,10 @@ class TestMatMuklOp10(TestMatMulV2Op): ...@@ -198,11 +196,10 @@ class TestMatMuklOp10(TestMatMulV2Op):
""" """
def config(self): def config(self):
self.x_shape = (1, 1, 2, 100) self.x_shape = (1, 1, 25, 4)
self.y_shape = (1, 2, 100, 2) self.y_shape = (1, 2, 4, 25)
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp11(TestMatMulV2Op): class TestMatMuklOp11(TestMatMulV2Op):
...@@ -215,7 +212,6 @@ class TestMatMuklOp11(TestMatMulV2Op): ...@@ -215,7 +212,6 @@ class TestMatMuklOp11(TestMatMulV2Op):
self.y_shape = (1, 1, 100, 2) self.y_shape = (1, 1, 100, 2)
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp12(TestMatMulV2Op): class TestMatMuklOp12(TestMatMulV2Op):
...@@ -224,11 +220,10 @@ class TestMatMuklOp12(TestMatMulV2Op): ...@@ -224,11 +220,10 @@ class TestMatMuklOp12(TestMatMulV2Op):
""" """
def config(self): def config(self):
self.x_shape = (2, 1, 100, 2) self.x_shape = (2, 1, 4, 25)
self.y_shape = (1, 1, 100, 2) self.y_shape = (1, 1, 4, 25)
self.trans_x = True self.trans_x = True
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp13(TestMatMulV2Op): class TestMatMuklOp13(TestMatMulV2Op):
...@@ -237,11 +232,10 @@ class TestMatMuklOp13(TestMatMulV2Op): ...@@ -237,11 +232,10 @@ class TestMatMuklOp13(TestMatMulV2Op):
""" """
def config(self): def config(self):
self.x_shape = (2, 2, 100, 2) self.x_shape = (2, 2, 2, 50)
self.y_shape = (2, 2, 100, 2) self.y_shape = (2, 2, 2, 50)
self.trans_x = True self.trans_x = True
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp14(TestMatMulV2Op): class TestMatMuklOp14(TestMatMulV2Op):
...@@ -254,7 +248,6 @@ class TestMatMuklOp14(TestMatMulV2Op): ...@@ -254,7 +248,6 @@ class TestMatMuklOp14(TestMatMulV2Op):
self.y_shape = (1, 2, 2, 100, 2) self.y_shape = (1, 2, 2, 100, 2)
self.trans_x = True self.trans_x = True
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp15(TestMatMulV2Op): class TestMatMuklOp15(TestMatMulV2Op):
...@@ -267,7 +260,6 @@ class TestMatMuklOp15(TestMatMulV2Op): ...@@ -267,7 +260,6 @@ class TestMatMuklOp15(TestMatMulV2Op):
self.y_shape = (1, 2, 2, 100, 1) self.y_shape = (1, 2, 2, 100, 1)
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp16(TestMatMulV2Op): class TestMatMuklOp16(TestMatMulV2Op):
...@@ -277,10 +269,9 @@ class TestMatMuklOp16(TestMatMulV2Op): ...@@ -277,10 +269,9 @@ class TestMatMuklOp16(TestMatMulV2Op):
def config(self): def config(self):
self.x_shape = (100) self.x_shape = (100)
self.y_shape = (1, 2, 2, 100, 1) self.y_shape = (1, 2, 2, 100, 2)
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
class TestMatMuklOp17(TestMatMulV2Op): class TestMatMuklOp17(TestMatMulV2Op):
...@@ -293,7 +284,54 @@ class TestMatMuklOp17(TestMatMulV2Op): ...@@ -293,7 +284,54 @@ class TestMatMuklOp17(TestMatMulV2Op):
self.y_shape = (100) self.y_shape = (100)
self.trans_x = False self.trans_x = False
self.trans_y = False self.trans_y = False
self.dtype = "float64"
#--------------------test matmul fp16--------------------
def create_test_fp16_class(parent, atol=0.001, max_relative_error=1.0):
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestMatMulOpFp16Case(parent):
def init_kernel_type(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=atol)
def test_check_grad(self):
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_grad_with_place(
place, ['X', 'Y'],
'Out',
max_relative_error=max_relative_error)
cls_name = "{0}_{1}".format(parent.__name__, "Fp16")
TestMatMulOpFp16Case.__name__ = cls_name
globals()[cls_name] = TestMatMulOpFp16Case
create_test_fp16_class(TestMatMulV2Op)
create_test_fp16_class(TestMatMuklOp2)
create_test_fp16_class(TestMatMuklOp3)
create_test_fp16_class(TestMatMuklOp4)
create_test_fp16_class(TestMatMuklOp5)
create_test_fp16_class(TestMatMuklOp6)
create_test_fp16_class(TestMatMuklOp7)
create_test_fp16_class(TestMatMuklOp8)
create_test_fp16_class(TestMatMuklOp9)
create_test_fp16_class(TestMatMuklOp10)
create_test_fp16_class(TestMatMuklOp11)
create_test_fp16_class(TestMatMuklOp12)
create_test_fp16_class(TestMatMuklOp13)
create_test_fp16_class(TestMatMuklOp14)
create_test_fp16_class(TestMatMuklOp15)
create_test_fp16_class(TestMatMuklOp16)
create_test_fp16_class(TestMatMuklOp17)
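A note on the registration trick above: unittest only discovers test classes bound to module-level names, which is why each generated subclass is renamed and written into globals(). A minimal sketch of the same pattern:

import unittest

def create_case(parent, dtype, tag):
    class Case(parent):
        def init_kernel_type(self):
            self.dtype = dtype
    Case.__name__ = "{0}_{1}".format(parent.__name__, tag)
    globals()[Case.__name__] = Case  # module-level name => discoverable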
class TestMatMulV2API(unittest.TestCase): class TestMatMulV2API(unittest.TestCase):
...@@ -331,6 +369,17 @@ class TestMatMulV2API(unittest.TestCase): ...@@ -331,6 +369,17 @@ class TestMatMulV2API(unittest.TestCase):
y = paddle.to_tensor(input_y) y = paddle.to_tensor(input_y)
result = paddle.matmul(x, y) result = paddle.matmul(x, y)
def test_dygraph_fp16(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
with fluid.dygraph.guard(place):
input_x = np.random.random([4, 3]).astype("float16")
input_y = np.random.random([3, 4]).astype("float16")
x = paddle.to_tensor(input_x)
y = paddle.to_tensor(input_y)
result = paddle.matmul(x, y)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase): ...@@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase):
dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM) dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
dataloader = DataLoader( dataloader = DataLoader(
dataset, dataset,
places=places,
num_workers=num_workers, num_workers=num_workers,
batch_size=BATCH_SIZE, batch_size=BATCH_SIZE,
drop_last=True) drop_last=True)
......
...@@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase): ...@@ -76,7 +76,6 @@ class TestDygraphDataLoader(unittest.TestCase):
dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM) dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM)
dataloader = DataLoader( dataloader = DataLoader(
dataset, dataset,
places=places,
num_workers=num_workers, num_workers=num_workers,
batch_size=BATCH_SIZE, batch_size=BATCH_SIZE,
drop_last=True) drop_last=True)
......
...@@ -130,5 +130,41 @@ class TestBatchNormDoubleGradCheckCase4(TestBatchNormDoubleGradCheck): ...@@ -130,5 +130,41 @@ class TestBatchNormDoubleGradCheckCase4(TestBatchNormDoubleGradCheck):
self.shape = [2, 2, 3, 4, 5] self.shape = [2, 2, 3, 4, 5]
class TestBatchNormDoubleGradCheckCase5(TestBatchNormDoubleGradCheck):
@prog_scope()
def func(self, place):
prog = fluid.Program()
with fluid.program_guard(prog):
np.random.seed()
dtype = "float32"
eps = 0.005
atol = 2e-4
chn = self.shape[1] if self.data_layout == 'NCHW' else self.shape[
-1]
x = layers.create_parameter(dtype=dtype, shape=self.shape, name='x')
z = fluid.layers.batch_norm(
input=x,
data_layout=self.data_layout,
use_global_stats=self.use_global_stats)
x_arr = np.random.uniform(-1, 1, self.shape).astype(dtype)
w, b = prog.global_block().all_parameters()[1:3]
w_arr = np.ones(chn).astype(dtype)
b_arr = np.zeros(chn).astype(dtype)
gradient_checker.double_grad_check(
[x, w, b],
z,
x_init=[x_arr, w_arr, b_arr],
atol=atol,
place=place,
eps=eps)
class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5):
def init_test(self):
self.data_layout = 'NCHW'
self.use_global_stats = True
self.shape = [2, 3, 4, 5]
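Case 6 above turns on use_global_stats, under which batch_norm normalizes with the stored running statistics instead of batch statistics, making the op an elementwise affine map of x. A hedged numpy sketch of that inference-style formula, with the same all-ones scale and all-zeros bias the test uses:

import numpy as np

def bn_global_stats(x, mean, var, w, b, eps=1e-5):
    shp = (1, -1, 1, 1)  # NCHW: broadcast per-channel stats over N, H, W
    x_hat = (x - mean.reshape(shp)) / np.sqrt(var.reshape(shp) + eps)
    return x_hat * w.reshape(shp) + b.reshape(shp)

x = np.random.uniform(-1, 1, [2, 3, 4, 5]).astype("float32")
chn = x.shape[1]
y = bn_global_stats(x, np.zeros(chn, "float32"), np.ones(chn, "float32"),
                    np.ones(chn, "float32"), np.zeros(chn, "float32"))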
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -29,19 +29,23 @@ IMAGE_SIZE = 784 ...@@ -29,19 +29,23 @@ IMAGE_SIZE = 784
CLASS_NUM = 10 CLASS_NUM = 10
# define a random dataset def random_batch_reader():
class RandomDataset(paddle.io.Dataset): def _get_random_inputs_and_labels():
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
np.random.seed(SEED) np.random.seed(SEED)
image = np.random.random([IMAGE_SIZE]).astype('float32') image = np.random.random([BATCH_SIZE, IMAGE_SIZE]).astype('float32')
label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') label = np.random.randint(0, CLASS_NUM - 1, (
BATCH_SIZE,
1, )).astype('int64')
return image, label return image, label
def __len__(self): def __reader__():
return self.num_samples for _ in range(BATCH_NUM):
batch_image, batch_label = _get_random_inputs_and_labels()
batch_image = paddle.to_tensor(batch_image)
batch_label = paddle.to_tensor(batch_label)
yield batch_image, batch_label
return __reader__
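A hedged usage sketch of the reader factory above: random_batch_reader() returns the generator function itself, so a consumer calls it to obtain a fresh iterator per pass (loop body illustrative):

loader = random_batch_reader()
for image, label in loader():
    pass  # forward/backward on one random (BATCH_SIZE, IMAGE_SIZE) batch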
class LinearNet(nn.Layer): class LinearNet(nn.Layer):
...@@ -66,8 +70,7 @@ def train(layer, loader, loss_fn, opt): ...@@ -66,8 +70,7 @@ def train(layer, loader, loss_fn, opt):
class TestSaveLoad(unittest.TestCase): class TestSaveLoad(unittest.TestCase):
def setUp(self): def setUp(self):
# enable dygraph mode # enable dygraph mode
self.place = paddle.CPUPlace() paddle.disable_static()
paddle.disable_static(self.place)
# config seed # config seed
paddle.manual_seed(SEED) paddle.manual_seed(SEED)
...@@ -81,14 +84,8 @@ class TestSaveLoad(unittest.TestCase): ...@@ -81,14 +84,8 @@ class TestSaveLoad(unittest.TestCase):
adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters()) adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters())
# create data loader # create data loader
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) # TODO: the new DataLoader causes an unknown timeout on Windows, so replace it
loader = paddle.io.DataLoader( loader = random_batch_reader()
dataset,
places=self.place,
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True,
num_workers=2)
# train # train
train(layer, loader, loss_fn, adam) train(layer, loader, loss_fn, adam)
...@@ -103,8 +100,8 @@ class TestSaveLoad(unittest.TestCase): ...@@ -103,8 +100,8 @@ class TestSaveLoad(unittest.TestCase):
layer, opt = self.build_and_train_model() layer, opt = self.build_and_train_model()
# save # save
layer_save_path = "linear.pdparams" layer_save_path = "test_paddle_save_load.linear.pdparams"
opt_save_path = "linear.pdopt" opt_save_path = "test_paddle_save_load.linear.pdopt"
layer_state_dict = layer.state_dict() layer_state_dict = layer.state_dict()
opt_state_dict = opt.state_dict() opt_state_dict = opt.state_dict()
...@@ -120,7 +117,7 @@ class TestSaveLoad(unittest.TestCase): ...@@ -120,7 +117,7 @@ class TestSaveLoad(unittest.TestCase):
# test save load in static mode # test save load in static mode
paddle.enable_static() paddle.enable_static()
static_save_path = "static_mode_test/linear.pdparams" static_save_path = "static_mode_test/test_paddle_save_load.linear.pdparams"
paddle.save(layer_state_dict, static_save_path) paddle.save(layer_state_dict, static_save_path)
load_static_state_dict = paddle.load(static_save_path) load_static_state_dict = paddle.load(static_save_path)
self.check_load_state_dict(layer_state_dict, load_static_state_dict) self.check_load_state_dict(layer_state_dict, load_static_state_dict)
...@@ -133,15 +130,15 @@ class TestSaveLoad(unittest.TestCase): ...@@ -133,15 +130,15 @@ class TestSaveLoad(unittest.TestCase):
# 2. test save path format error # 2. test save path format error
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
paddle.save(layer_state_dict, "linear.model/") paddle.save(layer_state_dict, "test_paddle_save_load.linear.model/")
# 3. test load path not exist error # 3. test load path not exist error
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
paddle.load("linear.params") paddle.load("test_paddle_save_load.linear.params")
# 4. test load old save path error # 4. test load old save path error
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
paddle.load("linear") paddle.load("test_paddle_save_load.linear")
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -156,8 +156,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): ...@@ -156,8 +156,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
def __check_input(x, y): def __check_input(x, y):
var_names = {'x': x, 'y': y} var_names = {'x': x, 'y': y}
for name, val in var_names.items(): for name, val in var_names.items():
check_variable_and_dtype(val, name, ['float32', 'float64'], check_variable_and_dtype(
'matmul') val, name, ['float16', 'float32', 'float64'], 'matmul')
__check_input(x, y) __check_input(x, y)
......
...@@ -286,7 +286,7 @@ fi ...@@ -286,7 +286,7 @@ fi
# Get the list of PR authors with unresolved unit test issues # Get the list of PR authors with unresolved unit test issues
pip install PyGithub pip install PyGithub
# For getting PR related data # For getting PR related data
wget https://sys-p0.bj.bcebos.com/blk/block.txt --no-check-certificate wget https://paddle-ci.gz.bcebos.com/blk/block.txt --no-check-certificate
wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt --no-check-certificate wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt --no-check-certificate
HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true` HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true`
if [ "${HASUTFIXED}" != "" ]; then if [ "${HASUTFIXED}" != "" ]; then
......
...@@ -80,9 +80,7 @@ RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ ...@@ -80,9 +80,7 @@ RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
make -j8 && make install && \ make -j8 && make install && \
ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache
# gcc4.8 TRT # Downgrade gcc && g++
RUN mkdir -p /opt/compiler && cd /opt/compiler && \ <install_gcc>
wget -q https://paddle-ci.gz.bcebos.com/gcc-4.8.2.tar.gz && \
tar xf gcc-4.8.2.tar.gz && rm -f gcc-4.8.2.tar.gz
CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"] CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
...@@ -21,7 +21,7 @@ function make_ubuntu_dockerfile(){ ...@@ -21,7 +21,7 @@ function make_ubuntu_dockerfile(){
function make_centos_dockerfile(){ function make_centos_dockerfile(){
dockerfile_name="Dockerfile.cuda9_cudnn7_gcc48_py35_centos6" dockerfile_name="Dockerfile.cuda9_cudnn7_gcc48_py35_centos6"
sed 's/<baseimg>/9.0-cudnn7-devel-centos6/g' Dockerfile.centos >${dockerfile_name} sed 's/<baseimg>/10.2-cudnn7-devel-centos6/g' Dockerfile.centos >${dockerfile_name}
sed -i 's#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g' ${dockerfile_name} sed -i 's#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g' ${dockerfile_name}
dockerfile_line=`wc -l ${dockerfile_name}|awk '{print $1}'` dockerfile_line=`wc -l ${dockerfile_name}|awk '{print $1}'`
sed -i "${dockerfile_line}i RUN ln -s /usr/lib64/libz.so /usr/local/lib/libz.so && \ sed -i "${dockerfile_line}i RUN ln -s /usr/lib64/libz.so /usr/local/lib/libz.so && \
...@@ -29,6 +29,15 @@ function make_centos_dockerfile(){ ...@@ -29,6 +29,15 @@ function make_centos_dockerfile(){
rm -rf /usr/include/NvInfer*" ${dockerfile_name} rm -rf /usr/include/NvInfer*" ${dockerfile_name}
sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \ sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \
tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name}
sed -i 's#<install_gcc>#WORKDIR /usr/bin \
COPY tools/dockerfile/build_scripts /build_scripts \
RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \
RUN cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \
RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc \
RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ \
RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \
RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \
ENV PATH=/usr/local/gcc-8.2/bin:$PATH #g' ${dockerfile_name}
} }
......
...@@ -534,13 +534,6 @@ def get_incrementapi(): ...@@ -534,13 +534,6 @@ def get_incrementapi():
f.write('\n') f.write('\n')
# only white on CPU
gpu_not_white = [
"deformable_conv", "cuda_places", "CUDAPinnedPlace", "CUDAPlace",
"cuda_profiler", 'DGCMomentumOptimizer'
]
def get_wlist(): def get_wlist():
''' '''
this function will get the white list of API. this function will get the white list of API.
...@@ -552,17 +545,25 @@ def get_wlist(): ...@@ -552,17 +545,25 @@ def get_wlist():
''' '''
wlist = [] wlist = []
wlist_file = [] wlist_file = []
# only white on CPU
gpu_not_white = []
with open("wlist.json", 'r') as load_f: with open("wlist.json", 'r') as load_f:
load_dict = json.load(load_f) load_dict = json.load(load_f)
for key in load_dict: for key in load_dict:
if key == 'wlist_file': if key == 'wlist_dir':
wlist_file = wlist_file + load_dict[key] for item in load_dict[key]:
wlist_file.append(item["name"])
elif key == "gpu_not_white":
gpu_not_white = load_dict[key]
elif key == "wlist_api":
for item in load_dict[key]:
wlist.append(item["name"])
else: else:
wlist = wlist + load_dict[key] wlist = wlist + load_dict[key]
return wlist, wlist_file return wlist, wlist_file, gpu_not_white
wlist, wlist_file = get_wlist() wlist, wlist_file, gpu_not_white = get_wlist()
if len(sys.argv) < 2: if len(sys.argv) < 2:
print("Error: inadequate number of arguments") print("Error: inadequate number of arguments")
......
{ {
"wlist_file" : [ "wlist_dir":[
"../python/paddle/fluid/contrib", {
"../python/paddle/verison.py", "name":"../python/paddle/fluid/contrib",
"../python/paddle/fluid/core_avx.py", "annotation":""
"../python/paddle/distributed" },
{
"name":"../python/paddle/verison.py",
"annotation":""
},
{
"name":"../python/paddle/fluid/core_avx.py",
"annotation":""
},
{
"name":"../python/paddle/distributed",
"annotation":""
}
], ],
"wlist_inneed":[ "wlist_api":[
{
"name":"xxxxx",
"annotation":"not a real api, just for example"
}
],
"wlist_temp_api":[
"append_LARS", "append_LARS",
"BuildStrategy.debug_graphviz_path", "BuildStrategy.debug_graphviz_path",
"BuildStrategy.enable_sequential_execution", "BuildStrategy.enable_sequential_execution",
...@@ -63,9 +81,7 @@ ...@@ -63,9 +81,7 @@
"cuda_places", "cuda_places",
"CUDAPinnedPlace", "CUDAPinnedPlace",
"CUDAPlace", "CUDAPlace",
"Program.parse_from_string" "Program.parse_from_string",
],
"wlist_nosample":[
"Compressor", "Compressor",
"Compressor.config", "Compressor.config",
"Compressor.run", "Compressor.run",
...@@ -159,13 +175,9 @@ ...@@ -159,13 +175,9 @@
"RNN", "RNN",
"BiRNN", "BiRNN",
"RNNCellBase", "RNNCellBase",
"RNNCellBase.get_initial_states" "RNNCellBase.get_initial_states",
],
"wlist_no_op_pass":[
"gelu", "gelu",
"erf" "erf",
],
"wlist_ci_nopass":[
"DecodeHelper", "DecodeHelper",
"DecodeHelper.initialize", "DecodeHelper.initialize",
"DecodeHelper.sample", "DecodeHelper.sample",
...@@ -188,9 +200,7 @@ ...@@ -188,9 +200,7 @@
"SampleEmbeddingHelper", "SampleEmbeddingHelper",
"BasicDecoder", "BasicDecoder",
"lstm", "lstm",
"partial_sum" "partial_sum",
],
"wlist_nopass":[
"StateCell", "StateCell",
"StateCell.compute_state", "StateCell.compute_state",
"TrainingDecoder", "TrainingDecoder",
...@@ -242,9 +252,7 @@ ...@@ -242,9 +252,7 @@
"GroupNorm", "GroupNorm",
"SpectralNorm", "SpectralNorm",
"TreeConv", "TreeConv",
"prroi_pool" "prroi_pool",
],
"wlist_temp":[
"to_tensor", "to_tensor",
"ChunkEvaluator", "ChunkEvaluator",
"EditDistance", "EditDistance",
...@@ -322,9 +330,7 @@ ...@@ -322,9 +330,7 @@
"Conv2DTranspose", "Conv2DTranspose",
"QueueDataset.local_shuffle", "QueueDataset.local_shuffle",
"save_persistables@dygraph/checkpoint.py", "save_persistables@dygraph/checkpoint.py",
"load_persistables@dygraph/checkpoint.py" "load_persistables@dygraph/checkpoint.py",
],
"wlist_ignore":[
"elementwise_pow", "elementwise_pow",
"WeightedAverage.reset", "WeightedAverage.reset",
"ChunkEvaluator.eval", "ChunkEvaluator.eval",
...@@ -401,5 +407,13 @@ ...@@ -401,5 +407,13 @@
"LinearChainCRF.forward", "LinearChainCRF.forward",
"CRFDecoding.forward", "CRFDecoding.forward",
"SequenceTagging.forward" "SequenceTagging.forward"
],
"gpu_not_white":[
"deformable_conv",
"cuda_places",
"CUDAPinnedPlace",
"CUDAPlace",
"cuda_profiler",
"DGCMomentumOptimizer"
] ]
} }