From 4069262f0e06da6f240ac4c9e90ba0403a94bc4d Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Fri, 17 Aug 2018 10:58:01 +0800
Subject: [PATCH] Revert ""cherry picked operators changes" (#12184)" (#12747)

This reverts commit bf3c34960f2a59a2616957f8fb4107b2ac7aa02b.
---
 paddle/fluid/operators/activation_op.cu       |  4 +-
 paddle/fluid/operators/activation_op.h        | 12 ++--
 paddle/fluid/operators/assign_value_op.cu.cc  |  5 +-
 paddle/fluid/operators/conv_cudnn_op.cu.cc    | 56 +++++++-----------
 paddle/fluid/operators/cross_entropy_op.cu    | 12 ++--
 paddle/fluid/operators/elementwise_add_op.cu  |  3 +-
 paddle/fluid/operators/elementwise_div_op.cu  |  9 +--
 paddle/fluid/operators/elementwise_mul_op.cu  |  8 +--
 .../fluid/operators/elementwise_op_function.h |  4 +-
 paddle/fluid/operators/elementwise_sub_op.cu  |  8 +--
 paddle/fluid/operators/fill_constant_op.cc    | 53 +++++++++++------
 paddle/fluid/operators/fill_constant_op.cu.cc | 26 --------
 paddle/fluid/operators/fill_constant_op.h     | 48 ---------------
 paddle/fluid/operators/fill_op.cc             |  2 +-
 paddle/fluid/operators/gaussian_random_op.cu  |  2 -
 paddle/fluid/operators/math/cross_entropy.cu  | 20 +------
 paddle/fluid/operators/math/cross_entropy.h   | 17 ------
 .../operators/math/selected_rows_functor.cu   | 13 +---
 paddle/fluid/operators/math/softmax.cu        |  3 -
 paddle/fluid/operators/mean_op.cu             | 10 ++--
 paddle/fluid/operators/mean_op.h              |  2 +-
 paddle/fluid/operators/mul_op.cu.cc           |  7 +--
 paddle/fluid/operators/pool_cudnn_op.cu.cc    |  6 +-
 paddle/fluid/operators/scale_op.cu            |  6 +-
 paddle/fluid/operators/softmax_cudnn_op.cu.cc |  3 +-
 paddle/fluid/operators/softmax_op.cu.cc       |  3 +-
 paddle/fluid/operators/sum_op.cu              |  5 +-
 paddle/fluid/operators/sum_op.h               |  2 +-
 paddle/fluid/operators/top_k_op.cu            | 28 ++-------
 paddle/fluid/operators/uniform_random_op.cu   | 59 +++----------------
 30 files changed, 108 insertions(+), 328 deletions(-)
 delete mode 100644 paddle/fluid/operators/fill_constant_op.cu.cc
 delete mode 100644 paddle/fluid/operators/fill_constant_op.h
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index d3a7ceed4..27487b396 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -26,8 +26,6 @@ namespace plat = paddle::platform;
       act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,   \
                                                  ops::grad_functor<float>>, \
       ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
-                                ops::grad_functor<double>>,                 \
-      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
-                                ops::grad_functor<plat::float16>>);
+                                ops::grad_functor<double>>);
 
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 48f3b5a5b..912415192 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -333,7 +333,8 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = static_cast<T>(0.5) * dout / out;
+    const Out out_conj = Eigen::numext::conj(out);
+    dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
   }
 };
 
@@ -739,7 +740,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * static_cast<T>(factor) *
-                   x.pow(static_cast<T>(factor) - static_cast<T>(1));
+                   x.pow(static_cast<T>(factor - static_cast<T>(1)));
   }
 };
 
@@ -862,11 +863,10 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    T b = static_cast<T>(beta);
     auto temp1 = static_cast<T>(1) /
-                 (static_cast<T>(1) + (static_cast<T>(-b) * x).exp());
-    auto temp2 = temp1 * (static_cast<T>(1) - (b * out));
-    dx.device(d) = dout * ((b * out) + temp2);
+                 (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
+    auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
+    dx.device(d) = dout * ((beta * out) + temp2);
   }
 };
 
diff --git a/paddle/fluid/operators/assign_value_op.cu.cc b/paddle/fluid/operators/assign_value_op.cu.cc
index 0ff174b38..08bfde5dc 100644
--- a/paddle/fluid/operators/assign_value_op.cu.cc
+++ b/paddle/fluid/operators/assign_value_op.cu.cc
@@ -13,10 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/assign_value_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>,
-                        ops::AssignValueKernel<float>,
-                        ops::AssignValueKernel<plat::float16>);
+                        ops::AssignValueKernel<float>);
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index 59bfe8f61..22cbf680c 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -39,27 +39,6 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
     static_cast<size_t>(1024) * 1024 * 1024;
 
-template <typename T, typename DeviceContext>
-// bool EnableFp16(const T& dummy, const DeviceContext& dev_ctx,
-bool EnableFp16(const DeviceContext& dev_ctx,
-                cudnnConvolutionDescriptor_t cudnn_conv_desc) {
-#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
-  // Tensor core is supported since the volta GPU and
-  // is only enabled when input and filter data are float16
-  if (dev_ctx.GetComputeCapability() >= 70 &&
-      std::type_index(typeid(T)) ==
-          std::type_index(typeid(platform::float16))) {
-    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-        cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
-    return true;
-  } else {
-    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-        cudnn_conv_desc, CUDNN_DEFAULT_MATH));
-  }
-#endif
-  return false;
-}
-
 template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
  public:
@@ -149,14 +128,27 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionFwdAlgo_t algo;
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
-    if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
+
+    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+        cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        workspace_size_limit, &algo));
+
+#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
+    // Tensor core is supported since the volta GPU and
+    // is only enabled when input and filter data are float16
+    if (dev_ctx.GetComputeCapability() >= 70 &&
+        std::type_index(typeid(T)) ==
+            std::type_index(typeid(platform::float16))) {
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
+      // Currently tensor core is only enabled using this algo
       algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
     } else {
-      PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-          cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-          workspace_size_limit, &algo));
+      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_DEFAULT_MATH));
     }
+#endif
 
     // get workspace size able to allocate
     CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
@@ -296,9 +288,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       } else {
         data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
       }
-      if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
-        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
-      }
 
       CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
@@ -318,9 +307,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
       } else {
         filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
       }
-      if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
-        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
-      }
 
       CUDNN_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
@@ -376,8 +362,7 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>,
-                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
+                   paddle::operators::CUDNNConvGradOpKernel<double>);
 
 REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<float>,
@@ -385,5 +370,4 @@ REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>,
-                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>)
+                   paddle::operators::CUDNNConvGradOpKernel<double>);
diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu
index 65fd3a5db..30dbd5bd3 100644
--- a/paddle/fluid/operators/cross_entropy_op.cu
+++ b/paddle/fluid/operators/cross_entropy_op.cu
@@ -13,16 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cross_entropy_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 using CUDACtx = paddle::platform::CUDADeviceContext;
 REGISTER_OP_CUDA_KERNEL(cross_entropy,
                         ops::CrossEntropyOpKernel<CUDACtx, float>,
-                        ops::CrossEntropyOpKernel<CUDACtx, double>,
-                        ops::CrossEntropyOpKernel<CUDACtx, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
-    ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
-    ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
+                        ops::CrossEntropyOpKernel<CUDACtx, double>);
+REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
+                        ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
+                        ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
diff --git a/paddle/fluid/operators/elementwise_add_op.cu b/paddle/fluid/operators/elementwise_add_op.cu
index f9f5c66d3..dfff518f1 100644
--- a/paddle/fluid/operators/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise_add_op.cu
@@ -30,5 +30,4 @@ REGISTER_OP_CUDA_KERNEL(
     ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
     ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
     ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>);
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise_div_op.cu b/paddle/fluid/operators/elementwise_div_op.cu
index 4cc7ba0f4..588d1f742 100644
--- a/paddle/fluid/operators/elementwise_div_op.cu
+++ b/paddle/fluid/operators/elementwise_div_op.cu
@@ -14,24 +14,19 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_div_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
     elementwise_div,
     ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
-                              plat::float16>);
+    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     elementwise_div_grad,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
     ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
-                                  plat::float16>);
+                                  int64_t>);
diff --git a/paddle/fluid/operators/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise_mul_op.cu
index 350d43168..2fb1b4bee 100644
--- a/paddle/fluid/operators/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise_mul_op.cu
@@ -14,25 +14,19 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_mul_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
     elementwise_mul,
     ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext,
-                              plat::float16>);
+    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     elementwise_mul_grad,
     ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
-                                  plat::float16>,
     ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
                                   int64_t>);
diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h
index 7223a972d..bc3e95e90 100644
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -350,7 +350,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
   int j = blockIdx.x;
   int i = threadIdx.x;
   int tid = threadIdx.x;
-  T val(0);
+  T val = 0;
 
   do {
     int x_offset = i * w + j;
@@ -418,7 +418,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
   int tid = threadIdx.x;
   int j = blockIdx.x;
 
-  T val(0);
+  T val = 0;
   int ttid = tid;
 
   while (true) {
diff --git a/paddle/fluid/operators/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise_sub_op.cu
index ff3f6f8a2..8709f686f 100644
--- a/paddle/fluid/operators/elementwise_sub_op.cu
+++ b/paddle/fluid/operators/elementwise_sub_op.cu
@@ -14,25 +14,19 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_sub_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
     elementwise_sub,
     ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
-                              plat::float16>);
+    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     elementwise_sub_grad,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
-                                  plat::float16>,
     ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
                                   int64_t>);
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 862249269..130f18dde 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -12,28 +12,48 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/fill_constant_op.h"
-#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
 
-class FillConstantOp : public framework::OperatorWithKernel {
+class FillConstantInferShape : public framework::InferShapeBase {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
+  void operator()(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of FillConstantOp should not be null.");
-    auto& shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
     ctx->SetOutputDim("Out", framework::make_ddim(shape));
   }
+};
+
+class FillConstantOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    auto data_type =
+        static_cast<framework::proto::VarType::Type>(Attr<int>("dtype"));
+    auto value = Attr<float>("value");
+    auto force_cpu = Attr<bool>("force_cpu");
+    auto &out =
+        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
+    if (force_cpu) {
+      auto cpu = platform::CPUPlace();
+      out.mutable_data(cpu, framework::ToTypeIndex(data_type));
+    } else {
+      out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
+    }
 
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
-        ctx.device_context());
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+    math::set_constant(dev_ctx, &out, value);
   }
 };
 
@@ -67,11 +87,6 @@ Fill up a variable with specified constant value.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker,
+REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
+                  ops::FillConstantInferShape, ops::FillConstantOpMaker,
                   paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    fill_constant,
-    ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, int64_t>)
diff --git a/paddle/fluid/operators/fill_constant_op.cu.cc b/paddle/fluid/operators/fill_constant_op.cu.cc
deleted file mode 100644
index 51ccaefa4..000000000
--- a/paddle/fluid/operators/fill_constant_op.cu.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/fill_constant_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    fill_constant,
-    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext,
-                              paddle::platform::float16>)
diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h
deleted file mode 100644
index b2a2a7b2f..000000000
--- a/paddle/fluid/operators/fill_constant_op.h
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class FillConstantOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto data_type =
-        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
-    auto value = ctx.Attr<float>("value");
-    auto force_cpu = ctx.Attr<bool>("force_cpu");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    out->Resize(framework::make_ddim(ctx.Attr<std::vector<int>>("shape")));
-    if (force_cpu) {
-      auto cpu = platform::CPUPlace();
-      out->mutable_data(cpu, framework::ToTypeIndex(data_type));
-    } else {
-      out->mutable_data(ctx.GetPlace(), framework::ToTypeIndex(data_type));
-    }
-
-    math::set_constant(ctx.template device_context<DeviceContext>(), out,
-                       value);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc
index 352a17c92..925dc1906 100644
--- a/paddle/fluid/operators/fill_op.cc
+++ b/paddle/fluid/operators/fill_op.cc
@@ -16,7 +16,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
@@ -70,6 +69,7 @@ class FillOp : public framework::OperatorBase {
 
     framework::VisitDataType(
         dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value")));
+
     if (!force_cpu && platform::is_gpu_place(place)) {
       // Copy tensor to out
       platform::DeviceContextPool &pool =
diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu
index b49072379..778485641 100644
--- a/paddle/fluid/operators/gaussian_random_op.cu
+++ b/paddle/fluid/operators/gaussian_random_op.cu
@@ -15,7 +15,6 @@ limitations under the License. */
 #include <thrust/transform.h>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
@@ -61,7 +60,6 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(gaussian_random,
                         paddle::operators::GPUGaussianRandomKernel<float>,
                         paddle::operators::GPUGaussianRandomKernel<double>);
diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu
index 58b85abf8..0de58d5fd 100644
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
@@ -15,25 +15,11 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
 namespace math {
 
-template <typename T>
-HOSTDEVICE T log(const T& val) {
-  return std::log(val);
-}
-
-template <>
-HOSTDEVICE platform::float16 log(const platform::float16& val) {
-  // strage bug, hlog is not exists.
-  return static_cast<float16>(0);
-  // half tmp = static_cast<half>(val);
-  // return static_cast<platform::float16>(hlog(tmp));
-}
-
 namespace {
 template <typename T>
 __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
@@ -49,12 +35,12 @@ template <typename T>
 __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
                                        const int class_num) {
   int tid = threadIdx.x;
-  T val(0);
+  T val = 0;
 
   int idx = blockIdx.x * class_num + tid;
   int end = blockIdx.x * class_num + class_num;
   for (; idx < end; idx += blockDim.x) {
-    val += math::TolerableValue<T>()(log(X[idx])) * label[idx];
+    val += math::TolerableValue<T>()(std::log(X[idx])) * label[idx];
   }
 
   val = paddle::platform::reduceSum(val, tid, blockDim.x);
@@ -98,8 +84,6 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
 
 template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
 template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
-template class CrossEntropyFunctor<platform::CUDADeviceContext,
-                                   platform::float16>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h
index 2e4e4781c..adc5b3fe4 100644
--- a/paddle/fluid/operators/math/cross_entropy.h
+++ b/paddle/fluid/operators/math/cross_entropy.h
@@ -13,10 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include <limits>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
@@ -35,21 +33,6 @@ struct TolerableValue {
   }
 };
 
-// float16 value clip behave different.
-using paddle::platform::float16;
-using paddle::platform::isfinite;
-template <>
-struct TolerableValue<float16> {
-  HOSTDEVICE float16 operator()(const float16& x) const {
-    if (isfinite(x))
-      return x;
-    else if (x > static_cast<float16>(0))
-      return std::numeric_limits<float16>::max();
-    else
-      return std::numeric_limits<float16>::min();
-  }
-};
-
 template <typename DeviceContext, typename T>
 class CrossEntropyFunctor {
  public:
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
index 00dbfc11a..a92762c7f 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
@@ -77,7 +76,6 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
 
 template struct SelectedRowsAdd<platform::CUDADeviceContext, float>;
 template struct SelectedRowsAdd<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
 
 namespace {
 template <typename T, int block_size>
@@ -122,7 +120,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
     auto* out_data = output->data<T>();
 
     SetConstant<platform::CUDADeviceContext, T> functor;
-    functor(context, output, static_cast<T>(0));
+    functor(context, output, 0.0);
 
     const int block_size = 256;
     dim3 threads(block_size, 1);
@@ -140,8 +138,6 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
 
 template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
 template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
-template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
-                                      platform::float16>;
 
 template <typename T>
 struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
@@ -181,8 +177,6 @@ template struct SelectedRowsAddTo<platform::CUDADeviceContext, float>;
 template struct SelectedRowsAddTo<platform::CUDADeviceContext, double>;
 template struct SelectedRowsAddTo<platform::CUDADeviceContext, int>;
 template struct SelectedRowsAddTo<platform::CUDADeviceContext, int64_t>;
-template struct SelectedRowsAddTo<platform::CUDADeviceContext,
-                                  platform::float16>;
 
 namespace {
 template <typename T, int block_size>
@@ -235,8 +229,6 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
-template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
-                                        platform::float16>;
 
 namespace scatter {
 
@@ -284,7 +276,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
         context.GetPlace());
 
     math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
-    constant_functor(context, out.mutable_value(), static_cast<T>(0));
+    constant_functor(context, out.mutable_value(), 0.0);
 
     auto* out_data = out.mutable_value()->data<T>();
     auto* input_data = input.value().data<T>();
@@ -308,7 +300,6 @@ template struct MergeAdd<platform::CUDADeviceContext, float>;
 template struct MergeAdd<platform::CUDADeviceContext, double>;
 template struct MergeAdd<platform::CUDADeviceContext, int>;
 template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
-template struct MergeAdd<platform::CUDADeviceContext, platform::float16>;
 
 template <typename T, int block_size>
 __global__ void UpdateToTensorKernel(const T* selected_rows,
diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu
index 785c4baec..3effe7762 100644
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -94,15 +94,12 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
 template class SoftmaxCUDNNFunctor<platform::float16>;
 template class SoftmaxCUDNNFunctor<float>;
 template class SoftmaxCUDNNFunctor<double>;
-template class SoftmaxGradCUDNNFunctor<platform::float16>;
 template class SoftmaxGradCUDNNFunctor<float>;
 template class SoftmaxGradCUDNNFunctor<double>;
 
 template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
-template class SoftmaxGradFunctor<platform::CUDADeviceContext,
-                                  platform::float16>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
 
diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
index 07aa23754..91e0ab28e 100644
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -12,16 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#define EIGEN_USE_GPU
+
 #include "paddle/fluid/operators/mean_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     mean, ops::MeanKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MeanKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MeanKernel<paddle::platform::CUDADeviceContext, plat::float16>);
+    ops::MeanKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     mean_grad, ops::MeanGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
+    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index a41d50ae0..362e9f9ae 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -55,7 +55,7 @@ class MeanGradKernel : public framework::OpKernel<T> {
     IG->mutable_data<T>(context.GetPlace());
 
     T ig_size = static_cast<T>(IG->numel());
-    Eigen::DSizes<int, 1> bcast(static_cast<int>(ig_size));
+    Eigen::DSizes<int, 1> bcast(ig_size);
 
     EigenVector<T>::Flatten(*IG).device(
         *context.template device_context<DeviceContext>().eigen_device()) =
diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc
index 6c5a83c6a..81f3e42bf 100644
--- a/paddle/fluid/operators/mul_op.cu.cc
+++ b/paddle/fluid/operators/mul_op.cu.cc
@@ -20,7 +20,6 @@ namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
                         ops::MulKernel<plat::CUDADeviceContext, double>,
                         ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    mul_grad, ops::MulGradKernel<plat::CUDADeviceContext, float>,
-    ops::MulGradKernel<plat::CUDADeviceContext, double>,
-    ops::MulGradKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(mul_grad,
+                        ops::MulGradKernel<plat::CUDADeviceContext, float>,
+                        ops::MulGradKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
index 9fdbee818..31f083565 100644
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
@@ -174,8 +174,7 @@ REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNGradOpKernel<float>,
-                   ops::PoolCUDNNGradOpKernel<double>,
-                   ops::PoolCUDNNGradOpKernel<plat::float16>);
+                   ops::PoolCUDNNGradOpKernel<double>);
 
 REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNOpKernel<float>,
@@ -183,5 +182,4 @@ REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNGradOpKernel<float>,
-                   ops::PoolCUDNNGradOpKernel<double>,
-                   ops::PoolCUDNNGradOpKernel<plat::float16>);
+                   ops::PoolCUDNNGradOpKernel<double>);
diff --git a/paddle/fluid/operators/scale_op.cu b/paddle/fluid/operators/scale_op.cu
index d26686704..04c802da1 100644
--- a/paddle/fluid/operators/scale_op.cu
+++ b/paddle/fluid/operators/scale_op.cu
@@ -13,15 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/scale_op.h"
-#include "paddle/fluid/platform/float16.h"
 
-namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     scale,
     paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, float>,
     paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, double>,
     paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int>,
     paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   int64_t>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   plat::float16>);
+                                   int64_t>);
diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
index c2d45c3d2..2bdb23e99 100644
--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
@@ -78,5 +78,4 @@ REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
                    ops::SoftmaxCUDNNKernel<float>,
                    ops::SoftmaxCUDNNKernel<plat::float16>);
 REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
-                   ops::SoftmaxGradCUDNNKernel<float>,
-                   ops::SoftmaxGradCUDNNKernel<plat::float16>);
+                   ops::SoftmaxGradCUDNNKernel<float>);
diff --git a/paddle/fluid/operators/softmax_op.cu.cc b/paddle/fluid/operators/softmax_op.cu.cc
index 19359b7ee..5fb4f011d 100644
--- a/paddle/fluid/operators/softmax_op.cu.cc
+++ b/paddle/fluid/operators/softmax_op.cu.cc
@@ -23,5 +23,4 @@ REGISTER_OP_CUDA_KERNEL(
     ops::SoftmaxKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
     softmax_grad, ops::SoftmaxGradKernel<plat::CUDADeviceContext, float>,
-    ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>,
-    ops::SoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>);
+    ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu
index db4c2d6c1..89bcd1bbc 100644
--- a/paddle/fluid/operators/sum_op.cu
+++ b/paddle/fluid/operators/sum_op.cu
@@ -11,13 +11,10 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/sum_op.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
     sum, ops::SumKernel<paddle::platform::CUDADeviceContext, float>,
     ops::SumKernel<paddle::platform::CUDADeviceContext, double>,
     ops::SumKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::SumKernel<paddle::platform::CUDADeviceContext, plat::float16>);
+    ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h
index dda677279..49a4afb3a 100644
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -46,7 +46,7 @@ class SumKernel : public framework::OpKernel<T> {
       if (!in_place) {
         math::SetConstant<DeviceContext, T> constant_functor;
         constant_functor(context.template device_context<DeviceContext>(), out,
-                         static_cast<T>(0));
+                         0.0);
       }
 
       math::SelectedRowsAddToTensor<DeviceContext, T> functor;
diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu
index 5fc0784f6..9da8551eb 100644
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
@@ -11,19 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <limits>
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/top_k_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-using paddle::platform::float16;
 
 template <typename T>
 struct Pair {
@@ -35,11 +32,6 @@ struct Pair {
     id = id;
   }
 
-  __device__ __forceinline__ void clear() {
-    v = -INFINITY;
-    id = -1;
-  }
-
   __device__ __forceinline__ void operator=(const Pair<T>& in) {
     v = in.v;
     id = in.id;
@@ -61,12 +53,6 @@ struct Pair {
   int64_t id;
 };
 
-template <>
-__device__ __forceinline__ void Pair<float16>::clear() {
-  v = platform::raw_uint16_to_float16(0x400);
-  id = -1;
-}
-
 template <typename T>
 __device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
                                       int beam_size) {
@@ -164,7 +150,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
         if (k < MaxLength - (*beam)) {
           topk[k] = topk[k + *beam];
         } else {
-          topk[k].clear();
+          topk[k].set(-INFINITY, -1);
         }
       }
       if (!(*is_empty)) {
@@ -174,7 +160,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
     }
 
     *max = topk[MaxLength - 1];
-    if ((*max).v == static_cast<T>(-1)) *is_empty = true;
+    if ((*max).v == -1) *is_empty = true;
     *beam = 0;
   }
 }
@@ -195,7 +181,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
         if (k < MaxLength - *beam) {
           topk[k] = topk[k + *beam];
         } else {
-          topk[k].set(std::numeric_limits<T>::min(), -1);
+          topk[k].set(-INFINITY, -1);
         }
       }
       if (!(*is_empty)) {
@@ -287,7 +273,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
   bool firststep = true;
 
   for (int k = 0; k < MaxLength; k++) {
-    topk[k].clear();
+    topk[k].set(-INFINITY, -1);
   }
   while (k) {
     ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
@@ -339,7 +325,5 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_CUDA_KERNEL(
-    top_k, paddle::operators::TopkOpCUDAKernel<float>,
-    paddle::operators::TopkOpCUDAKernel<double>,
-    paddle::operators::TopkOpCUDAKernel<paddle::platform::float16>);
+REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>,
+                        paddle::operators::TopkOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index 2b8039a0c..e1c7323a3 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -11,14 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <glog/logging.h>
 #include <thrust/random.h>
 #include <thrust/transform.h>
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
 namespace operators {
@@ -40,11 +36,6 @@ struct UniformGenerator {
   }
 };
 
-template <typename T, typename V>
-struct CastFunctor {
-  HOSTDEVICE V operator()(const T& a) { return static_cast<V>(a); }
-};
-
 // It seems that Eigen::Tensor::random in GPU will SEGFAULT.
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
@@ -75,50 +66,18 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
     T max = static_cast<T>(context.Attr<float>("max"));
     thrust::counting_iterator<unsigned int> index_sequence_begin(0);
     int64_t size = tensor->numel();
-    if (out_var->IsType<framework::LoDTensor>() &&
-        std::type_index(typeid(T)) ==
-            std::type_index(typeid(platform::float16))) {
-      framework::Tensor master_copy_tensor;
-      master_copy_tensor.Resize(tensor->dims());
-      float* master_copy_tensor_data =
-          master_copy_tensor.mutable_data<float>(context.GetPlace());
-      thrust::transform(index_sequence_begin, index_sequence_begin + size,
-                        thrust::device_ptr<float>(master_copy_tensor_data),
-                        UniformGenerator<float>(static_cast<float>(min),
-                                                static_cast<float>(max), seed));
-      platform::Transform<platform::CUDADeviceContext> trans;
-      auto* in_begin = master_copy_tensor.data<float>();
-      auto* in_end = in_begin + master_copy_tensor.numel();
-      auto* out_begin = tensor->mutable_data<T>(context.GetPlace());
-      trans(context.template device_context<platform::CUDADeviceContext>(),
-            in_begin, in_end, out_begin, CastFunctor<float, T>());
-    } else {
-      thrust::transform(index_sequence_begin, index_sequence_begin + size,
-                        thrust::device_ptr<T>(data),
-                        UniformGenerator<T>(min, max, seed));
-    }
-    if (VLOG_IS_ON(5)) {
-      framework::Tensor cpu_tensor;
-      framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
-      auto& dev_ctx =
-          *platform::DeviceContextPool::Instance().Get(context.GetPlace());
-      dev_ctx.Wait();
-      auto x = framework::EigenVector<T>::Flatten(cpu_tensor);
-      VLOG(5) << "The Uniform output " << x;
-    }
+    thrust::transform(index_sequence_begin, index_sequence_begin + size,
+                      thrust::device_ptr<T>(data),
+                      UniformGenerator<T>(min, max, seed));
   }
 };
 
 }  // namespace operators
 }  // namespace paddle
 
-namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    uniform_random, paddle::operators::GPUUniformRandomKernel<float>,
-    paddle::operators::GPUUniformRandomKernel<double>,
-    paddle::operators::GPUUniformRandomKernel<plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    uniform_random_batch_size_like,
-    paddle::operators::GPUUniformRandomKernel<float>,
-    paddle::operators::GPUUniformRandomKernel<double>,
-    paddle::operators::GPUUniformRandomKernel<plat::float16>);
+REGISTER_OP_CUDA_KERNEL(uniform_random,
+                        paddle::operators::GPUUniformRandomKernel<float>,
+                        paddle::operators::GPUUniformRandomKernel<double>);
+REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like,
+                        paddle::operators::GPUUniformRandomKernel<float>,
+                        paddle::operators::GPUUniformRandomKernel<double>);
-- 
GitLab