Merge remote-tracking branch 'ups/develop' into feature/op/fusion_lstm

a72f68f2 · tensor-tang · df28a3b4 · 17da113c · a72f68f2 · a72f68f2
51 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -204,11 +204,12 @@ include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)
-set(WITH_ANAKIN OFF CACHE STRING "Disable Anakin first, will add it later." FORCE)
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
    include(external/anakin)
+else()
+    # Anakin is GPU-only: switch it off for every non-GPU configuration.
+    # (A bare `elseif()` has no condition and is never taken.)
+    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
 endif()
 include(cudnn)              # set cudnn libraries, must before configure

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -104,6 +104,10 @@ if(WITH_GPU)
        if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
            message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile")
        endif()
+        set(ENV{CUDNN_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR})
+        set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY})
+        message(STATUS "cudnn include header is ${CUDNN_INCLUDE_DIR}/cudnn.h")
+        message(STATUS "cudnn library is ${CUDNN_LIBRARY}")
    endif()
 elseif(WITH_AMD_GPU)
    add_definitions(-DPADDLE_WITH_HIP)

--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -35,9 +35,8 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
 ExternalProject_Add(
    extern_anakin
    ${EXTERNAL_PROJECT_LOG_ARGS}
-    # TODO(luotao): use PaddlePaddle/Anakin later
+    GIT_REPOSITORY      "https://github.com/PaddlePaddle/Anakin"
-    GIT_REPOSITORY      "https://github.com/luotao1/Anakin"
+    GIT_TAG             "04256ba78fa3da0beb74e8036c8efd68c12824d6"
-    GIT_TAG             "3957ae9263eaa0b1986758dac60a88852afb09be"
    PREFIX              ${ANAKIN_SOURCE_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DUSE_GPU_PLACE=YES

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -155,10 +155,11 @@ paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale',
 paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))

--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@@ -20,6 +20,9 @@
 DEFINE_int32(io_threadpool_size, 100,
             "number of threads used for doing IO, default 100");
+DEFINE_int32(dist_threadpool_size, 0,
+             "number of threads used for distributed executed.");
 namespace paddle {
 namespace framework {
@@ -35,6 +38,10 @@ void ThreadPool::Init() {
  if (threadpool_.get() == nullptr) {
    // TODO(Yancey1989): specify the max threads number
    int num_threads = std::thread::hardware_concurrency();
+    if (FLAGS_dist_threadpool_size > 0) {
+      num_threads = FLAGS_dist_threadpool_size;
+      VLOG(1) << "set dist_threadpool_size to " << num_threads;
+    }
    PADDLE_ENFORCE_GT(num_threads, 0);
    threadpool_.reset(new ThreadPool(num_threads));
  }

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -60,7 +60,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
 inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
 endif()
-if (WITH_ANAKIN) # only needed in CI
+if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
    # compile the libinference_anakin_api.a and anakin.so.
    nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
    #nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)

--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -26,6 +26,8 @@ namespace plat = paddle::platform;
      act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,   \
                                                 ops::grad_functor<float>>, \
      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
-                                ops::grad_functor<double>>);
+                                ops::grad_functor<double>>,                 \
+      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
+                                ops::grad_functor<plat::float16>>);
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -333,8 +333,7 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    const Out out_conj = Eigen::numext::conj(out);
+    dx.device(d) = static_cast<T>(0.5) * dout / out;
-    dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
  }
 };
@@ -740,7 +739,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
    dx.device(d) = dout * static_cast<T>(factor) *
-                   x.pow(static_cast<T>(factor - static_cast<T>(1)));
+                   x.pow(static_cast<T>(factor) - static_cast<T>(1));
  }
 };
@@ -863,10 +862,11 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    T b = static_cast<T>(beta);
    auto temp1 = static_cast<T>(1) /
-                 (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
+                 (static_cast<T>(1) + (static_cast<T>(-b) * x).exp());
-    auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
+    auto temp2 = temp1 * (static_cast<T>(1) - (b * out));
-    dx.device(d) = dout * ((beta * out) + temp2);
+    dx.device(d) = dout * ((b * out) + temp2);
  }
 };

--- a/paddle/fluid/operators/assign_value_op.cu.cc
+++ b/paddle/fluid/operators/assign_value_op.cu.cc
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/assign_value_op.h"
+#include "paddle/fluid/platform/float16.h"
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>,
-                        ops::AssignValueKernel<float>);
+                        ops::AssignValueKernel<float>,
+                        ops::AssignValueKernel<plat::float16>);
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -39,6 +39,27 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
    static_cast<size_t>(1024) * 1024 * 1024;
+// Selects the cuDNN math type for `cudnn_conv_desc` and reports whether the
+// tensor-op (float16) path was chosen.
+//
+// When built with CUDA >= 9.0 and cuDNN >= 7.0.1:
+//   * compute capability >= 70 and T == platform::float16:
+//       sets CUDNN_TENSOR_OP_MATH on the descriptor and returns true;
+//   * otherwise: sets CUDNN_DEFAULT_MATH on the descriptor and returns false.
+// With older CUDA/cuDNN the descriptor is left untouched and false is returned.
+//
+// T must be given explicitly at the call site (EnableFp16<T>(...)); it is not
+// deducible from the arguments.
+template <typename T, typename DeviceContext>
+bool EnableFp16(const DeviceContext& dev_ctx,
+                cudnnConvolutionDescriptor_t cudnn_conv_desc) {
+#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
+  // Tensor core is supported since the volta GPU and
+  // is only enabled when input and filter data are float16
+  if (dev_ctx.GetComputeCapability() >= 70 &&
+      std::type_index(typeid(T)) ==
+          std::type_index(typeid(platform::float16))) {
+    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+        cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
+    return true;
+  } else {
+    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+        cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+  }
+#endif
+  return false;
+}
 template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
 public:
@@ -128,27 +149,14 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
    cudnnConvolutionFwdAlgo_t algo;
    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto handle = dev_ctx.cudnn_handle();
+    if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
-    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-        cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-        workspace_size_limit, &algo));
-#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
-    // Tensor core is supported since the volta GPU and
-    // is only enabled when input and filter data are float16
-    if (dev_ctx.GetComputeCapability() >= 70 &&
-        std::type_index(typeid(T)) ==
-            std::type_index(typeid(platform::float16))) {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
-          cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
-      // Currently tensor core is only enabled using this algo
      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
    } else {
-      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+      PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-          cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+          handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+          cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+          workspace_size_limit, &algo));
    }
-#endif
    // get workspace size able to allocate
    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
@@ -288,6 +296,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
      } else {
        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
      }
+      if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
+        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+      }
      CUDNN_ENFORCE(
          platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
@@ -307,6 +318,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
      } else {
        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
      }
+      if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
+        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
+      }
      CUDNN_ENFORCE(
          platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
@@ -362,7 +376,8 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>);
+                   paddle::operators::CUDNNConvGradOpKernel<double>,
+                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvOpKernel<float>,
@@ -370,4 +385,5 @@ REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvGradOpKernel<float>,
-                   paddle::operators::CUDNNConvGradOpKernel<double>);
+                   paddle::operators::CUDNNConvGradOpKernel<double>,
+                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>)
--- a/paddle/fluid/operators/cross_entropy_op.cu
+++ b/paddle/fluid/operators/cross_entropy_op.cu
@@ -13,12 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/cross_entropy_op.h"
+#include "paddle/fluid/platform/float16.h"
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 using CUDACtx = paddle::platform::CUDADeviceContext;
 REGISTER_OP_CUDA_KERNEL(cross_entropy,
                        ops::CrossEntropyOpKernel<CUDACtx, float>,
-                        ops::CrossEntropyOpKernel<CUDACtx, double>);
+                        ops::CrossEntropyOpKernel<CUDACtx, double>,
-REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
+                        ops::CrossEntropyOpKernel<CUDACtx, plat::float16>);
-                        ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
+REGISTER_OP_CUDA_KERNEL(
-                        ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
+    cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
+    ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
+    ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -190,12 +190,15 @@ bool VariableResponse::ProcSerializedField(
 #endif
  }
+  VLOG(7) << "ProcSerializedField:" << meta_.varname()
+          << ", type:" << meta_.type() << std::endl;
  framework::DDim dims = GetDims(meta_.dims());
  if (meta_.type() == sendrecv::LOD_TENSOR) {
    PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!");
    if (!CopyLodTensorData(input, *dev_ctx_, dims, num_bytes)) {
      return false;
    }
    return true;
  }
@@ -206,7 +209,9 @@ bool VariableResponse::ProcSerializedField(
    return true;
  }
-  return true;
+  // Reaching this point means meta_.type() matched none of the handled
+  // variable types. Raise unconditionally: a string literal used as the
+  // PADDLE_ENFORCE condition is always true and would never fire.
+  PADDLE_THROW("not supported var types: %s, %d", meta_.varname(),
+               static_cast<int>(meta_.type()));
+  return false;
 }
 };  // namespace distributed

--- a/paddle/fluid/operators/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise_add_op.cu
@@ -30,4 +30,5 @@ REGISTER_OP_CUDA_KERNEL(
    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>,
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>);
--- a/paddle/fluid/operators/elementwise_div_op.cu
+++ b/paddle/fluid/operators/elementwise_div_op.cu
@@ -14,19 +14,24 @@ limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_div_op.h"
+#include "paddle/fluid/platform/float16.h"
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
    elementwise_div,
    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
+                              plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
    elementwise_div_grad,
    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
-                                  int64_t>);
+                                  plat::float16>);
--- a/paddle/fluid/operators/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise_mul_op.cu
@@ -14,19 +14,25 @@ limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_mul_op.h"
+#include "paddle/fluid/platform/float16.h"
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
    elementwise_mul,
    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext,
+                              plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
    elementwise_mul_grad,
    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
+                                  plat::float16>,
    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
                                  int64_t>);
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -350,7 +350,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
  int j = blockIdx.x;
  int i = threadIdx.x;
  int tid = threadIdx.x;
-  T val = 0;
+  T val(0);
  do {
    int x_offset = i * w + j;
@@ -418,7 +418,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
  int tid = threadIdx.x;
  int j = blockIdx.x;
-  T val = 0;
+  T val(0);
  int ttid = tid;
  while (true) {

--- a/paddle/fluid/operators/elementwise_sub_op.cu
+++ b/paddle/fluid/operators/elementwise_sub_op.cu
@@ -14,19 +14,25 @@ limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_sub_op.h"
+#include "paddle/fluid/platform/float16.h"
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
    elementwise_sub,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
+                              plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
    elementwise_sub_grad,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
+                                  plat::float16>,
    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
                                  int64_t>);
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -12,48 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/operators/fill_constant_op.h"
-#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/device_context.h"
 namespace paddle {
 namespace operators {
-class FillConstantInferShape : public framework::InferShapeBase {
+class FillConstantOp : public framework::OperatorWithKernel {
 public:
-  void operator()(framework::InferShapeContext *ctx) const override {
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of FillConstantOp should not be null.");
-    auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    auto& shape = ctx->Attrs().Get<std::vector<int>>("shape");
    ctx->SetOutputDim("Out", framework::make_ddim(shape));
  }
-};
-class FillConstantOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    auto data_type =
-        static_cast<framework::proto::VarType::Type>(Attr<int>("dtype"));
-    auto value = Attr<float>("value");
-    auto force_cpu = Attr<bool>("force_cpu");
-    auto &out =
-        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
-    out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
-    if (force_cpu) {
-      auto cpu = platform::CPUPlace();
-      out.mutable_data(cpu, framework::ToTypeIndex(data_type));
-    } else {
-      out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
-    }
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  framework::OpKernelType GetExpectedKernelType(
-    auto &dev_ctx = *pool.Get(dev_place);
+      const framework::ExecutionContext& ctx) const override {
-    math::set_constant(dev_ctx, &out, value);
+    return framework::OpKernelType(
+        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
+        ctx.device_context());
  }
 };
@@ -87,6 +67,11 @@ Fill up a variable with specified constant value.
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
+REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker,
-                  ops::FillConstantInferShape, ops::FillConstantOpMaker,
                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    fill_constant,
+    ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, int64_t>)
--- a/paddle/fluid/operators/fill_constant_op.cu.cc
+++ b/paddle/fluid/operators/fill_constant_op.cu.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/operators/fill_constant_op.h"
+#include "paddle/fluid/platform/float16.h"
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    fill_constant,
+    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext,
+                              paddle::platform::float16>)
--- a/paddle/fluid/operators/fill_constant_op.h
+++ b/paddle/fluid/operators/fill_constant_op.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+namespace paddle {
+namespace operators {
+// Kernel for the fill_constant op: resizes the "Out" tensor to the "shape"
+// attribute, allocates it with the element type named by the "dtype"
+// attribute, and fills it with the "value" attribute.
+//
+// Attributes read: "dtype" (int), "value" (float), "force_cpu" (bool),
+// "shape" (std::vector<int>).
+template <typename DeviceContext, typename T>
+class FillConstantOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto data_type =
+        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
+    auto value = ctx.Attr<float>("value");
+    auto force_cpu = ctx.Attr<bool>("force_cpu");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    out->Resize(framework::make_ddim(ctx.Attr<std::vector<int>>("shape")));
+    // force_cpu places the buffer on the CPU regardless of the place this
+    // kernel runs on; otherwise the buffer lives on the execution place.
+    if (force_cpu) {
+      auto cpu = platform::CPUPlace();
+      out->mutable_data(cpu, framework::ToTypeIndex(data_type));
+    } else {
+      out->mutable_data(ctx.GetPlace(), framework::ToTypeIndex(data_type));
+    }
+    // The fill is issued through this kernel's DeviceContext in both cases.
+    math::set_constant(ctx.template device_context<DeviceContext>(), out,
+                       value);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/fill_op.cc
+++ b/paddle/fluid/operators/fill_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/float16.h"
 namespace paddle {
 namespace operators {
@@ -69,7 +70,6 @@ class FillOp : public framework::OperatorBase {
    framework::VisitDataType(
        dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value")));
    if (!force_cpu && platform::is_gpu_place(place)) {
      // Copy tensor to out
      platform::DeviceContextPool &pool =

--- a/paddle/fluid/operators/gaussian_random_op.cu
+++ b/paddle/fluid/operators/gaussian_random_op.cu
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <thrust/transform.h>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/float16.h"
 namespace paddle {
 namespace operators {
@@ -60,6 +61,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(gaussian_random,
                        paddle::operators::GPUGaussianRandomKernel<float>,
                        paddle::operators::GPUGaussianRandomKernel<double>);

--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -123,8 +123,11 @@ void ListenAndServOp::RunSyncLoop(
      optimize_prepared.begin(),
      std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
+  // Trainers will get all parameters from pserver in the
+  // startup program, so we will wait RequestGet first
+  rpc_service_->SetCond(distributed::kRequestGet);
+  rpc_service_->WaitBarrier(distributed::kRequestGet);
  rpc_service_->ResetBarrierCounter();
  while (true) {
    rpc_service_->Profiler().OneStep();
    // Get from multiple trainers, we don't care about the order in which

--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
@@ -15,11 +15,25 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/float16.h"
 namespace paddle {
 namespace operators {
 namespace math {
+template <typename T>
+HOSTDEVICE T log(const T& val) {
+  return std::log(val);
+}
+// float16 specialization of log(): widens the operand to float, takes the
+// natural logarithm there, and narrows the result back to float16. This
+// avoids depending on a half-precision log intrinsic.
+template <>
+HOSTDEVICE platform::float16 log(const platform::float16& val) {
+  return static_cast<platform::float16>(std::log(static_cast<float>(val)));
+}
 namespace {
 template <typename T>
 __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
@@ -35,12 +49,12 @@ template <typename T>
 __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
                                       const int class_num) {
  int tid = threadIdx.x;
-  T val = 0;
+  T val(0);
  int idx = blockIdx.x * class_num + tid;
  int end = blockIdx.x * class_num + class_num;
  for (; idx < end; idx += blockDim.x) {
-    val += math::TolerableValue<T>()(std::log(X[idx])) * label[idx];
+    val += math::TolerableValue<T>()(log(X[idx])) * label[idx];
  }
  val = paddle::platform::reduceSum(val, tid, blockDim.x);
@@ -84,6 +98,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
 template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
 template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
+template class CrossEntropyFunctor<platform::CUDADeviceContext,
+                                   platform::float16>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/math/cross_entropy.h
+++ b/paddle/fluid/operators/math/cross_entropy.h
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <limits>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/hostdevice.h"
 namespace paddle {
@@ -33,6 +35,21 @@ struct TolerableValue {
  }
 };
+// float16 values are clipped differently, so TolerableValue is specialized.
+using paddle::platform::float16;
+using paddle::platform::isfinite;
+// Clamps a float16 to the finite float16 range:
+//   finite x              -> x (unchanged)
+//   non-finite and x > 0  -> largest finite float16
+//   anything else         -> most negative finite float16
+// numeric_limits<>::min() is the smallest *positive* normal value, so the
+// negative bound must come from lowest().
+template <>
+struct TolerableValue<float16> {
+  HOSTDEVICE float16 operator()(const float16& x) const {
+    if (isfinite(x))
+      return x;
+    else if (x > static_cast<float16>(0))
+      return std::numeric_limits<float16>::max();
+    else
+      return std::numeric_limits<float16>::lowest();
+  }
+};
 template <typename DeviceContext, typename T>
 class CrossEntropyFunctor {
 public:

--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/float16.h"
 namespace paddle {
 namespace operators {
@@ -76,6 +77,7 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
 template struct SelectedRowsAdd<platform::CUDADeviceContext, float>;
 template struct SelectedRowsAdd<platform::CUDADeviceContext, double>;
+template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
 namespace {
 template <typename T, int block_size>
@@ -120,7 +122,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
    auto* out_data = output->data<T>();
    SetConstant<platform::CUDADeviceContext, T> functor;
-    functor(context, output, 0.0);
+    functor(context, output, static_cast<T>(0));
    const int block_size = 256;
    dim3 threads(block_size, 1);
@@ -138,6 +140,8 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
 template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
 template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
+template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
+                                      platform::float16>;
 template <typename T>
 struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
@@ -177,6 +181,8 @@ template struct SelectedRowsAddTo<platform::CUDADeviceContext, float>;
 template struct SelectedRowsAddTo<platform::CUDADeviceContext, double>;
 template struct SelectedRowsAddTo<platform::CUDADeviceContext, int>;
 template struct SelectedRowsAddTo<platform::CUDADeviceContext, int64_t>;
+template struct SelectedRowsAddTo<platform::CUDADeviceContext,
+                                  platform::float16>;
 namespace {
 template <typename T, int block_size>
@@ -229,6 +235,8 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
 template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
+template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
+                                        platform::float16>;
 namespace scatter {
@@ -276,7 +284,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
        context.GetPlace());
    math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
-    constant_functor(context, out.mutable_value(), 0.0);
+    constant_functor(context, out.mutable_value(), static_cast<T>(0));
    auto* out_data = out.mutable_value()->data<T>();
    auto* input_data = input.value().data<T>();
@@ -300,6 +308,7 @@ template struct MergeAdd<platform::CUDADeviceContext, float>;
 template struct MergeAdd<platform::CUDADeviceContext, double>;
 template struct MergeAdd<platform::CUDADeviceContext, int>;
 template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
+template struct MergeAdd<platform::CUDADeviceContext, platform::float16>;
 template <typename T, int block_size>
 __global__ void UpdateToTensorKernel(const T* selected_rows,

--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -94,12 +94,15 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
 template class SoftmaxCUDNNFunctor<platform::float16>;
 template class SoftmaxCUDNNFunctor<float>;
 template class SoftmaxCUDNNFunctor<double>;
+template class SoftmaxGradCUDNNFunctor<platform::float16>;
 template class SoftmaxGradCUDNNFunctor<float>;
 template class SoftmaxGradCUDNNFunctor<double>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
+template class SoftmaxGradFunctor<platform::CUDADeviceContext,
+                                  platform::float16>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;

--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/mean_op.h"
+#include "paddle/fluid/platform/float16.h"
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
    mean, ops::MeanKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MeanKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::MeanKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MeanKernel<paddle::platform::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
    mean_grad, ops::MeanGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -55,7 +55,7 @@ class MeanGradKernel : public framework::OpKernel<T> {
    IG->mutable_data<T>(context.GetPlace());
    T ig_size = static_cast<T>(IG->numel());
-    Eigen::DSizes<int, 1> bcast(ig_size);
+    // Use numel() directly; ig_size is a T and can be inexact for float16.
+    Eigen::DSizes<int, 1> bcast(static_cast<int>(IG->numel()));
    EigenVector<T>::Flatten(*IG).device(
        *context.template device_context<DeviceContext>().eigen_device()) =

--- a/paddle/fluid/operators/mul_op.cu.cc
+++ b/paddle/fluid/operators/mul_op.cu.cc
@@ -20,6 +20,7 @@ namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
                        ops::MulKernel<plat::CUDADeviceContext, double>,
                        ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(mul_grad,
+REGISTER_OP_CUDA_KERNEL(
-                        ops::MulGradKernel<plat::CUDADeviceContext, float>,
+    mul_grad, ops::MulGradKernel<plat::CUDADeviceContext, float>,
-                        ops::MulGradKernel<plat::CUDADeviceContext, double>);
+    ops::MulGradKernel<plat::CUDADeviceContext, double>,
+    ops::MulGradKernel<plat::CUDADeviceContext, plat::float16>);
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
@@ -174,7 +174,8 @@ REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
                   ops::PoolCUDNNOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace,
                   ops::PoolCUDNNGradOpKernel<float>,
-                   ops::PoolCUDNNGradOpKernel<double>);
+                   ops::PoolCUDNNGradOpKernel<double>,
+                   ops::PoolCUDNNGradOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
                   ops::PoolCUDNNOpKernel<float>,
@@ -182,4 +183,5 @@ REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
                   ops::PoolCUDNNOpKernel<plat::float16>);
 REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace,
                   ops::PoolCUDNNGradOpKernel<float>,
-                   ops::PoolCUDNNGradOpKernel<double>);
+                   ops::PoolCUDNNGradOpKernel<double>,
+                   ops::PoolCUDNNGradOpKernel<plat::float16>);
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -26,14 +23,40 @@ class PReluOp : public framework::OperatorWithKernel {
      : OperatorWithKernel(type, inputs, outputs, attrs) {}
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasInput("Alpha"), "Input(Alpha) should not be null");
-    PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1,
-                   "Size of weight Alpha must be one.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    // Read the attribute and the dims of X only after the presence checks
+    // above, so a missing input is reported by those checks first.
+    std::string mode = ctx->Attrs().Get<std::string>("mode");
+    auto x_dim = ctx->GetInputDim("X");
+    if (mode == "all") {
+      PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1,
+                     "For mode 'all', size of weight Alpha must be one.");
+    } else if (mode == "channel") {
+      PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == x_dim[1],
+                     "For channel-wise mode, size of weight Alpha must be "
+                     "equal to the number of channels, should be %d",
+                     x_dim[1]);
+    } else if (mode == "element") {
+      PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == product(x_dim),
+                     "For element-wise mode, size of weight Alpha must be "
+                     "equal to the number of input, should be %d",
+                     product(x_dim));
+    } else {
+      PADDLE_THROW("Unkown mode %s", mode);
+    }
+    ctx->SetOutputDim("Out", x_dim);
    ctx->ShareLoD("X", /*->*/ "Out");
  }
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        platform::CPUPlace());
+  }
 };
 class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -44,9 +67,7 @@ class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out", "The output tensor of prelu operator.");
    AddComment(R"DOC(
 PRelu Operator.
 The equation is:
 $$
 f(x) =
 \begin{cases}
@@ -54,11 +75,15 @@ f(x) =
 x,         \qquad  \text{if} \ x >= 0
 \end{cases}
 $$
 The input `X` can carry the LoD (Level of Details) information,
 or not. And the output shares the LoD information with input `X`.
+There are modes: 
+  all: all elements share same weight
+  channel: elements in a channel share same weight
+  element: each element has a weight 
 )DOC");
+    AddAttr<std::string>("mode", "The mode for inputs to share weights.")
+        .SetDefault("all");
  }
 };
@@ -71,9 +96,23 @@ class PReluGradOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    auto x_grad_name = framework::GradVarName("X");
-    ctx->SetOutputDim(framework::GradVarName("Alpha"),
+    auto alpha_grad_name = framework::GradVarName("Alpha");
-                      ctx->GetInputDim("Alpha"));
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
+    }
+    if (ctx->HasOutput(alpha_grad_name)) {
+      ctx->SetOutputDim(alpha_grad_name, ctx->GetInputDim("Alpha"));
+    }
+  }
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        platform::CPUPlace());
  }
 };

--- a/paddle/fluid/operators/prelu_op.cu
+++ b/paddle/fluid/operators/prelu_op.cu
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/prelu_op.h"
-REGISTER_OP_CUDA_KERNEL(
-    prelu,
-    paddle::operators::PReluKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(prelu_grad,
-                        paddle::operators::PReluGradKernel<
-                            paddle::platform::CUDADeviceContext, float>);
--- a/paddle/fluid/operators/prelu_op.h
+++ b/paddle/fluid/operators/prelu_op.h
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,32 +10,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/transform.h"
 namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
 using platform::Transform;
-template <typename T>
-class PReluFunctor {
- public:
-  explicit PReluFunctor(const T* alpha) : alpha_(alpha) {}
-  HOSTDEVICE T operator()(const T& x) const {
-    if (x > 0)
-      return x;
-    else
-      return x * (*alpha_);
-  }
- private:
-  const T* alpha_;
-};
 template <typename DeviceContext, typename T>
 class PReluKernel : public framework::OpKernel<T> {
 public:
@@ -50,53 +31,93 @@ class PReluKernel : public framework::OpKernel<T> {
    const T* x_ptr = x->data<T>();
    T* o_ptr = out->mutable_data<T>(context.GetPlace());
-    auto* alpha_ptr = alpha->data<T>();
+    const T* alpha_ptr = alpha->data<T>();
+    std::string mode = context.Attr<std::string>("mode");
    int numel = x->numel();
+    auto dim = x->dims();
-    Transform<DeviceContext> trans;
+    int index = 0;
-    trans(context.template device_context<DeviceContext>(), x_ptr,
+    int i = 0;
-          x_ptr + numel, o_ptr, PReluFunctor<T>(alpha_ptr));
+    int temp = 0;
-  }
+    if (mode == "channel") {
-};
+      for (i = 0; i < numel; i++) {
+        temp = numel / (dim[0] * dim[1]);
-template <typename T>
+        index = (i / temp) % dim[1];
-class PReluGradFunctor {
+        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
- public:
+      }
-  explicit PReluGradFunctor(const T* alpha) : alpha_(alpha) {}
+    } else if (mode == "element") {
+      for (i = 0; i < numel; i++) {
-  HOSTDEVICE T operator()(const T& out, const T& dout) const {
+        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[i] * x_ptr[i];
-    if (out > 0)
+      }
-      return dout;
+    } else {
-    else
+      for (i = 0; i < numel; i++) {
-      return dout * (*alpha_);
+        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i];
+      }
+    }
  }
- private:
-  const T* alpha_;
 };
 template <typename DeviceContext, typename T>
 class PReluGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
    auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dalpha = context.Output<Tensor>(framework::GradVarName("Alpha"));
    auto* out = context.Input<Tensor>("Out");
    auto* alpha = context.Input<Tensor>("Alpha");
-    auto* alpha_ptr = alpha->data<T>();
+    const T* alpha_ptr = alpha->data<T>();
+    const T* x_ptr = x->data<T>();
-    T* dx_ptr = dx->mutable_data<T>(context.GetPlace());
    const T* dout_ptr = dout->data<T>();
    const T* out_ptr = out->data<T>();
-    int numel = dx->numel();
+    std::string mode = context.Attr<std::string>("mode");
+    int numel = x->numel();
-    Transform<DeviceContext> trans;
+    auto dim = x->dims();
-    trans(context.template device_context<DeviceContext>(), out_ptr,
+    int index = 0;
-          out_ptr + numel, dout_ptr, dx_ptr, PReluGradFunctor<T>(alpha_ptr));
+    int i = 0;
+    int temp = 0;
-    // TODO(Zhuoyuan): add dalpha upgrade when GPU kernels ready
+    if (dx) {
+      T* dx_ptr = dx->mutable_data<T>(context.GetPlace());
+      if (mode == "channel") {
+        for (i = 0; i < numel; i++) {
+          temp = numel / (dim[0] * dim[1]);
+          index = (i / temp) % dim[1];
+          dx_ptr[i] =
+              out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i];
+        }
+      } else if (mode == "element") {
+        for (i = 0; i < numel; i++) {
+          dx_ptr[i] = out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[i] * dout_ptr[i];
+        }
+      } else {
+        for (i = 0; i < numel; i++) {
+          dx_ptr[i] = out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[0] * dout_ptr[i];
+        }
+      }
+    }
+    index = 0;
+    if (dalpha) {
+      T* dalpha_ptr = dalpha->mutable_data<T>(context.GetPlace());
+      // The loops below accumulate with +=, so start every entry from zero.
+      for (i = 0; i < dalpha->numel(); i++) dalpha_ptr[i] = static_cast<T>(0);
+      if (mode == "channel") {
+        for (i = 0; i < numel; i++) {
+          temp = numel / (dim[0] * dim[1]);
+          index = (i / temp) % dim[1];
+          dalpha_ptr[index] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
+        }
+      } else if (mode == "element") {
+        for (i = 0; i < numel; i++) {
+          dalpha_ptr[i] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
+        }
+      } else {
+        for (i = 0; i < numel; i++) {
+          dalpha_ptr[0] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
+        }
+      }
+    }
+    // TODO(Guanzhong): add GPU kernels
  }
 };

--- a/paddle/fluid/operators/scale_op.cu
+++ b/paddle/fluid/operators/scale_op.cu
@@ -13,11 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/scale_op.h"
+#include "paddle/fluid/platform/float16.h"
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
    scale,
    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, float>,
    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, double>,
    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int>,
    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   int64_t>);
+                                   int64_t>,
+    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
+                                   plat::float16>);
--- a/paddle/fluid/operators/scatter_op.h
+++ b/paddle/fluid/operators/scatter_op.h
@@ -35,7 +35,7 @@ class ScatterOpKernel : public framework::OpKernel<T> {
    auto *Out = ctx.Output<Tensor>("Out");
    // In place output: Out = X, Out[Ids] += Updates
-    Out->ShareDataWith(*X);
+    framework::TensorCopySync(*X, ctx.GetPlace(), Out);
    // Apply ScatterUpdate: Out[index] += Updates[:]
    ScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
  }
@@ -53,7 +53,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
    // In place gradient: dX = dO
-    dX->ShareDataWith(*dOut);
+    framework::TensorCopySync(*dOut, ctx.GetPlace(), dX);
    dUpdates->mutable_data<T>(ctx.GetPlace());
    // Gradient by Gather: dUpdates += dO[Ids]
    CPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);

--- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
@@ -78,4 +78,5 @@ REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
                   ops::SoftmaxCUDNNKernel<float>,
                   ops::SoftmaxCUDNNKernel<plat::float16>);
 REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
-                   ops::SoftmaxGradCUDNNKernel<float>);
+                   ops::SoftmaxGradCUDNNKernel<float>,
+                   ops::SoftmaxGradCUDNNKernel<plat::float16>);
--- a/paddle/fluid/operators/softmax_op.cu.cc
+++ b/paddle/fluid/operators/softmax_op.cu.cc
@@ -23,4 +23,5 @@ REGISTER_OP_CUDA_KERNEL(
    ops::SoftmaxKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
    softmax_grad, ops::SoftmaxGradKernel<plat::CUDADeviceContext, float>,
-    ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>);
+    ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>,
+    ops::SoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>);
--- a/paddle/fluid/operators/sum_op.cu
+++ b/paddle/fluid/operators/sum_op.cu
@@ -11,10 +11,13 @@ limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/sum_op.h"
+#include "paddle/fluid/platform/float16.h"
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
    sum, ops::SumKernel<paddle::platform::CUDADeviceContext, float>,
    ops::SumKernel<paddle::platform::CUDADeviceContext, double>,
    ops::SumKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::SumKernel<paddle::platform::CUDADeviceContext, plat::float16>);
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -46,7 +46,7 @@ class SumKernel : public framework::OpKernel<T> {
      if (!in_place) {
        math::SetConstant<DeviceContext, T> constant_functor;
        constant_functor(context.template device_context<DeviceContext>(), out,
-                         0.0);
+                         static_cast<T>(0));
      }
      math::SelectedRowsAddToTensor<DeviceContext, T> functor;

--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
@@ -11,16 +11,19 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <limits>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/top_k_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/float16.h"
 namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
+using paddle::platform::float16;
 template <typename T>
 struct Pair {
@@ -32,6 +35,11 @@ struct Pair {
    id = id;
  }
+  __device__ __forceinline__ void clear() {
+    v = -INFINITY;
+    id = -1;
+  }
  __device__ __forceinline__ void operator=(const Pair<T>& in) {
    v = in.v;
    id = in.id;
@@ -53,6 +61,12 @@ struct Pair {
  int64_t id;
 };
+template <>
+__device__ __forceinline__ void Pair<float16>::clear() {
+  // 0xfc00 is the IEEE half bit pattern of -infinity, mirroring the generic
+  // clear(); 0x400 would be the smallest positive normal value.
+  v = platform::raw_uint16_to_float16(0xfc00);
+  id = -1;
+}
 template <typename T>
 __device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
                                      int beam_size) {
@@ -150,7 +164,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
        if (k < MaxLength - (*beam)) {
          topk[k] = topk[k + *beam];
        } else {
-          topk[k].set(-INFINITY, -1);
+          topk[k].clear();
        }
      }
      if (!(*is_empty)) {
@@ -160,7 +174,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
    }
    *max = topk[MaxLength - 1];
-    if ((*max).v == -1) *is_empty = true;
+    if ((*max).v == static_cast<T>(-1)) *is_empty = true;
    *beam = 0;
  }
 }
@@ -181,7 +195,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
        if (k < MaxLength - *beam) {
          topk[k] = topk[k + *beam];
        } else {
-          topk[k].set(-INFINITY, -1);
+          // Reset through clear(), as the other ThreadGetTopK overload does;
+          // numeric_limits<T>::min() is a positive value for float/double.
+          topk[k].clear();
        }
      }
      if (!(*is_empty)) {
@@ -273,7 +287,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
  bool firststep = true;
  for (int k = 0; k < MaxLength; k++) {
-    topk[k].set(-INFINITY, -1);
+    topk[k].clear();
  }
  while (k) {
    ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
@@ -325,5 +339,7 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
-REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>,
+REGISTER_OP_CUDA_KERNEL(
-                        paddle::operators::TopkOpCUDAKernel<double>);
+    top_k, paddle::operators::TopkOpCUDAKernel<float>,
+    paddle::operators::TopkOpCUDAKernel<double>,
+    paddle::operators::TopkOpCUDAKernel<paddle::platform::float16>);
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -11,10 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <glog/logging.h>
 #include <thrust/random.h>
 #include <thrust/transform.h>
+#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/transform.h"
 namespace paddle {
 namespace operators {
@@ -36,6 +40,11 @@ struct UniformGenerator {
  }
 };
+template <typename T, typename V>
+struct CastFunctor {
+  HOSTDEVICE V operator()(const T& a) { return static_cast<V>(a); }
+};
 // It seems that Eigen::Tensor::random in GPU will SEGFAULT.
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
@@ -66,18 +75,50 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
    T max = static_cast<T>(context.Attr<float>("max"));
    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
    int64_t size = tensor->numel();
-    thrust::transform(index_sequence_begin, index_sequence_begin + size,
+    if (out_var->IsType<framework::LoDTensor>() &&
-                      thrust::device_ptr<T>(data),
+        std::type_index(typeid(T)) ==
-                      UniformGenerator<T>(min, max, seed));
+            std::type_index(typeid(platform::float16))) {
+      framework::Tensor master_copy_tensor;
+      master_copy_tensor.Resize(tensor->dims());
+      float* master_copy_tensor_data =
+          master_copy_tensor.mutable_data<float>(context.GetPlace());
+      thrust::transform(index_sequence_begin, index_sequence_begin + size,
+                        thrust::device_ptr<float>(master_copy_tensor_data),
+                        UniformGenerator<float>(static_cast<float>(min),
+                                                static_cast<float>(max), seed));
+      platform::Transform<platform::CUDADeviceContext> trans;
+      auto* in_begin = master_copy_tensor.data<float>();
+      auto* in_end = in_begin + master_copy_tensor.numel();
+      auto* out_begin = tensor->mutable_data<T>(context.GetPlace());
+      trans(context.template device_context<platform::CUDADeviceContext>(),
+            in_begin, in_end, out_begin, CastFunctor<float, T>());
+    } else {
+      thrust::transform(index_sequence_begin, index_sequence_begin + size,
+                        thrust::device_ptr<T>(data),
+                        UniformGenerator<T>(min, max, seed));
+    }
+    if (VLOG_IS_ON(5)) {
+      framework::Tensor cpu_tensor;
+      framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
+      auto& dev_ctx =
+          *platform::DeviceContextPool::Instance().Get(context.GetPlace());
+      dev_ctx.Wait();
+      auto x = framework::EigenVector<T>::Flatten(cpu_tensor);
+      VLOG(5) << "The Uniform output " << x;
+    }
  }
 };
 }  // namespace operators
 }  // namespace paddle
-REGISTER_OP_CUDA_KERNEL(uniform_random,
+namespace plat = paddle::platform;
-                        paddle::operators::GPUUniformRandomKernel<float>,
+REGISTER_OP_CUDA_KERNEL(
-                        paddle::operators::GPUUniformRandomKernel<double>);
+    uniform_random, paddle::operators::GPUUniformRandomKernel<float>,
-REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like,
+    paddle::operators::GPUUniformRandomKernel<double>,
-                        paddle::operators::GPUUniformRandomKernel<float>,
+    paddle::operators::GPUUniformRandomKernel<plat::float16>);
-                        paddle::operators::GPUUniformRandomKernel<double>);
+REGISTER_OP_CUDA_KERNEL(
+    uniform_random_batch_size_like,
+    paddle::operators::GPUUniformRandomKernel<float>,
+    paddle::operators::GPUUniformRandomKernel<double>,
+    paddle::operators::GPUUniformRandomKernel<plat::float16>);
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -122,7 +122,7 @@ def __bootstrap__():
        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        'cpu_deterministic'
+        "dist_threadpool_size", 'cpu_deterministic'
    ]
    if core.is_compiled_with_dist():
        read_env_flags.append('rpc_deadline')

--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -15,7 +15,6 @@
 from . import framework
 import numpy as np
 import contextlib
-from .framework import convert_np_dtype_to_dtype_
 from .core import VarDesc
 __all__ = [

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -112,6 +112,7 @@ __all__ = [
    'log',
    'crop',
    'rank_loss',
+    'prelu',
    'flatten',
 ]
@@ -5089,7 +5090,7 @@ def random_crop(x, shape, seed=None):
    return out
-def log(x):
+def log(x, name=None):
    """
    Calculates the natural log of the given input tensor, element-wise.
@@ -5099,6 +5100,8 @@ def log(x):
    Args:
        x (Variable): Input tensor.
+        name (str|None, default None): A name for this layer. If set None,
+            the layer will be named automatically.
    Returns:
        Variable: The natural log of the input tensor computed element-wise.
@@ -5116,7 +5119,7 @@ def log(x):
    return out
-def relu(x):
+def relu(x, name=None):
    """
    Relu takes one input data (Tensor) and produces one output data (Tensor)
    where the rectified linear function, y = max(0, x), is applied to
@@ -5128,6 +5131,8 @@ def relu(x):
    Args:
        x (Variable): The input tensor.
+        name (str|None, default None): A name for this layer. If set None,
+            the layer will be named automatically.
    Returns:
        Variable: The output tensor with the same shape as input.
@@ -5364,6 +5369,59 @@ def rank_loss(label, left, right, name=None):
    return out
+def prelu(x, mode, param_attr=None, name=None):
+    r"""
+    Equation:
+        y = \max(0, x) + alpha \min(0, x)
+    Args:
+        x (Variable): The input tensor.
+        mode (string): The mode for weight sharing
+                       all: all elements share same weight (alpha shape [1])
+                       channel: elements in a channel share same weight
+                                (alpha shape [1, x.shape[1], 1, 1])
+                       element: each element has a weight (alpha shape x.shape)
+        param_attr(ParamAttr|None): Attribute of the learnable weight (alpha).
+        name(str|None): Layer name; generated automatically when None.
+    Returns:
+        Variable: The output tensor with the same shape as input.
+    Raises:
+        ValueError: If mode is not one of 'all', 'channel', 'element'.
+    Examples:
+        .. code-block:: python
+            x = fluid.layers.data(name="x", shape=[10,10], dtype="float32")
+            mode = 'channel'
+            output = fluid.layers.prelu(x,mode)
+    """
+    helper = LayerHelper('prelu', **locals())
+    if mode not in ['all', 'channel', 'element']:
+        raise ValueError('mode should be one of all, channel, element.')
+    alpha_shape = [1]
+    if mode == 'channel':
+        alpha_shape = [1, x.shape[1], 1, 1]
+    elif mode == 'element':
+        alpha_shape = x.shape
+    dtype = helper.input_dtype(input_param_name='x')
+    # NOTE(review): helper.param_attr presumably turns a None param_attr into
+    # a default ParamAttr, so the documented prelu(x, mode) call works; confirm.
+    alpha = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=alpha_shape,
+        dtype='float32',
+        is_bias=False,
+        default_initializer=Constant(1.0))
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="prelu", inputs={"X": x, 'Alpha': alpha},
+        attrs={"mode": mode}, outputs={"Out": out})
+    return out
 def flatten(x, axis=1, name=None):
    """
    **Flatten layer**

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -59,8 +59,8 @@ py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=$
 if(WITH_DISTRIBUTE)
    py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
    set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
-    set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180)
+    set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
-    set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180)
+    set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)

--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
@@ -26,6 +26,12 @@ from paddle.fluid.layers.io import ListenAndServ
 from paddle.fluid.layers.io import Recv
 from paddle.fluid.layers.io import Send
+from paddle.fluid import core
+RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName(
+)
+RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
 class TestSendOp(unittest.TestCase):
    def test_send(self):
@@ -89,18 +95,29 @@ class TestSendOp(unittest.TestCase):
    def init_client(self, place, port):
        main = fluid.Program()
        with fluid.program_guard(main):
+            main.global_block().append_op(
+                type="fetch_barrier",
+                inputs={},
+                outputs={},
+                attrs={
+                    "endpoints": ["127.0.0.1:{0}".format(port)],
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                })
            x = layers.data(
                shape=[32, 32],
                dtype='float32',
                name='X',
                append_batch_size=False)
            fluid.initializer.Constant(value=2.3)(x, main.global_block())
            get_var = main.global_block().create_var(
                name="scale_0.tmp_0",  # server side var
                dtype="float32",
                persistable=False,
                shape=[32, 32])
            fluid.initializer.Constant(value=2.3)(get_var, main.global_block())
            Send("127.0.0.1:%d" % port, [x])
            o = Recv("127.0.0.1:%d" % port, [get_var])

--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -18,6 +18,7 @@ import unittest
 import paddle.fluid as fluid
 from paddle.fluid.transpiler.distribute_transpiler import delete_ops
 import traceback
+import collections
 class TranspilerTest(unittest.TestCase):
@@ -53,9 +54,18 @@ class TranspilerTest(unittest.TestCase):
        self.origin_prog = main.clone()
        return main
-    def get_trainer(self, config=None, sync_mode=True):
+    def get_trainer(self, config=None):
-        t = self._transpiler_instance(config, sync_mode)
+        src = fluid.default_startup_program().clone()
-        return t.get_trainer_program()
+        t = self._transpiler_instance(config)
+        trainer_main = t.get_trainer_program()
+        trainer_startup = fluid.default_startup_program()
+        assert (src.num_blocks == 1)
+        assert (trainer_startup.num_blocks == src.num_blocks)
+        return trainer_main, trainer_startup
    def get_pserver(self, ep, config=None, sync_mode=True):
        t = self._transpiler_instance(config, sync_mode)
@@ -91,7 +101,21 @@ class TestBasicModel(TranspilerTest):
        pserver, startup = self.get_pserver(self.pserver1_ep)
        pserver2, startup2 = self.get_pserver(self.pserver2_ep)
-        trainer = self.get_trainer()
+        trainer, trainer_startup = self.get_trainer()
+        # splited var blocks should be in startup program
+        self.assertTrue("fc_w.block0" in trainer_startup.global_block().vars)
+        self.assertTrue("fc_w.block1" in trainer_startup.global_block().vars)
+        self.assertTrue("fc_w" in trainer_startup.global_block().vars)
+        self.assertTrue("fc_b" in trainer_startup.global_block().vars)
+        self.assertTrue("fc_w@GRAD" not in trainer_startup.global_block().vars)
+        self.assertTrue("fc_b@GRAD" not in trainer_startup.global_block().vars)
+        src = [op.type for op in trainer_startup.global_block().ops]
+        dst = ['fill_constant', 'fill_constant', 'uniform_random', 'recv', 'recv', \
+               'fetch_barrier', 'concat']
+        self.assertEqual(src, dst)
        self.assertEqual([op.type for op in trainer.global_block().ops], [
            'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean',
@@ -142,7 +166,7 @@ class TestBasicModelWithLargeBlockSize(TranspilerTest):
        pserver, startup = self.get_pserver(self.pserver1_ep, config)
        pserver2, startup2 = self.get_pserver(self.pserver2_ep, config)
-        trainer = self.get_trainer(config)
+        trainer, _ = self.get_trainer(config)
        self.assertEqual([op.type for op in trainer.global_block().ops], [
            'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean',
@@ -226,7 +250,7 @@ class TestLRDecay(TranspilerTest):
    def transpiler_test_impl(self):
        pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
        self.assertEqual(len(pserver.blocks), 4)
        lr_decay_ops = [op.type for op in pserver.blocks[1].ops]
@@ -256,7 +280,7 @@ class TestLRDecayConditional(TranspilerTest):
    def transpiler_test_impl(self):
        pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
        serv_op = pserver.blocks[0].ops[0]
        sub_blocks = []
@@ -305,7 +329,7 @@ class TestL2Decay(TranspilerTest):
    def transpiler_test_impl(self):
        pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
        self.assertEqual(len(pserver.blocks), 3)
        self.assertEqual([op.type for op in pserver.blocks[1].ops],
@@ -340,7 +364,7 @@ class TestL2DecayWithPiecewise(TranspilerTest):
    def transpiler_test_impl(self):
        pserver, startup = self.get_pserver(self.pserver1_ep)
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
        self.assertEqual(len(pserver.blocks), 9)
        self.assertEqual([op.type for op in pserver.blocks[1].ops], [
@@ -415,7 +439,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
                         ["sum", "adam", "scale", "scale"])
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
        self.assertEqual(len(trainer.blocks), 1)
        ops = [
            'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
@@ -453,7 +477,7 @@ class TestDistLookupTable(TestDistLookupTableBase):
        # 5 save table
        self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
-        trainer = self.get_trainer()
+        trainer, _ = self.get_trainer()
        self.assertEqual(len(trainer.blocks), 1)
        ops = [
            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',
@@ -486,7 +510,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
                         ["adam", "scale", "scale"])
-        trainer = self.get_trainer(config)
+        trainer, _ = self.get_trainer(config)
        self.assertEqual(len(trainer.blocks), 1)
        ops = [
            'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
@@ -525,7 +549,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
        # 5 save table
        self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"])
-        trainer = self.get_trainer(config)
+        trainer, _ = self.get_trainer(config)
        self.assertEqual(len(trainer.blocks), 1)
        ops = [
            'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids',

--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -21,6 +21,7 @@ import paddle.fluid.nets as nets
 from paddle.fluid.framework import Program, program_guard, default_main_program
 from paddle.fluid.param_attr import ParamAttr
 import decorators
+from paddle.fluid.initializer import Constant
 class TestBook(unittest.TestCase):
@@ -485,6 +486,20 @@ class TestBook(unittest.TestCase):
            self.assertIsNotNone(out)
        print(str(program))
+    def test_prelu(self):
+        # Build a prelu layer in 'channel' mode inside a fresh program and
+        # check that an output variable is produced.
+        prog = Program()
+        with program_guard(prog):
+            data = layers.data(
+                name="input", shape=[5, 200, 100, 100], dtype="float32")
+            alpha_attr = ParamAttr(initializer=Constant(1.0))
+            weight_mode = 'channel'
+            result = layers.prelu(
+                data, weight_mode, param_attr=alpha_attr, name='prelu')
+            self.assertIsNotNone(result)
+        print(str(prog))
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_prelu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py
@@ -20,30 +20,58 @@ from op_test import OpTest
 class PReluTest(OpTest):
    def setUp(self):
        self.op_type = "prelu"
-        x_np = np.random.normal(size=(10, 10)).astype("float32")
+        self.initTestCase()
+        x_np = np.random.normal(size=(3, 5, 5, 10)).astype("float32")
-        for pos, val in np.ndenumerate(x_np):
-            # Since zero point in prelu is not differentiable, avoid randomize
+        # Since zero point in prelu is not differentiable, avoid randomize
-            # zero.
+        # zero.
-            while abs(val) < 1e-3:
+        x_np[np.abs(x_np) < 0.005] = 0.02
-                x_np[pos] = np.random.normal()
-                val = x_np[pos]
+        if self.attrs == {'mode': "all"}:
+            alpha_np = np.random.rand(1).astype("float32")
-        x_np_sign = np.sign(x_np)
+            self.inputs = {'X': x_np, 'Alpha': alpha_np}
-        x_np = x_np_sign * np.maximum(x_np, .005)
+        elif self.attrs == {'mode': "channel"}:
-        alpha_np = np.array([.1], dtype="float32")
+            alpha_np = np.random.rand(1, x_np.shape[1], 1, 1).astype("float32")
-        self.inputs = {'X': x_np, 'Alpha': alpha_np}
+            self.inputs = {'X': x_np, 'Alpha': alpha_np}
+        else:
+            alpha_np = np.random.rand(*x_np.shape).astype("float32")
+            self.inputs = {'X': x_np, 'Alpha': alpha_np}
        out_np = np.maximum(self.inputs['X'], 0.)
        out_np = out_np + np.minimum(self.inputs['X'],
                                     0.) * self.inputs['Alpha']
        assert out_np is not self.inputs['X']
        self.outputs = {'Out': out_np}
+    def initTestCase(self):  # hook called from setUp to choose the prelu mode
+        self.attrs = {'mode': "channel"}
    def test_check_output(self):
        self.check_output()
    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X', 'Alpha'], 'Out')
+    def test_check_grad_ignore_x(self):  # gradient w.r.t. Alpha only
+        self.check_grad(['Alpha'], 'Out', no_grad_set=set(['X']))
+    def test_check_grad_ignore_alpha(self):  # set('Alpha') would be a set of letters
+        self.check_grad(['X'], 'Out', no_grad_set=set(['Alpha']))
+class TestCase1(PReluTest):  # reruns the PReluTest checks with mode "all"
+    def initTestCase(self):
+        self.attrs = {'mode': "all"}
+class TestCase2(PReluTest):  # mode "channel", same as the PReluTest default
+    def initTestCase(self):
+        self.attrs = {'mode': "channel"}
+class TestCase3(PReluTest):  # reruns the PReluTest checks with mode "element"
+    def initTestCase(self):
+        self.attrs = {'mode': "element"}
 if __name__ == "__main__":

--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -195,6 +195,9 @@ class DistributeTranspiler(object):
        if program is None:
            program = default_main_program()
        self.origin_program = program
+        self.origin_startup_program = default_startup_program().clone()
+        self.startup_program = default_startup_program()
        self.trainer_num = trainers
        self.sync_mode = sync_mode
        self.trainer_id = trainer_id
@@ -205,10 +208,10 @@ class DistributeTranspiler(object):
        ps_dispatcher = self.config.split_method(self.pserver_endpoints)
        self.has_distributed_lookup_table = self._has_distributed_lookup_table()
-        # split and create vars, then put splited vars in dicts for later use.
+        # step 1: split and create vars, then put splited vars in dicts for later use.
        self._init_splited_vars()
-        # step 3.1: insert send op to send gradient vars to parameter servers
+        # step 2: insert send op to send gradient vars to parameter servers
        ps_dispatcher.reset()
        send_vars = []
@@ -265,7 +268,7 @@ class DistributeTranspiler(object):
                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                })
-        # step 3.2: insert recv op to receive parameters from parameter server
+        # step 3: insert recv op to receive parameters from parameter server
        recv_vars = []
        for _, var in enumerate(send_vars):
            recv_vars.append(self.grad_param_mapping[var])
@@ -312,6 +315,8 @@ class DistributeTranspiler(object):
                outputs={"Out": [orig_param]},
                attrs={"axis": 0})
+        self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist)
        if self.has_distributed_lookup_table:
            self._replace_lookup_table_op_with_prefetch(program,
                                                        pserver_endpoints)
@@ -328,8 +333,78 @@ class DistributeTranspiler(object):
        # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?
        delete_ops(self.origin_program.global_block(), self.optimize_ops)
        self.origin_program.__str__()
        return self.origin_program
+    def _get_trainer_startup_program(self,
+                                     recv_vars,
+                                     eplist,
+                                     startup_program=None):
+        """
+        Get transpiled trainer side startup program.
+        Args:
+            recv_vars(list): variables to receive; matched to splited vars by name.
+            eplist(list): endpoints; eplist[i] is used for recv_vars[i].
+            startup_program(Program): Startup program (default: self.startup_program).
+        Returns:
+            Program: trainer side startup program (the input, modified in place).
+        """
+        if startup_program is None:
+            startup_program = self.startup_program
+        recv_var_names = [v.name for v in recv_vars]
+        # FIXME(gongwb): delete not need ops.
+        # note that: some parameter is not trainable and those ops can't be deleted.
+        for varname, splited_var in self.param_var_mapping.iteritems():
+            # Endpoint of each splited var, looked up by its position in recv_vars.
+            eps = [eplist[recv_var_names.index(v.name)] for v in splited_var]
+            for var in splited_var:
+                if startup_program.global_block().has_var(var.name):
+                    continue
+                startup_program.global_block().create_var(
+                    name=var.name,
+                    persistable=False,
+                    type=var.type,
+                    dtype=var.dtype,
+                    shape=var.shape,
+                    lod_level=var.lod_level)
+            startup_program.global_block().append_op(
+                type="recv",
+                inputs={},
+                outputs={"Out": splited_var},
+                attrs={
+                    "epmap": eps,
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                })
+        startup_program.global_block().append_op(
+            type="fetch_barrier",
+            inputs={},
+            outputs={},
+            attrs={
+                "endpoints": self.pserver_endpoints,
+                RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+            })
+        for varname, splited_var in self.param_var_mapping.iteritems():
+            # add concat ops to merge splited parameters received from parameter servers.
+            if len(splited_var) <= 1:
+                continue
+            orig_param = startup_program.global_block().vars[varname]
+            startup_program.global_block().append_op(
+                type="concat",
+                inputs={"X": splited_var},
+                outputs={"Out": [orig_param]},
+                attrs={"axis": 0})
+        return startup_program
    def get_pserver_program(self, endpoint):
        """
        Get parameter server side program.
@@ -576,14 +651,16 @@ class DistributeTranspiler(object):
            new_outputs = dict()
            # do not append startup op if var is not on this pserver
            op_on_pserver = False
-            for key in op.output_names:
+            # TODO(gongwb): remove this line.
-                newname, _ = _get_splited_name_and_shape(op.output(key)[0])
+            if op.type not in ["recv", "fetch_barrier", "concat"]:
-                if newname:
+                for key in op.output_names:
-                    op_on_pserver = True
+                    newname, _ = _get_splited_name_and_shape(op.output(key)[0])
-                    new_outputs[key] = created_var_map[newname]
+                    if newname:
-                elif op.output(key)[0] in pserver_vars:
+                        op_on_pserver = True
-                    op_on_pserver = True
+                        new_outputs[key] = created_var_map[newname]
-                    new_outputs[key] = pserver_vars[op.output(key)[0]]
+                    elif op.output(key)[0] in pserver_vars:
+                        op_on_pserver = True
+                        new_outputs[key] = pserver_vars[op.output(key)[0]]
            if op_on_pserver:
                # most startup program ops have no inputs
@@ -1022,7 +1099,6 @@ class DistributeTranspiler(object):
                    var_mapping[varname] = \
                        [program.global_block().var(orig_var.name)]
                continue
            var_mapping[varname] = []
            orig_shape = orig_var.shape
            orig_dim1_flatten = 1