Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into average_model

d7e5e1f1 · wanghaoshuang · 8a645685 · e26f1123 · d7e5e1f1 · d7e5e1f1
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,7 +53,7 @@ option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
 option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
 # TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option. 
-option(WITH_FLUID       "Compile PaddlePaddle fluid only(TODO)"         ON)
+option(WITH_FLUID       "Compile PaddlePaddle fluid only(TODO)"         OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)

--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
-add_subdirectory(cuda)
-add_subdirectory(function)
-add_subdirectory(utils)
-add_subdirectory(math)
-add_subdirectory(gserver)
-add_subdirectory(parameter)
-add_subdirectory(testing)
+if(NOT WITH_FLUID)
+  add_subdirectory(cuda)
+  add_subdirectory(function)
+  add_subdirectory(utils)
+  add_subdirectory(math)
+  add_subdirectory(gserver)
+  add_subdirectory(parameter)

-if(MOBILE_INFERENCE)
+  if(MOBILE_INFERENCE)
    add_subdirectory(capi)
-else()
+  else()
    add_subdirectory(pserver)
    add_subdirectory(trainer)
    add_subdirectory(scripts)
@@ -17,11 +17,13 @@ else()
      add_subdirectory(capi)
    endif()

-  if(NOT ANDROID AND NOT IOS)
-    add_subdirectory(fluid)
-  endif()
-
    if(WITH_SWIG_PY)
      add_subdirectory(api)
    endif()
+  endif()
+endif()
+
+add_subdirectory(testing)
+if(NOT MOBILE_INFERENCE AND NOT ANDROID AND NOT IOS)
+  add_subdirectory(fluid)
 endif()
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -15,23 +15,43 @@ limitations under the License. */
 #pragma once

 #include <stddef.h>  // for size_t
+#include <condition_variable>
 #include <typeindex>
 #include "paddle/fluid/platform/enforce.h"

 namespace paddle {
 namespace framework {

+enum class ChannelAction {  // Event kind handed to a queued message's callback.
+  SEND = 0,     // Passed by ChannelImpl::Send before it hands data to a queued receiver.
+  RECEIVE = 1,  // Passed by ChannelImpl::Receive before it takes data from a queued sender.
+  CLOSE = 2,    // Passed by ChannelImpl::Close to each message it drains from the queues.
+};
+
 // Channel is the abstract class of buffered and un-buffered channels.
 template <typename T>
 class Channel {
 public:
+  virtual bool CanSend() = 0;  // ChannelImpl: open, and a receiver is queued or the buffer has room.
+  virtual bool CanReceive() = 0;  // ChannelImpl: a sender is queued or the buffer is non-empty.
  virtual bool Send(T*) = 0;
  virtual bool Receive(T*) = 0;
  virtual size_t Cap() = 0;
  virtual void Lock() = 0;
+
  virtual void Unlock() = 0;
+  virtual bool IsClosed() = 0;  // ChannelImpl: returns its closed flag.
  virtual void Close() = 0;
  virtual ~Channel() {}
+
+  virtual void AddToSendQ(const void* referrer, T* data,  // Queue a pending send tagged with `referrer`.
+                          std::shared_ptr<std::condition_variable_any> cond,
+                          std::function<bool(ChannelAction)> cb) = 0;  // `cb` is asked before any transfer.
+  virtual void AddToReceiveQ(const void* referrer, T* data,  // Queue a pending receive tagged with `referrer`.
+                             std::shared_ptr<std::condition_variable_any> cond,
+                             std::function<bool(ChannelAction)> cb) = 0;  // `cb` is asked before any transfer.
+  virtual void RemoveFromSendQ(const void* referrer) = 0;  // Drop pending sends tagged with `referrer`.
+  virtual void RemoveFromReceiveQ(const void* referrer) = 0;  // Drop pending receives tagged with `referrer`.
 };

 // Forward declaration of channel implementations.
@@ -80,6 +100,27 @@ class ChannelHolder {
    return channel != nullptr ? channel->Receive(data) : false;
  }

+  // Reports the held channel's closed state. A holder that was never
+  // initialized is reported as not closed.
+  bool IsClosed() { return IsInitialized() && holder_->IsClosed(); }
+
+  // Asks the held channel whether it can accept a send. A holder that was
+  // never initialized answers false.
+  bool CanSend() { return IsInitialized() && holder_->CanSend(); }
+
+  // Asks the held channel whether it can serve a receive. A holder that was
+  // never initialized answers false.
+  bool CanReceive() { return IsInitialized() && holder_->CanReceive(); }
+
  void close() {
    if (IsInitialized()) holder_->Close();
  }
@@ -97,6 +138,50 @@ class ChannelHolder {
    if (IsInitialized()) holder_->Unlock();
  }

+  // Forwards a pending-send registration to the held Channel<T>.
+  // Does nothing when the holder is uninitialized or its pointer is null.
+  template <typename T>
+  void AddToSendQ(const void* referrer, T* data,
+                  std::shared_ptr<std::condition_variable_any> cond,
+                  std::function<bool(ChannelAction)> cb) {
+    if (!IsInitialized()) return;
+    auto* typed = static_cast<Channel<T>*>(holder_->Ptr());
+    if (typed == nullptr) return;
+    typed->AddToSendQ(referrer, data, cond, cb);
+  }
+
+  // Forwards a pending-receive registration to the held Channel<T>.
+  // Does nothing when the holder is uninitialized or its pointer is null.
+  template <typename T>
+  void AddToReceiveQ(const void* referrer, T* data,
+                     std::shared_ptr<std::condition_variable_any> cond,
+                     std::function<bool(ChannelAction)> cb) {
+    if (!IsInitialized()) return;
+    auto* typed = static_cast<Channel<T>*>(holder_->Ptr());
+    if (typed == nullptr) return;
+    typed->AddToReceiveQ(referrer, data, cond, cb);
+  }
+
+  // Asks the held Channel<T> to drop the pending sends tagged with
+  // `referrer`. Does nothing when the holder is uninitialized or its
+  // pointer is null.
+  template <typename T>
+  void RemoveFromSendQ(const void* referrer) {
+    if (!IsInitialized()) return;
+    auto* typed = static_cast<Channel<T>*>(holder_->Ptr());
+    if (typed == nullptr) return;
+    typed->RemoveFromSendQ(referrer);
+  }
+
+  // Asks the held Channel<T> to drop the pending receives tagged with
+  // `referrer`. Does nothing when the holder is uninitialized or its
+  // pointer is null.
+  template <typename T>
+  void RemoveFromReceiveQ(const void* referrer) {
+    if (!IsInitialized()) return;
+    auto* typed = static_cast<Channel<T>*>(holder_->Ptr());
+    if (typed == nullptr) return;
+    typed->RemoveFromReceiveQ(referrer);
+  }
+
  inline bool IsInitialized() const { return holder_ != nullptr; }

  inline const std::type_index Type() {
@@ -113,6 +198,9 @@ class ChannelHolder {
    virtual ~Placeholder() {}
    virtual const std::type_index Type() const = 0;
    virtual void* Ptr() const = 0;
+    virtual bool IsClosed() = 0;
+    virtual bool CanSend() = 0;
+    virtual bool CanReceive() = 0;
    virtual void Close() = 0;
    virtual void Lock() = 0;
    virtual void Unlock() = 0;
@@ -129,6 +217,27 @@ class ChannelHolder {

    virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }

+    // Delegates to the wrapped channel; a missing channel counts as not closed.
+    virtual bool IsClosed() {
+      if (!channel_) return false;
+      return channel_->IsClosed();
+    }
+
+    // Delegates to the wrapped channel; a missing channel cannot send.
+    virtual bool CanSend() {
+      if (!channel_) return false;
+      return channel_->CanSend();
+    }
+
+    // Delegates to the wrapped channel; a missing channel cannot receive.
+    virtual bool CanReceive() {
+      if (!channel_) return false;
+      return channel_->CanReceive();
+    }
+
    virtual void Close() {
      if (channel_) channel_->Close();
    }

--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
@@ -29,32 +29,50 @@ class ChannelImpl : public paddle::framework::Channel<T> {
  friend void paddle::framework::CloseChannel<T>(Channel<T> *);

 public:
+  virtual bool CanSend();
+  virtual bool CanReceive();
  virtual bool Send(T *);
  virtual bool Receive(T *);
  virtual size_t Cap() { return cap_; }
  virtual void Lock();
  virtual void Unlock();
+  virtual bool IsClosed();
  virtual void Close();
-
  ChannelImpl(size_t);
  virtual ~ChannelImpl();

+  virtual void AddToSendQ(const void *referrer, T *data,
+                          std::shared_ptr<std::condition_variable_any> cond,
+                          std::function<bool(ChannelAction)> cb);
+  virtual void AddToReceiveQ(const void *referrer, T *data,
+                             std::shared_ptr<std::condition_variable_any> cond,
+                             std::function<bool(ChannelAction)> cb);
+
+  virtual void RemoveFromSendQ(const void *referrer);
+  virtual void RemoveFromReceiveQ(const void *referrer);
+
 private:
  struct QueueMessage {
    T *data;
-    std::condition_variable_any cond;
+    // Condition variable the waiter blocks on. It is shared so that a caller
+    // of AddToSendQ/AddToReceiveQ can supply its own.
+    std::shared_ptr<std::condition_variable_any> cond;
    bool chan_closed = false;
    bool completed = false;
+    // Tag used by RemoveFromSendQ/RemoveFromReceiveQ to find this entry.
+    // Defaults to nullptr: the constructors below never set it, and the
+    // Remove* functions compare it for every queued message, so it must not
+    // be left indeterminate.
+    // TODO(thuan): figure out better way to do this
+    const void *referrer = nullptr;
+    // Optional hook consulted by Send/Receive/Close; empty when unset.
+    std::function<bool(ChannelAction)> callback;

-    QueueMessage(T *item) : data(item) {}
+    // Builds a message that owns a fresh condition variable.
+    QueueMessage(T *item)
+        : data(item), cond(std::make_shared<std::condition_variable_any>()) {}
+
+    // Builds a message that waits/notifies on a caller-supplied condition
+    // variable.
+    QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
+        : data(item), cond(cond) {}

    void Wait(std::unique_lock<std::recursive_mutex> &lock) {
-      cond.wait(lock, [this]() { return completed; });
+      // Block until Notify() has marked this message completed.
+      cond->wait(lock, [this]() { return completed; });
    }

    void Notify() {
      completed = true;
-      cond.notify_all();
+      cond->notify_all();
    }
  };

@@ -87,6 +105,18 @@ ChannelImpl<T>::ChannelImpl(size_t capacity)
  PADDLE_ENFORCE_GE(capacity, 0);
 }

+// True when the channel is open and either a receiver is already queued or
+// the buffer holds fewer than cap_ items. Evaluated while holding mu_.
+template <typename T>
+bool ChannelImpl<T>::CanSend() {
+  std::lock_guard<std::recursive_mutex> guard{mu_};
+  if (closed_) return false;
+  if (!recvq.empty()) return true;
+  return buf_.size() < cap_;
+}
+
+// True when a sender is queued or the buffer is non-empty. A closed channel
+// with an empty buffer always answers false. Evaluated while holding mu_.
+template <typename T>
+bool ChannelImpl<T>::CanReceive() {
+  std::lock_guard<std::recursive_mutex> guard{mu_};
+  if (closed_ && buf_.empty()) return false;
+  if (!sendq.empty()) return true;
+  return buf_.size() > 0;
+}
+
 template <typename T>
 bool ChannelImpl<T>::Send(T *item) {
  send_ctr++;
@@ -105,7 +135,24 @@ bool ChannelImpl<T>::Send(T *item) {
    std::shared_ptr<QueueMessage> m = recvq.front();
    recvq.pop_front();
    // Do the data transfer
+    // We will do this data transfer if either of the following
+    // cases is true
+    // 1. callback == nullptr // This means it was a regular channel receive
+    // 2. callback returns true
+    bool do_send = true;
+    if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND);
+    if (do_send)
      *(m->data) = std::move(*item);
+    else
+      // We cannot do the data transfer because
+      // this QueueMessage was added by Select
+      // and some other case was executed.
+      // So call the Send function again.
+      // We do not care about notifying other
+      // because they would have been notified
+      // by the executed select case.
+      return Send(item);
+
    // Wake up the blocked process and unlock
    m->Notify();
    lock.unlock();
@@ -150,7 +197,25 @@ bool ChannelImpl<T>::Receive(T *item) {
    std::shared_ptr<QueueMessage> m = sendq.front();
    sendq.pop_front();
    // Do the data transfer
+    // We will do this data transfer if either of the following
+    // cases are true
+    // 1. callback == nullptr // This means it was a regular channel send
+    // 2. callback returns true
+    bool do_receive = true;
+    if (m->callback != nullptr)
+      do_receive = m->callback(ChannelAction::RECEIVE);
+    if (do_receive)
      *item = std::move(*(m->data));
+    else
+      // We cannot do the data transfer because
+      // this QueueMessage was added by Select
+      // and some other case was executed.
+      // So call the Receive function again.
+      // We do not care about notifying other
+      // because they would have been notified
+      // by the executed select case.
+      return Receive(item);
+
    // Wake up the blocked process and unlock
    m->Notify();
    lock.unlock();
@@ -186,6 +251,12 @@ void ChannelImpl<T>::Unlock() {
  mu_.unlock();
 }

+template <typename T>
+bool ChannelImpl<T>::IsClosed() {  // Returns the current value of closed_.
+  std::lock_guard<std::recursive_mutex> lock{mu_};  // closed_ is read while holding mu_
+  return closed_;
+}
+
 template <typename T>
 void ChannelImpl<T>::Close() {
  std::unique_lock<std::recursive_mutex> lock{mu_};
@@ -203,6 +274,12 @@ void ChannelImpl<T>::Close() {
    std::shared_ptr<QueueMessage> m = recvq.front();
    recvq.pop_front();
    m->chan_closed = true;
+
+    // Execute callback function (if any)
+    if (m->callback != nullptr) {
+      m->callback(ChannelAction::CLOSE);
+    }
+
    m->Notify();
  }

@@ -211,10 +288,72 @@ void ChannelImpl<T>::Close() {
    std::shared_ptr<QueueMessage> m = sendq.front();
    sendq.pop_front();
    m->chan_closed = true;
+
+    // Execute callback function (if any)
+    if (m->callback != nullptr) {
+      m->callback(ChannelAction::CLOSE);
+    }
+
    m->Notify();
  }
 }

+// Appends a pending send to sendq without blocking.
+//   referrer: tag later matched by RemoveFromSendQ.
+//   data:     item pointer stored in the queued message.
+//   cond:     condition variable the queued message notifies on.
+//   cb:       callback stored on the message; Receive and Close invoke it.
+template <typename T>
+void ChannelImpl<T>::AddToSendQ(
+    const void *referrer, T *data,
+    std::shared_ptr<std::condition_variable_any> cond,
+    std::function<bool(ChannelAction)> cb) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  // cond and cb are by-value parameters, so move them into the message
+  // instead of copying them a second time.
+  auto m = std::make_shared<QueueMessage>(data, std::move(cond));
+  m->referrer = referrer;
+  m->callback = std::move(cb);
+  sendq.push_back(std::move(m));
+}
+
+// Appends a pending receive to recvq without blocking.
+//   referrer: tag later matched by RemoveFromReceiveQ.
+//   data:     destination pointer stored in the queued message.
+//   cond:     condition variable the queued message notifies on.
+//   cb:       callback stored on the message; Send and Close invoke it.
+template <typename T>
+void ChannelImpl<T>::AddToReceiveQ(
+    const void *referrer, T *data,
+    std::shared_ptr<std::condition_variable_any> cond,
+    std::function<bool(ChannelAction)> cb) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  // cond and cb are by-value parameters, so move them into the message
+  // instead of copying them a second time.
+  auto m = std::make_shared<QueueMessage>(data, std::move(cond));
+  m->referrer = referrer;
+  m->callback = std::move(cb);
+  recvq.push_back(std::move(m));
+}
+
+// Erases every message in sendq whose referrer equals `referrer`, while
+// holding mu_. send_ctr is decremented once per erased message.
+template <typename T>
+void ChannelImpl<T>::RemoveFromSendQ(const void *referrer) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+
+  for (auto it = sendq.begin(); it != sendq.end();) {
+    // Inspect the element in place; no shared_ptr copy is needed.
+    if ((*it)->referrer == referrer) {
+      it = sendq.erase(it);
+      // NOTE(review): AddToSendQ does not increment send_ctr, yet each
+      // removal decrements it - confirm the counter cannot underflow.
+      send_ctr--;
+    } else {
+      ++it;
+    }
+  }
+}
+
+// Erases every message in recvq whose referrer equals `referrer`, while
+// holding mu_. recv_ctr is decremented once per erased message.
+template <typename T>
+void ChannelImpl<T>::RemoveFromReceiveQ(const void *referrer) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+
+  for (auto it = recvq.begin(); it != recvq.end();) {
+    // Inspect the element in place; no shared_ptr copy is needed.
+    if ((*it)->referrer == referrer) {
+      it = recvq.erase(it);
+      // NOTE(review): AddToReceiveQ does not increment recv_ctr, yet each
+      // removal decrements it - confirm the counter cannot underflow.
+      recv_ctr--;
+    } else {
+      ++it;
+    }
+  }
+}
+
 template <typename T>
 ChannelImpl<T>::~ChannelImpl() {
  Close();

--- a/paddle/fluid/operators/assign_op.cc
+++ b/paddle/fluid/operators/assign_op.cc
@@ -56,6 +56,7 @@ class AssignFunctor {
 private:
  void copy_tensor(const framework::LoDTensor &lod_tensor,
                   framework::LoDTensor *out) const {
+    if (lod_tensor.numel() == 0) return;
    auto &out_tensor = *out;
    TensorCopy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor);
    out_tensor.set_lod(lod_tensor.lod());

--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -17,11 +17,14 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

+using framework::OpKernelType;
 using framework::Tensor;

-class MulOpShapeInference : public framework::InferShapeBase {
+class MulOp : public framework::OperatorWithKernel {
 public:
-  void operator()(framework::InferShapeContext* ctx) const override {
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -122,7 +125,7 @@ or not. But the output only shares the LoD information with input $X$.
  }
 };

-class MulOpGrad : public framework::OperatorWithKernel {
+class MulGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

@@ -156,10 +159,7 @@ class MulOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker,
-                  ops::MulOpShapeInference,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(mul_grad, ops::MulOpGrad);
+REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulGradOp);
 REGISTER_OP_CPU_KERNEL(
    mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(

--- a/paddle/fluid/operators/mul_op.cu.cc
+++ b/paddle/fluid/operators/mul_op.cu.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/mul_op.h"
+#include "paddle/fluid/platform/float16.h"

 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    mul_grad, ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
+                        ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(mul_grad,
+                        ops::MulGradKernel<plat::CUDADeviceContext, float>);
--- a/paddle/fluid/operators/mul_op.h
+++ b/paddle/fluid/operators/mul_op.h
@@ -48,7 +48,7 @@ class MulKernel : public framework::OpKernel<T> {
    }
    math::matmul<DeviceContext, T>(
        context.template device_context<DeviceContext>(), x_matrix, false,
-        y_matrix, false, 1, z, 0);
+        y_matrix, false, static_cast<T>(1), z, static_cast<T>(0));
    if (z_dim.size() != 2) {
      z->Resize(z_dim);
    }

--- a/paddle/fluid/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
@@ -106,6 +106,8 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
    T* recvbuffer = nullptr;
    if (root == gpu_id) {
      recvbuffer = out->mutable_data<T>(ctx.GetPlace());
+    } else {
+      out->Resize(framework::make_ddim({0}));
    }
    VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
            << " recv " << out->numel();

--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -23,24 +23,24 @@ class ScatterOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Ref"),
-                   "Input(Ref) of ScatterOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Index"),
-                   "Input(Index) of ScatterOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ScatterOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Ids"),
+                   "Input(Ids) of ScatterOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Updates"),
                   "Input(Updates) of ScatterOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of ScatterOp should not be null.");

    auto updates_dims = ctx->GetInputDim("Updates");
-    auto ref_dims = ctx->GetInputDim("Ref");
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Index").size(), 1,
-                      "Update Index should be 1-D.");
+    auto ref_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Ids").size(), 1,
+                      "Update Ids should be 1-D.");
    PADDLE_ENFORCE_EQ(ref_dims.size(), updates_dims.size(),
-                      "Reference and Updates should have the same shape size");
+                      "X and Updates should have the same shape size");
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0],
-                      ctx->GetInputDim("Index")[0],
-                      "Updates and Index should have same batch-size.");
+                      ctx->GetInputDim("Ids")[0],
+                      "Updates and Ids should have same batch-size.");
    framework::DDim data_dim(updates_dims);
    for (int i = 1; i < data_dim.size(); ++i) {
      PADDLE_ENFORCE_EQ(data_dim[i], updates_dims[i]);
@@ -52,7 +52,7 @@ class ScatterOp : public framework::OperatorWithKernel {
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
        ctx.device_context());
  }
 };
@@ -64,14 +64,14 @@ class ScatterGradOp : public framework::OperatorWithKernel {
  void InferShape(framework::InferShapeContext* ctx) const override {
    ctx->SetOutputDim(framework::GradVarName("Updates"),
                      ctx->GetInputDim("Updates"));
-    ctx->SetOutputDim(framework::GradVarName("Ref"), ctx->GetInputDim("Ref"));
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
        ctx.device_context());
  }
 };
@@ -80,9 +80,8 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Ref", "The source input of scatter op");
-    AddInput("Index",
-             "The index input of scatter op where Ref will be updated");
+    AddInput("X", "The source input of scatter op");
+    AddInput("Ids", "The index input of scatter op where X will be updated");
    AddInput("Updates", "The updated value of updates op");
    AddOutput("Out", "The output of add op");
    AddComment(R"DOC(
@@ -91,8 +90,8 @@ Scatter Operator.
 This operator obtains output by updating the input on selected indices on the first axis:

 $$
-Out = Ref \\
-Out[Index] = Ref[Index] + Updates
+Out = X \\
+Out[Ids] = X[Ids] + Updates
 $$

 )DOC");

--- a/paddle/fluid/operators/scatter_op.cu
+++ b/paddle/fluid/operators/scatter_op.cu
@@ -25,14 +25,14 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "This kernel only runs on GPU device.");
-    auto *Ref = ctx.Input<Tensor>("Ref");
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *X = ctx.Input<Tensor>("X");
+    auto *Ids = ctx.Input<Tensor>("Ids");
    auto *Updates = ctx.Input<Tensor>("Updates");
    auto *Out = ctx.Output<Tensor>("Out");

-    Out->ShareDataWith(*Ref);
+    Out->ShareDataWith(*X);

-    GPUScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
+    GPUScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
  }
 };

@@ -42,16 +42,16 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "This kernel only runs on GPU device.");
-    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Ids = ctx.Input<Tensor>("Ids");
    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));

-    // In place gradient: dRef = dO
-    dRef->ShareDataWith(*dOut);
+    // In place gradient: dX = dO
+    dX->ShareDataWith(*dOut);
    dUpdates->mutable_data<T>(ctx.GetPlace());
-    // Gradient by Gather: dUpdates = dO[Index]
-    GPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
+    // Gradient by Gather: dUpdates = dO[Ids]
+    GPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
  }
 };


--- a/paddle/fluid/operators/scatter_op.h
+++ b/paddle/fluid/operators/scatter_op.h
@@ -29,15 +29,15 @@ class ScatterOpKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                   "This kernel only runs on CPU.");
-    auto *Ref = ctx.Input<Tensor>("Ref");
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *X = ctx.Input<Tensor>("X");
+    auto *Ids = ctx.Input<Tensor>("Ids");
    auto *Updates = ctx.Input<Tensor>("Updates");
    auto *Out = ctx.Output<Tensor>("Out");

-    // In place output: Out = Ref, Out[Index] += Updates
-    Out->ShareDataWith(*Ref);
+    // In place output: Out = X, Out[Ids] += Updates
+    Out->ShareDataWith(*X);
    // Apply ScatterUpdate: Out[index] += Updates[:]
-    ScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
+    ScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
  }
 };

@@ -47,16 +47,16 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                   "This kernel only runs on CPU.");
-    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Ids = ctx.Input<Tensor>("Ids");
    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));

-    // In place gradient: dRef = dO
-    dRef->ShareDataWith(*dOut);
+    // In place gradient: dX = dO
+    dX->ShareDataWith(*dOut);
    dUpdates->mutable_data<T>(ctx.GetPlace());
-    // Gradient by Gather: dUpdates += dO[Index]
-    CPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
+    // Gradient by Gather: dUpdates += dO[Ids]
+    CPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
  }
 };


--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -31,6 +31,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/cond_op.h"
 #include "paddle/fluid/operators/net_op.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/pybind/const_value.h"
@@ -103,12 +104,14 @@ PYBIND11_PLUGIN(core) {
      .def("set", PyCPUTensorSetFromArray<double>)
      .def("set", PyCPUTensorSetFromArray<int64_t>)
      .def("set", PyCPUTensorSetFromArray<bool>)
+      .def("set", PyCPUTensorSetFromArray<uint16_t>)
 #ifdef PADDLE_WITH_CUDA
      .def("set", PyCUDATensorSetFromArray<float>)
      .def("set", PyCUDATensorSetFromArray<int>)
      .def("set", PyCUDATensorSetFromArray<double>)
      .def("set", PyCUDATensorSetFromArray<int64_t>)
      .def("set", PyCUDATensorSetFromArray<bool>)
+      .def("set", PyCUDATensorSetFromArray<uint16_t>)
 #endif
      .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
      .def("set_float_element", TensorSetElement<float>)
@@ -315,7 +318,6 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
                  });
 // clang-format on
-
 #ifdef PADDLE_WITH_CUDA
  py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
 #endif
@@ -423,6 +425,12 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("init_devices", &framework::InitDevices);

  m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+#ifdef PADDLE_WITH_CUDA
+  m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
+    // Only GPUs with Compute Capability >= 53 support float16
+    return platform::GetCUDAComputeCapability(place.device) >= 53;
+  });
+#endif

  m.def("set_feed_variable", framework::SetFeedVariable);
  m.def("get_fetch_variable", framework::GetFetchVariable);

--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/float16.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"

@@ -77,21 +78,32 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
      } else if (paddle::platform::is_cpu_place(tensor.place())) {
        dst_tensor = tensor;
      }
+
+      if (std::type_index(typeid(CUR_TYPE)) ==
+          std::type_index(typeid(platform::float16))) {
+        return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
+                               "e", /* np.dtype('e') == np.float16 */
+                               (size_t)framework::arity(dst_tensor.dims()),
+                               dims_outside, strides);
+      } else {
        return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
                               py::format_descriptor<CUR_TYPE>::format(),
                               (size_t)framework::arity(dst_tensor.dims()),
                               dims_outside, strides);
+      }
    } else {
      constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
      return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
    }
  }
 };
+
 }  // namespace details
+
 inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
  auto buffer_info =
-      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool>()(
-          tensor);
+      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
+                                  platform::float16>()(tensor);
  return buffer_info;
 }

@@ -136,6 +148,22 @@ void PyCPUTensorSetFromArray(
  std::memcpy(dst, array.data(), sizeof(T) * array.size());
 }

+// Specialization for uint16_t arrays: the tensor is resized to the array's
+// shape, allocated as platform::float16 on `place`, and the array's bytes
+// are copied in unchanged.
+// NOTE(review): assumes sizeof(platform::float16) == sizeof(uint16_t) -
+// confirm against float16.h.
+// `inline` is required because this full specialization is defined in a
+// header; without it every including translation unit emits a definition.
+template <>
+inline void PyCPUTensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::CPUPlace &place) {
+  std::vector<int64_t> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    // Keep the full 64-bit extent; a cast to int would truncate it.
+    dims.push_back(static_cast<int64_t>(array.shape()[i]));
+  }
+
+  self.Resize(framework::make_ddim(dims));
+  auto *dst = self.mutable_data<platform::float16>(place);
+  std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
+}
+
 #ifdef PADDLE_WITH_CUDA
 template <typename T>
 void PyCUDATensorSetFromArray(
@@ -157,6 +185,28 @@ void PyCUDATensorSetFromArray(
  paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
                                   cudaMemcpyHostToDevice, dev_ctx->stream());
 }
+
+// Specialization for uint16_t arrays: the tensor is resized to the array's
+// shape, allocated as platform::float16 on `place`, and the array's bytes
+// are copied host-to-device on the stream of the CUDA device context that
+// the pool returns for `place`.
+// NOTE(review): assumes sizeof(platform::float16) == sizeof(uint16_t) -
+// confirm against float16.h.
+// `inline` is required because this full specialization is defined in a
+// header; without it every including translation unit emits a definition.
+template <>
+inline void PyCUDATensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::CUDAPlace &place) {
+  std::vector<int64_t> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    // Keep the full 64-bit extent; a cast to int would truncate it.
+    dims.push_back(static_cast<int64_t>(array.shape()[i]));
+  }
+
+  self.Resize(framework::make_ddim(dims));
+  auto *dst = self.mutable_data<platform::float16>(place);
+
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto dev_ctx =
+      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
+  paddle::platform::GpuMemcpyAsync(dst, array.data(),
+                                   sizeof(uint16_t) * array.size(),
+                                   cudaMemcpyHostToDevice, dev_ctx->stream());
+}
 #endif

 }  // namespace pybind

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
-
-file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
-file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
 file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
-file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
 file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/ *.py)
-
 set(PY_FILES paddle/__init__.py
-  ${TRAINER_PY_FILES}
-  ${HELPERS_PY_FILES}
  ${UTILS_PY_FILES}
-  ${V2_PY_FILES}
  ${FLUID_PY_FILES})

-add_custom_target(copy_paddle_master)
+if(NOT WITH_FLUID)
+  file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
+  file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
+  file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
+  set(PY_FILES ${PY_FILES}
+    ${TRAINER_PY_FILES}
+    ${HELPERS_PY_FILES}
+    ${V2_PY_FILES})
+
+  add_custom_target(copy_paddle_master)

-SET(COPY_PADDLE_MASTER "")
-if(WITH_GOLANG)
+  SET(COPY_PADDLE_MASTER "")
+  if(WITH_GOLANG)
    SET(COPY_PADDLE_MASTER "copy_paddle_master")
    add_custom_command(TARGET ${COPY_PADDLE_MASTER}
      COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/
      )
    add_dependencies(copy_paddle_master paddle_master)
-endif(WITH_GOLANG)
+  endif(WITH_GOLANG)
+endif()

 set(MKL_SHARED_LIBS "")
 set(MKL_DEPENDS "")
@@ -59,23 +61,28 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
    COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
    DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})

-set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model ${MKL_DEPENDS})
-if(WITH_SWIG_PY)
+set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
+if(NOT WITH_FLUID)
+    set(paddle_python_deps ${paddle_python_deps} paddle_pserver_main paddle_trainer paddle_merge_model)
+    if(WITH_SWIG_PY)
        list(APPEND paddle_python_deps python_api_wheel)
+    endif()
 endif()
 add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})

 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)

 if (WITH_TESTING)
+  if(NOT WITH_FLUID)
    add_subdirectory(paddle/trainer_config_helpers/tests)
    if (WITH_SWIG_PY)
      # enable v2 API unittest only when paddle swig api is compiled
      add_subdirectory(paddle/v2/tests)
      add_subdirectory(paddle/v2/reader/tests)
      add_subdirectory(paddle/v2/plot/tests)
-    add_subdirectory(paddle/fluid/tests)
    endif()
+  endif()
+  add_subdirectory(paddle/fluid/tests)
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
    DESTINATION opt/paddle/share/wheels

--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -248,12 +248,15 @@ def _callback_lookup_(op):
                        if o_argu in self.param_grad_names:
                            allreduce_out_name = o_argu + "__nccl_all_reduce__"
                            op_desc = _create_op_desc_(
-                                "ncclAllReduce", {
+                                "ncclReduce",
+                                {
                                    "X": [o_argu],
                                    "Communicator":
                                    ['nccl_com__do_not_change_']
-                                }, {"Out": [allreduce_out_name]},
-                                {"reduction": "ncclSum"})
+                                },
+                                {"Out": [allreduce_out_name]},
+                                {"reduction": "ncclSum",
+                                 "root": 0}, )
                            block.desc.append_op().copy_from(op_desc)

                            op_desc = _create_op_desc_(

--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -45,31 +45,13 @@ __activations__ = [
 ]

 __all__ = [
-    'mean',
-    'mul',
-    'reshape',
-    'scale',
-    'sigmoid_cross_entropy_with_logits',
-    'elementwise_add',
-    'elementwise_div',
-    'elementwise_sub',
-    'elementwise_mul',
-    'elementwise_max',
-    'elementwise_min',
-    'elementwise_pow',
-    'clip',
-    'clip_by_norm',
-    'softmax',
-    'sequence_softmax',
-    'logical_and',
-    'logical_or',
-    'logical_xor',
-    'logical_not',
-    'uniform_random',
-    'uniform_random_batch_size_like',
-    'gaussian_random',
-    'gaussian_random_batch_size_like',
-    'cumsum',
+    'mean', 'mul', 'reshape', 'scale', 'sigmoid_cross_entropy_with_logits',
+    'elementwise_add', 'elementwise_div', 'elementwise_sub', 'elementwise_mul',
+    'elementwise_max', 'elementwise_min', 'elementwise_pow', 'clip',
+    'clip_by_norm', 'softmax', 'sequence_softmax', 'logical_and', 'logical_or',
+    'logical_xor', 'logical_not', 'uniform_random',
+    'uniform_random_batch_size_like', 'gaussian_random',
+    'gaussian_random_batch_size_like', 'cumsum', 'scatter'
 ] + __activations__

 for _OP in set(__all__):

--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
@@ -14,6 +14,7 @@

 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from op_test import OpTest


@@ -69,5 +70,42 @@ class TestMulOp2(OpTest):
            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))


+class TestFP16MulOp1(OpTest):  # output check for the "mul" op on 2-D float16 inputs
+    def setUp(self):
+        self.op_type = "mul"
+        x = np.random.random((32, 84)).astype("float16")
+        y = np.random.random((84, 100)).astype("float16")
+        self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}  # float16 data viewed as uint16; presumably how OpTest carries fp16 - confirm in op_test
+        self.outputs = {'Out': np.dot(x, y)}  # reference result computed by NumPy on the float16 arrays
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():  # on non-CUDA builds this test performs no check
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):  # also a no-op when device 0 reports no float16 support
+                self.check_output_with_place(place, atol=1e-1)
+
+
+class TestFP16MulOp2(OpTest):  # output check for the "mul" op on higher-rank float16 inputs
+    def setUp(self):
+        self.op_type = "mul"
+        x = np.random.random((15, 4, 12, 10)).astype("float16")
+        y = np.random.random((4, 30, 8, 2, 9)).astype("float16")
+        self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}  # float16 data viewed as uint16; presumably how OpTest carries fp16 - confirm in op_test
+        self.attrs = {
+            'x_num_col_dims': 2,  # the reference below flattens X to (15*4, 12*10)
+            'y_num_col_dims': 2,  # the reference below flattens Y to (4*30, 8*2*9)
+        }
+        result = np.dot(
+            x.reshape(15 * 4, 12 * 10), y.reshape(4 * 30, 8 * 2 * 9))
+        result = result.reshape(15, 4, 8, 2, 9)  # X's first two dims followed by Y's last three dims
+        self.outputs = {'Out': result}
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():  # on non-CUDA builds this test performs no check
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):  # also a no-op when device 0 reports no float16 support
+                self.check_output_with_place(place, atol=2e-1)
+
+
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_scatter_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py
@@ -25,7 +25,7 @@ class TestScatterOp(OpTest):
        updates_np = np.random.random((2, 3)).astype("float32")
        output_np = np.copy(ref_np)
        output_np[index_np] = updates_np
-        self.inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
+        self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
        self.outputs = {'Out': output_np}

    def test_check_output(self):

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -62,19 +62,21 @@ write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py')


 packages=['paddle',
-          'paddle.proto',
+          'paddle.utils',
+          'paddle.fluid',
+          'paddle.fluid.proto',
+          'paddle.fluid.proto.profiler',
+          'paddle.fluid.layers']
+
+if '${WITH_FLUID}'== 'OFF':
+    packages+=['paddle.proto',
               'paddle.trainer',
               'paddle.trainer_config_helpers',
-          'paddle.utils',
               'paddle.v2',
               'paddle.v2.dataset',
               'paddle.v2.reader',
               'paddle.v2.master',
               'paddle.v2.plot',
-          'paddle.fluid',
-          'paddle.fluid.proto',
-          'paddle.fluid.proto.profiler',
-          'paddle.fluid.layers',
               'py_paddle']

 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
@@ -84,12 +86,30 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
    setup_requires+=['opencv-python']

 # the prefix is sys.prefix which should always be usr
-paddle_bin_dir = 'opt/paddle/bin'
-paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
+paddle_bins = []  # passed to setup(scripts=...), which takes a list; stays empty unless WITH_FLUID is 'OFF'
+if '${WITH_FLUID}'== 'OFF':
+    paddle_bin_dir = 'opt/paddle/bin'
+    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
                   '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
                   '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main',
                   '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']

+package_data={'paddle.fluid': ['core.so']}
+if '${WITH_FLUID}'== 'OFF':
+    package_data['paddle.v2.master']=['libpaddle_master.so']
+    package_data['py_paddle']=['*.py','_swig_paddle.so']
+
+package_dir={
+    '': '${CMAKE_CURRENT_SOURCE_DIR}',
+    # The paddle.fluid.proto modules are generated at build time,
+    # so these packages point into the build (binary) directory instead.
+    'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
+    'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
+}
+if '${WITH_FLUID}'== 'OFF':
+    package_dir['py_paddle']='${PADDLE_SOURCE_DIR}/paddle/py_paddle'
+    
+
 paddle_rt_lib_dir = 'lib'
 paddle_rt_libs = ['${WARPCTC_LIBRARIES}']
 if '${MKL_SHARED_LIBS}'!= '':
@@ -101,19 +121,8 @@ setup(name='${PACKAGE_NAME}',
      install_requires=setup_requires,
      packages=packages,
      ext_modules=[Extension('_foo', ['stub.cc'])],
-      package_data={
-        'paddle.v2.master': ['libpaddle_master.so'],
-        'paddle.fluid': ['core.so'],
-        'py_paddle':['*.py','_swig_paddle.so']
-      },
-      package_dir={
-          '': '${CMAKE_CURRENT_SOURCE_DIR}',
-          # The paddle.fluid.proto will be generated while compiling.
-          # So that package points to other directory.
-          'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
-          'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
-          'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
-      },
+      package_data=package_data,
+      package_dir=package_dir,
      scripts=paddle_bins,
      data_files=[(paddle_rt_lib_dir, paddle_rt_libs)]
 )