diff --git a/CMakeLists.txt b/CMakeLists.txt
index c86889c05c8cf0d521dce9adbf3e918ba91729a1..0ec65bac84b0b0d89123473a8941f80c90f1b339 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,7 +53,7 @@ option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
 option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
 option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
 # TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option.
-option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" ON)
+option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" OFF)
 option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
 option(GLIDE_INSTALL "Download and install go dependencies " ON)
 option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index a7b249d43bf3ad9924749d5e66618750f19d8bf7..d2a4b1335464f553a361728e64ed5ca177ca53da 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,27 +1,29 @@
-add_subdirectory(cuda)
-add_subdirectory(function)
-add_subdirectory(utils)
-add_subdirectory(math)
-add_subdirectory(gserver)
-add_subdirectory(parameter)
-add_subdirectory(testing)
-
-if(MOBILE_INFERENCE)
-  add_subdirectory(capi)
-else()
-  add_subdirectory(pserver)
-  add_subdirectory(trainer)
-  add_subdirectory(scripts)
+if(NOT WITH_FLUID)
+  add_subdirectory(cuda)
+  add_subdirectory(function)
+  add_subdirectory(utils)
+  add_subdirectory(math)
+  add_subdirectory(gserver)
+  add_subdirectory(parameter)
 
-  if(WITH_C_API)
+  if(MOBILE_INFERENCE)
     add_subdirectory(capi)
-  endif()
+  else()
+    add_subdirectory(pserver)
+    add_subdirectory(trainer)
+    add_subdirectory(scripts)
 
-  if(NOT ANDROID AND NOT IOS)
-    add_subdirectory(fluid)
-  endif()
+    if(WITH_C_API)
+      add_subdirectory(capi)
+    endif()
 
-  if(WITH_SWIG_PY)
-    add_subdirectory(api)
+    if(WITH_SWIG_PY)
+      add_subdirectory(api)
+    endif()
   endif()
 endif()
+
+add_subdirectory(testing)
+if(NOT MOBILE_INFERENCE AND NOT ANDROID AND NOT IOS)
+  add_subdirectory(fluid)
+endif()
diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h
index 9f8fb12098d622058a86f83c1c42a1feb1cfb2e2..51e2b03f9cb9abf6b3effe4035d4eec2ba4f9fbf 100644
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -15,23 +15,43 @@ limitations under the License. */
 #pragma once
 
 #include <stddef.h>  // for size_t
+#include <condition_variable>
 #include <typeindex>
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
 
+enum class ChannelAction {
+  SEND = 0,
+  RECEIVE = 1,
+  CLOSE = 2,
+};
+
 // Channel is the abstract class of buffered and un-buffered channels.
 template <typename T>
 class Channel {
  public:
+  virtual bool CanSend() = 0;
+  virtual bool CanReceive() = 0;
   virtual bool Send(T*) = 0;
   virtual bool Receive(T*) = 0;
   virtual size_t Cap() = 0;
   virtual void Lock() = 0;
+
   virtual void Unlock() = 0;
+  virtual bool IsClosed() = 0;
   virtual void Close() = 0;
   virtual ~Channel() {}
+
+  virtual void AddToSendQ(const void* referrer, T* data,
+                          std::shared_ptr<std::condition_variable_any> cond,
+                          std::function<bool(ChannelAction)> cb) = 0;
+  virtual void AddToReceiveQ(const void* referrer, T* data,
+                             std::shared_ptr<std::condition_variable_any> cond,
+                             std::function<bool(ChannelAction)> cb) = 0;
+  virtual void RemoveFromSendQ(const void* referrer) = 0;
+  virtual void RemoveFromReceiveQ(const void* referrer) = 0;
 };
 
 // Forward declaration of channel implementations.
@@ -80,6 +100,27 @@ class ChannelHolder {
     return channel != nullptr ? channel->Receive(data) : false;
   }
 
+  bool IsClosed() {
+    if (IsInitialized()) {
+      return holder_->IsClosed();
+    }
+    return false;
+  }
+
+  bool CanSend() {
+    if (IsInitialized()) {
+      return holder_->CanSend();
+    }
+    return false;
+  }
+
+  bool CanReceive() {
+    if (IsInitialized()) {
+      return holder_->CanReceive();
+    }
+    return false;
+  }
+
   void close() {
     if (IsInitialized()) holder_->Close();
   }
@@ -97,6 +138,50 @@ class ChannelHolder {
     if (IsInitialized()) holder_->Unlock();
   }
 
+  template <typename T>
+  void AddToSendQ(const void* referrer, T* data,
+                  std::shared_ptr<std::condition_variable_any> cond,
+                  std::function<bool(ChannelAction)> cb) {
+    if (IsInitialized()) {
+      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+      if (channel != nullptr) {
+        channel->AddToSendQ(referrer, data, cond, cb);
+      }
+    }
+  }
+
+  template <typename T>
+  void AddToReceiveQ(const void* referrer, T* data,
+                     std::shared_ptr<std::condition_variable_any> cond,
+                     std::function<bool(ChannelAction)> cb) {
+    if (IsInitialized()) {
+      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+      if (channel != nullptr) {
+        channel->AddToReceiveQ(referrer, data, cond, cb);
+      }
+    }
+  }
+
+  template <typename T>
+  void RemoveFromSendQ(const void* referrer) {
+    if (IsInitialized()) {
+      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+      if (channel != nullptr) {
+        channel->RemoveFromSendQ(referrer);
+      }
+    }
+  }
+
+  template <typename T>
+  void RemoveFromReceiveQ(const void* referrer) {
+    if (IsInitialized()) {
+      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+      if (channel != nullptr) {
+        channel->RemoveFromReceiveQ(referrer);
+      }
+    }
+  }
+
   inline bool IsInitialized() const { return holder_ != nullptr; }
 
   inline const std::type_index Type() {
@@ -113,6 +198,9 @@ class ChannelHolder {
     virtual ~Placeholder() {}
     virtual const std::type_index Type() const = 0;
     virtual void* Ptr() const = 0;
+    virtual bool IsClosed() = 0;
+    virtual bool CanSend() = 0;
+    virtual bool CanReceive() = 0;
     virtual void Close() = 0;
     virtual void Lock() = 0;
     virtual void Unlock() = 0;
@@ -129,6 +217,27 @@ class ChannelHolder {
     virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }
 
+    virtual bool IsClosed() {
+      if (channel_) {
+        return channel_->IsClosed();
+      }
+      return false;
+    }
+
+    virtual bool CanSend() {
+      if (channel_) {
+        return channel_->CanSend();
+      }
+      return false;
+    }
+
+    virtual bool CanReceive() {
+      if (channel_) {
+        return channel_->CanReceive();
+      }
+      return false;
+    }
+
     virtual void Close() {
       if (channel_) channel_->Close();
     }
diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h
index a4561031fd8c49613269e7008ce558f25f9765e4..c194c03e264cccfa9ad755ea24bc6372e82bfb00 100644
--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
@@ -29,32 +29,50 @@ class ChannelImpl : public paddle::framework::Channel<T> {
   friend void paddle::framework::CloseChannel<T>(Channel<T> *);
 
  public:
+  virtual bool CanSend();
+  virtual bool CanReceive();
   virtual bool Send(T *);
   virtual bool Receive(T *);
   virtual size_t Cap() { return cap_; }
   virtual void Lock();
   virtual void Unlock();
+  virtual bool IsClosed();
   virtual void Close();
-
   ChannelImpl(size_t);
   virtual ~ChannelImpl();
 
+  virtual void AddToSendQ(const void *referrer, T *data,
+                          std::shared_ptr<std::condition_variable_any> cond,
+                          std::function<bool(ChannelAction)> cb);
+  virtual void AddToReceiveQ(const void *referrer, T *data,
+                             std::shared_ptr<std::condition_variable_any> cond,
+                             std::function<bool(ChannelAction)> cb);
+
+  virtual void RemoveFromSendQ(const void *referrer);
+  virtual void RemoveFromReceiveQ(const void *referrer);
+
  private:
   struct QueueMessage {
     T *data;
-    std::condition_variable_any cond;
+    std::shared_ptr<std::condition_variable_any> cond;
     bool chan_closed = false;
     bool completed = false;
+    const void *referrer;  // TODO(thuan): figure out better way to do this
+    std::function<bool(ChannelAction)> callback;
 
-    QueueMessage(T *item) : data(item) {}
+    QueueMessage(T *item)
+        : data(item), cond(std::make_shared<std::condition_variable_any>()) {}
+
+    QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
+        : data(item), cond(cond) {}
 
     void Wait(std::unique_lock<std::recursive_mutex> &lock) {
-      cond.wait(lock, [this]() { return completed; });
+      cond->wait(lock, [this]() { return completed; });
     }
 
     void Notify() {
       completed = true;
-      cond.notify_all();
+      cond->notify_all();
     }
   };
@@ -87,6 +105,18 @@ ChannelImpl<T>::ChannelImpl(size_t capacity)
   PADDLE_ENFORCE_GE(capacity, 0);
 }
 
+template <typename T>
+bool ChannelImpl<T>::CanSend() {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  return !closed_ && (!recvq.empty() || buf_.size() < cap_);
+}
+
+template <typename T>
+bool ChannelImpl<T>::CanReceive() {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  return !(closed_ && buf_.empty()) && (!sendq.empty() || buf_.size() > 0);
+}
+
 template <typename T>
 bool ChannelImpl<T>::Send(T *item) {
   send_ctr++;
@@ -105,7 +135,24 @@ bool ChannelImpl<T>::Send(T *item) {
     std::shared_ptr<QueueMessage> m = recvq.front();
     recvq.pop_front();
     // Do the data transfer
-    *(m->data) = std::move(*item);
+    // We will do this data transfer if either of the following
+    // cases are true
+    // 1. callback == nullptr // This means it was a regular channel send
+    // 2. callback returns true
+    bool do_send = true;
+    if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND);
+    if (do_send)
+      *(m->data) = std::move(*item);
+    else
+      // We cannot do the data transfer because
+      // this QueueMessage was added by Select
+      // and some other case was executed.
+      // So call the Send function again.
+      // We do not care about notifying other
+      // because they would have been notified
+      // by the executed select case.
+      return Send(item);
+
     // Wake up the blocked process and unlock
     m->Notify();
     lock.unlock();
@@ -150,7 +197,25 @@ bool ChannelImpl<T>::Receive(T *item) {
     std::shared_ptr<QueueMessage> m = sendq.front();
     sendq.pop_front();
     // Do the data transfer
-    *item = std::move(*(m->data));
+    // We will do this data transfer if either of the following
+    // cases are true
+    // 1. callback == nullptr // This means it was a regular channel send
+    // 2. callback returns true
+    bool do_receive = true;
+    if (m->callback != nullptr)
+      do_receive = m->callback(ChannelAction::RECEIVE);
+    if (do_receive)
+      *item = std::move(*(m->data));
+    else
+      // We cannot do the data transfer because
+      // this QueueMessage was added by Select
+      // and some other case was executed.
+      // So call the Receive function again.
+      // We do not care about notifying other
+      // because they would have been notified
+      // by the executed select case.
+      return Receive(item);
+
     // Wake up the blocked process and unlock
     m->Notify();
     lock.unlock();
@@ -186,6 +251,12 @@ void ChannelImpl<T>::Unlock() {
   mu_.unlock();
 }
 
+template <typename T>
+bool ChannelImpl<T>::IsClosed() {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  return closed_;
+}
+
 template <typename T>
 void ChannelImpl<T>::Close() {
   std::unique_lock<std::recursive_mutex> lock{mu_};
@@ -203,6 +274,12 @@ void ChannelImpl<T>::Close() {
     std::shared_ptr<QueueMessage> m = recvq.front();
     recvq.pop_front();
     m->chan_closed = true;
+
+    // Execute callback function (if any)
+    if (m->callback != nullptr) {
+      m->callback(ChannelAction::CLOSE);
+    }
+
     m->Notify();
   }
 
@@ -211,10 +288,72 @@ void ChannelImpl<T>::Close() {
     std::shared_ptr<QueueMessage> m = sendq.front();
     sendq.pop_front();
     m->chan_closed = true;
+
+    // Execute callback function (if any)
+    if (m->callback != nullptr) {
+      m->callback(ChannelAction::CLOSE);
+    }
+
     m->Notify();
   }
 }
 
+template <typename T>
+void ChannelImpl<T>::AddToSendQ(
+    const void *referrer, T *data,
+    std::shared_ptr<std::condition_variable_any> cond,
+    std::function<bool(ChannelAction)> cb) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  auto m = std::make_shared<QueueMessage>(data, cond);
+  m->referrer = referrer;
+  m->callback = cb;
+  sendq.push_back(m);
+}
+
+template <typename T>
+void ChannelImpl<T>::AddToReceiveQ(
+    const void *referrer, T *data,
+    std::shared_ptr<std::condition_variable_any> cond,
+    std::function<bool(ChannelAction)> cb) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  auto m = std::make_shared<QueueMessage>(data, cond);
+  m->referrer = referrer;
+  m->callback = cb;
+  recvq.push_back(m);
+}
+
+template <typename T>
+void ChannelImpl<T>::RemoveFromSendQ(const void *referrer) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+
+  for (auto it = sendq.begin(); it != sendq.end();) {
+    std::shared_ptr<QueueMessage> sendMsg = (std::shared_ptr<QueueMessage>)*it;
+
+    if (sendMsg->referrer == referrer) {
+      it = sendq.erase(it);
+      send_ctr--;
+    } else {
+      ++it;
+    }
+  }
+}
+
+template <typename T>
+void ChannelImpl<T>::RemoveFromReceiveQ(const void *referrer) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+
+  for (auto it = recvq.begin(); it != recvq.end();) {
+    std::shared_ptr<QueueMessage> recvMsg = (std::shared_ptr<QueueMessage>)*it;
+
+    if (recvMsg->referrer == referrer) {
+      it = recvq.erase(it);
+      recv_ctr--;
+    } else {
+      ++it;
+    }
+  }
+}
+
 template <typename T>
 ChannelImpl<T>::~ChannelImpl() {
   Close();
diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc
index 39ae3c0040d04a6d901f1d6c992d547a6778c28e..d372213e1b6008b0c4227103dd40730f86a84301 100644
--- a/paddle/fluid/operators/assign_op.cc
+++ b/paddle/fluid/operators/assign_op.cc
@@ -56,6 +56,7 @@ class AssignFunctor {
  private:
   void copy_tensor(const framework::LoDTensor &lod_tensor,
                    framework::LoDTensor *out) const {
+    if (lod_tensor.numel() == 0) return;
     auto &out_tensor = *out;
     TensorCopy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor);
     out_tensor.set_lod(lod_tensor.lod());
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index e7bed2c39735b66c19e738c91f4977e46571143b..90af1e2d602ac039b4d98a69a889ff8b1b85ffc6 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -17,11 +17,14 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using framework::OpKernelType;
 using framework::Tensor;
 
-class MulOpShapeInference : public framework::InferShapeBase {
+class MulOp : public framework::OperatorWithKernel {
  public:
-  void operator()(framework::InferShapeContext* ctx) const override {
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -122,7 +125,7 @@ or not. But the output only shares the LoD information with input $X$.
   }
 };
 
-class MulOpGrad : public framework::OperatorWithKernel {
+class MulGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -156,10 +159,7 @@ class MulOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker,
-                  ops::MulOpShapeInference,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(mul_grad, ops::MulOpGrad);
+REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulGradOp);
 REGISTER_OP_CPU_KERNEL(
     mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc
index 0667530e943856576ae8c9fe4856cb6aa1448e4e..757f9c3ee2665c7ac654659416fe8dd727dca16d 100644
--- a/paddle/fluid/operators/mul_op.cu.cc
+++ b/paddle/fluid/operators/mul_op.cu.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/mul_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    mul_grad, ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
+                        ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(mul_grad,
+                        ops::MulGradKernel<plat::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h
index 38311cf87265ad0f1f815734cbf69bd682d62e62..b1260d36ebe11f65529ac274c959479dcb38ee5f 100644
--- a/paddle/fluid/operators/mul_op.h
+++ b/paddle/fluid/operators/mul_op.h
@@ -48,7 +48,7 @@ class MulKernel : public framework::OpKernel<T> {
     }
     math::matmul<DeviceContext, T>(
         context.template device_context<DeviceContext>(), x_matrix, false,
-        y_matrix, false, 1, z, 0);
+        y_matrix, false, static_cast<T>(1), z, static_cast<T>(0));
     if (z_dim.size() != 2) {
       z->Resize(z_dim);
     }
diff --git a/paddle/fluid/operators/nccl_op.cu.cc b/paddle/fluid/operators/nccl_op.cu.cc
index 4d83a70e7334a84bb98bd52f0172f6b7ecedb58d..ad623e1fe0f8941615b671a0c20bd3637ae6d407 100644
--- a/paddle/fluid/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
@@ -106,6 +106,8 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
     T* recvbuffer = nullptr;
     if (root == gpu_id) {
       recvbuffer = out->mutable_data<T>(ctx.GetPlace());
+    } else {
+      out->Resize(framework::make_ddim({0}));
     }
     VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
             << " recv " << out->numel();
diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc
index 3fb8b56d2676f90ff7e1cefa46c459ee37f63ca8..d6fd6214711f4ee66b1daffa4db2e84aa7201e79 100644
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -23,24 +23,24 @@ class ScatterOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Ref"),
-                   "Input(Ref) of ScatterOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Index"),
-                   "Input(Index) of ScatterOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ScatterOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Ids"),
+                   "Input(Ids) of ScatterOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Updates"),
                    "Input(Updates) of ScatterOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of ScatterOp should not be null.");
 
     auto updates_dims = ctx->GetInputDim("Updates");
-    auto ref_dims = ctx->GetInputDim("Ref");
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Index").size(), 1,
-                      "Update Index should be 1-D.");
+    auto ref_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Ids").size(), 1,
+                      "Update Ids should be 1-D.");
     PADDLE_ENFORCE_EQ(ref_dims.size(), updates_dims.size(),
-                      "Reference and Updates should have the same shape size");
+                      "X and Updates should have the same shape size");
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0],
-                      ctx->GetInputDim("Index")[0],
-                      "Updates and Index should have same batch-size.");
+                      ctx->GetInputDim("Ids")[0],
+                      "Updates and Ids should have same batch-size.");
     framework::DDim data_dim(updates_dims);
     for (int i = 1; i < data_dim.size(); ++i) {
       PADDLE_ENFORCE_EQ(data_dim[i], updates_dims[i]);
@@ -52,7 +52,7 @@ class ScatterOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
         ctx.device_context());
   }
 };
@@ -64,14 +64,14 @@ class ScatterGradOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim(framework::GradVarName("Updates"),
                       ctx->GetInputDim("Updates"));
-    ctx->SetOutputDim(framework::GradVarName("Ref"), ctx->GetInputDim("Ref"));
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
         ctx.device_context());
   }
 };
@@ -80,9 +80,8 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Ref", "The source input of scatter op");
-    AddInput("Index",
-             "The index input of scatter op where Ref will be updated");
+    AddInput("X", "The source input of scatter op");
+    AddInput("Ids", "The index input of scatter op where X will be updated");
     AddInput("Updates", "The updated value of updates op");
     AddOutput("Out", "The output of add op");
     AddComment(R"DOC(
@@ -91,8 +90,8 @@ Scatter Operator.
 This operator obtains output by updating the input on selected indices on the first axis:
 
 $$
-Out = Ref \\
-Out[Index] = Ref[Index] + Updates
+Out = X \\
+Out[Ids] = X[Ids] + Updates
 $$
 
 )DOC");
diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu
index bdabb29fa680f8f87873b4381acf0dbd2b6195d0..ef7d700659d8d713715a10910baf739954ba0786 100644
--- a/paddle/fluid/operators/scatter_op.cu
+++ b/paddle/fluid/operators/scatter_op.cu
@@ -25,14 +25,14 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
-    auto *Ref = ctx.Input<Tensor>("Ref");
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *X = ctx.Input<Tensor>("X");
+    auto *Ids = ctx.Input<Tensor>("Ids");
     auto *Updates = ctx.Input<Tensor>("Updates");
     auto *Out = ctx.Output<Tensor>("Out");
 
-    Out->ShareDataWith(*Ref);
+    Out->ShareDataWith(*X);
 
-    GPUScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
+    GPUScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
   }
 };
 
@@ -42,16 +42,16 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
-    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Ids = ctx.Input<Tensor>("Ids");
     auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
-    // In place gradient: dRef = dO
-    dRef->ShareDataWith(*dOut);
+    // In place gradient: dX = dO
+    dX->ShareDataWith(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
-    // Gradient by Gather: dUpdates = dO[Index]
-    GPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
+    // Gradient by Gather: dUpdates = dO[Ids]
+    GPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
   }
 };
diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h
index 3c6e7ece320229e1a311ef6d7a27387d40be3c2a..2151d8a9240fc88966533f4a07d5cf56b6c1c3bc 100644
--- a/paddle/fluid/operators/scatter_op.h
+++ b/paddle/fluid/operators/scatter_op.h
@@ -29,15 +29,15 @@ class ScatterOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                    "This kernel only runs on CPU.");
-    auto *Ref = ctx.Input<Tensor>("Ref");
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *X = ctx.Input<Tensor>("X");
+    auto *Ids = ctx.Input<Tensor>("Ids");
     auto *Updates = ctx.Input<Tensor>("Updates");
     auto *Out = ctx.Output<Tensor>("Out");
 
-    // In place output: Out = Ref, Out[Index] += Updates
-    Out->ShareDataWith(*Ref);
+    // In place output: Out = X, Out[Ids] += Updates
+    Out->ShareDataWith(*X);
     // Apply ScatterUpdate: Out[index] += Updates[:]
-    ScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
+    ScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
   }
 };
 
@@ -47,16 +47,16 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                    "This kernel only runs on CPU.");
-    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Ids = ctx.Input<Tensor>("Ids");
     auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
-    // In place gradient: dRef = dO
-    dRef->ShareDataWith(*dOut);
+    // In place gradient: dX = dO
+    dX->ShareDataWith(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
-    // Gradient by Gather: dUpdates += dO[Index]
-    CPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
+    // Gradient by Gather: dUpdates += dO[Ids]
+    CPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
   }
 };
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index d2e883caccdd34a9d662f06b83cf9a71d3d4a51e..6c05442466f5f3d8e04a8f0a2206443b1007a107 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -31,6 +31,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/cond_op.h"
 #include "paddle/fluid/operators/net_op.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/pybind/const_value.h"
@@ -103,12 +104,14 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCPUTensorSetFromArray<double>)
       .def("set", PyCPUTensorSetFromArray<int64_t>)
       .def("set", PyCPUTensorSetFromArray<bool>)
+      .def("set", PyCPUTensorSetFromArray<uint16_t>)
 #ifdef PADDLE_WITH_CUDA
       .def("set", PyCUDATensorSetFromArray<float>)
       .def("set", PyCUDATensorSetFromArray<int>)
       .def("set", PyCUDATensorSetFromArray<double>)
       .def("set", PyCUDATensorSetFromArray<int64_t>)
       .def("set", PyCUDATensorSetFromArray<bool>)
+      .def("set", PyCUDATensorSetFromArray<uint16_t>)
 #endif
       .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
       .def("set_float_element", TensorSetElement)
@@ -315,7 +318,6 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
       });
   // clang-format on
-
 #ifdef PADDLE_WITH_CUDA
   py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
 #endif
@@ -423,6 +425,12 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("init_devices", &framework::InitDevices);
 
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+#ifdef PADDLE_WITH_CUDA
+  m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
+    // Only GPUs with Compute Capability >= 53 support float16
+    return platform::GetCUDAComputeCapability(place.device) >= 53;
+  });
+#endif
 
   m.def("set_feed_variable", framework::SetFeedVariable);
   m.def("get_fetch_variable", framework::GetFetchVariable);
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 1b0916ea0370d95a0c7dd149ee3f7b294c5e2351..3b206f2f87abe01363fb7e61c319559c6dd24594 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/float16.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 
@@ -77,21 +78,32 @@ struct CastToPyBufferImpl {
       } else if (paddle::platform::is_cpu_place(tensor.place())) {
         dst_tensor = tensor;
       }
-      return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
-                             py::format_descriptor<CUR_TYPE>::format(),
-                             (size_t)framework::arity(dst_tensor.dims()),
-                             dims_outside, strides);
+
+      if (std::type_index(typeid(CUR_TYPE)) ==
+          std::type_index(typeid(platform::float16))) {
+        return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
+                               "e", /* np.dtype('e') == np.float16 */
+                               (size_t)framework::arity(dst_tensor.dims()),
+                               dims_outside, strides);
+      } else {
+        return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
+                               py::format_descriptor<CUR_TYPE>::format(),
+                               (size_t)framework::arity(dst_tensor.dims()),
+                               dims_outside, strides);
+      }
     } else {
       constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
       return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
     }
   }
 };
+
 }  // namespace details
+
 inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
   auto buffer_info =
-      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool>()(
-          tensor);
+      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
+                                  platform::float16>()(tensor);
   return buffer_info;
 }
 
@@ -136,6 +148,22 @@ void PyCPUTensorSetFromArray(
   std::memcpy(dst, array.data(), sizeof(T) * array.size());
 }
 
+template <>
+void PyCPUTensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::CPUPlace &place) {
+  std::vector<int64_t> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back((int)array.shape()[i]);
+  }
+
+  self.Resize(framework::make_ddim(dims));
+  auto *dst = self.mutable_data<platform::float16>(place);
+  std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
+}
+
 #ifdef PADDLE_WITH_CUDA
 template <typename T>
 void PyCUDATensorSetFromArray(
@@ -157,6 +185,28 @@ void PyCUDATensorSetFromArray(
   paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
                                    cudaMemcpyHostToDevice, dev_ctx->stream());
 }
+
+template <>
+void PyCUDATensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::CUDAPlace &place) {
+  std::vector<int64_t> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back((int)array.shape()[i]);
+  }
+
+  self.Resize(framework::make_ddim(dims));
+  auto *dst = self.mutable_data<platform::float16>(place);
+
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto dev_ctx =
+      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
+  paddle::platform::GpuMemcpyAsync(dst, array.data(),
+                                   sizeof(uint16_t) * array.size(),
+                                   cudaMemcpyHostToDevice, dev_ctx->stream());
+}
 #endif
 
 }  // namespace pybind
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 6e24cbdd3f6a4f05c1691dc643d880f6f454429d..90c2dfbba78418fb7b731f5363017d70577b1ae5 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1,27 +1,29 @@
-
-file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
-file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
 file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
-file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
 file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/ *.py)
-
 set(PY_FILES paddle/__init__.py
-  ${TRAINER_PY_FILES}
-  ${HELPERS_PY_FILES}
   ${UTILS_PY_FILES}
-  ${V2_PY_FILES}
   ${FLUID_PY_FILES})
 
-add_custom_target(copy_paddle_master)
+if(NOT WITH_FLUID)
+  file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
+  file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
+  file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
+  set(PY_FILES ${PY_FILES}
+    ${TRAINER_PY_FILES}
+    ${HELPERS_PY_FILES}
+    ${V2_PY_FILES})
 
-SET(COPY_PADDLE_MASTER "")
-if(WITH_GOLANG)
-  SET(COPY_PADDLE_MASTER "copy_paddle_master")
-  add_custom_command(TARGET ${COPY_PADDLE_MASTER}
-    COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/
-  )
-  add_dependencies(copy_paddle_master paddle_master)
-endif(WITH_GOLANG)
+  add_custom_target(copy_paddle_master)
+
+  SET(COPY_PADDLE_MASTER "")
+  if(WITH_GOLANG)
+    SET(COPY_PADDLE_MASTER "copy_paddle_master")
+    add_custom_command(TARGET ${COPY_PADDLE_MASTER}
+      COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/
+    )
+    add_dependencies(copy_paddle_master paddle_master)
+  endif(WITH_GOLANG)
+endif()
 
 set(MKL_SHARED_LIBS "")
 set(MKL_DEPENDS "")
@@ -59,23 +61,28 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
     DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 
-set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model ${MKL_DEPENDS})
-if(WITH_SWIG_PY)
-  list(APPEND paddle_python_deps python_api_wheel)
+set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
+if(NOT WITH_FLUID)
+  set(paddle_python_deps ${paddle_python_deps} paddle_pserver_main paddle_trainer paddle_merge_model)
+  if(WITH_SWIG_PY)
+    list(APPEND paddle_python_deps python_api_wheel)
+  endif()
 endif()
 add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
 
 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
 
 if (WITH_TESTING)
-  add_subdirectory(paddle/trainer_config_helpers/tests)
-  if (WITH_SWIG_PY)
-    # enable v2 API unittest only when paddle swig api is compiled
-    add_subdirectory(paddle/v2/tests)
-    add_subdirectory(paddle/v2/reader/tests)
-    add_subdirectory(paddle/v2/plot/tests)
-    add_subdirectory(paddle/fluid/tests)
+  if(NOT WITH_FLUID)
+    add_subdirectory(paddle/trainer_config_helpers/tests)
+    if (WITH_SWIG_PY)
+      # enable v2 API unittest only when paddle swig api is compiled
+      add_subdirectory(paddle/v2/tests)
+      add_subdirectory(paddle/v2/reader/tests)
+      add_subdirectory(paddle/v2/plot/tests)
+    endif()
   endif()
+  add_subdirectory(paddle/fluid/tests)
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
     DESTINATION opt/paddle/share/wheels
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index b6f20daee3a585777a23255355f0a0e31328d23f..7af6ed1463ab737e871da487f2a687301652ef2d 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -248,12 +248,15 @@ def _callback_lookup_(op):
                     if o_argu in self.param_grad_names:
                         allreduce_out_name = o_argu + "__nccl_all_reduce__"
                         op_desc = _create_op_desc_(
-                            "ncclAllReduce", {
+                            "ncclReduce",
+                            {
                                 "X": [o_argu],
                                 "Communicator": ['nccl_com__do_not_change_']
-                            }, {"Out": [allreduce_out_name]},
-                            {"reduction": "ncclSum"})
+                            },
+                            {"Out": [allreduce_out_name]},
+                            {"reduction": "ncclSum",
+                             "root": 0}, )
                         block.desc.append_op().copy_from(op_desc)
                         op_desc = _create_op_desc_(
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 0b88b639629ac73b16ec36aa5930c3d6a9665943..14ad18d5085fb945646818cc679f088a43806a70 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -45,31 +45,13 @@ __activations__ = [
 ]
 
 __all__ = [
-    'mean',
-    'mul',
-    'reshape',
-    'scale',
-    'sigmoid_cross_entropy_with_logits',
-    'elementwise_add',
-    'elementwise_div',
-    'elementwise_sub',
-    'elementwise_mul',
-    'elementwise_max',
-    'elementwise_min',
-    'elementwise_pow',
-    'clip',
-    'clip_by_norm',
-    'softmax',
-    'sequence_softmax',
-    'logical_and',
-    'logical_or',
-    'logical_xor',
-    'logical_not',
-    'uniform_random',
-    'uniform_random_batch_size_like',
-    'gaussian_random',
-    'gaussian_random_batch_size_like',
-    'cumsum',
+    'mean', 'mul', 'reshape', 'scale', 'sigmoid_cross_entropy_with_logits',
+    'elementwise_add', 'elementwise_div', 'elementwise_sub', 'elementwise_mul',
+    'elementwise_max', 'elementwise_min', 'elementwise_pow', 'clip',
+    'clip_by_norm', 'softmax', 'sequence_softmax', 'logical_and', 'logical_or',
+    'logical_xor', 'logical_not', 'uniform_random',
+    'uniform_random_batch_size_like', 'gaussian_random',
+    'gaussian_random_batch_size_like', 'cumsum', 'scatter'
 ] + __activations__
 
 for _OP in set(__all__):
diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py
index 9d1da420c7f70bd2a89d183a5f0a2b145f0ff475..40440bea1267112b84b66002a0bf921be3029265 100644
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
@@ -14,6 +14,7 @@
 
 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from op_test import OpTest
 
 
@@ -69,5 +70,42 @@ class TestMulOp2(OpTest):
             ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
 
 
+class TestFP16MulOp1(OpTest):
+    def setUp(self):
+        self.op_type = "mul"
+        x = np.random.random((32, 84)).astype("float16")
+        y = np.random.random((84, 100)).astype("float16")
+        self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
+        self.outputs = {'Out': np.dot(x, y)}
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-1)
+
+
+class TestFP16MulOp2(OpTest):
+    def setUp(self):
+        self.op_type = "mul"
+        x = np.random.random((15, 4, 12, 10)).astype("float16")
+        y = np.random.random((4, 30, 8, 2, 9)).astype("float16")
+        self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
+        self.attrs = {
+            'x_num_col_dims': 2,
+            'y_num_col_dims': 2,
+        }
+        result = np.dot(
+            x.reshape(15 * 4, 12 * 10), y.reshape(4 * 30, 8 * 2 * 9))
+        result = result.reshape(15, 4, 8, 2, 9)
+        self.outputs = {'Out': result}
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-1)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py
index bb02a40d449860cf6c0576662e79a5e36e6e0635..fb1728743630b3ea908ae835444eff7fd71b72c8 100644
--- a/python/paddle/fluid/tests/unittests/test_scatter_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py
@@ -25,7 +25,7 @@ class TestScatterOp(OpTest):
         updates_np = np.random.random((2, 3)).astype("float32")
         output_np = np.copy(ref_np)
         output_np[index_np] = updates_np
-        self.inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
+        self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
         self.outputs = {'Out': output_np}
 
     def test_check_output(self):
diff --git a/python/setup.py.in b/python/setup.py.in
index f830039a3af581d593d510326f15139377cb25f1..4cb5409524457b7bc5a99c88a0dbbfc8834923fa 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -62,20 +62,22 @@ write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py')
 
 packages=['paddle',
-          'paddle.proto',
-          'paddle.trainer',
-          'paddle.trainer_config_helpers',
           'paddle.utils',
-          'paddle.v2',
-          'paddle.v2.dataset',
-          'paddle.v2.reader',
-          'paddle.v2.master',
-          'paddle.v2.plot',
           'paddle.fluid',
           'paddle.fluid.proto',
           'paddle.fluid.proto.profiler',
-          'paddle.fluid.layers',
-          'py_paddle']
+          'paddle.fluid.layers']
+
+if '${WITH_FLUID}'== 'OFF':
+    packages+=['paddle.proto',
+               'paddle.trainer',
+               'paddle.trainer_config_helpers',
+               'paddle.v2',
+               'paddle.v2.dataset',
+               'paddle.v2.reader',
+               'paddle.v2.master',
+               'paddle.v2.plot',
+               'py_paddle']
 
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
     setup_requires = f.read().splitlines()
@@ -84,11 +86,29 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
     setup_requires+=['opencv-python']
 
 # the prefix is sys.prefix which should always be usr
-paddle_bin_dir = 'opt/paddle/bin'
-paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
-               '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
-               '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main',
-               '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
+paddle_bins = ''
+if '${WITH_FLUID}'== 'OFF':
+    paddle_bin_dir = 'opt/paddle/bin'
+    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
+                   '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
+                   '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main',
+                   '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
+
+package_data={'paddle.fluid': ['core.so']}
+if '${WITH_FLUID}'== 'OFF':
+    package_data['paddle.v2.master']=['libpaddle_master.so']
+    package_data['py_paddle']=['*.py','_swig_paddle.so']
+
+package_dir={
+    '': '${CMAKE_CURRENT_SOURCE_DIR}',
+    # The paddle.fluid.proto will be generated while compiling.
+    # So that package points to other directory.
+    'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
+    'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
+}
+if '${WITH_FLUID}'== 'OFF':
+    package_dir['py_paddle']='${PADDLE_SOURCE_DIR}/paddle/py_paddle'
+
 paddle_rt_lib_dir = 'lib'
 paddle_rt_libs = ['${WARPCTC_LIBRARIES}']
 
@@ -101,19 +121,8 @@ setup(name='${PACKAGE_NAME}',
       install_requires=setup_requires,
       packages=packages,
       ext_modules=[Extension('_foo', ['stub.cc'])],
-      package_data={
-          'paddle.v2.master': ['libpaddle_master.so'],
-          'paddle.fluid': ['core.so'],
-          'py_paddle':['*.py','_swig_paddle.so']
-      },
-      package_dir={
-          '': '${CMAKE_CURRENT_SOURCE_DIR}',
-          # The paddle.fluid.proto will be generated while compiling.
-          # So that package points to other directory.
-          'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
-          'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
-          'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
-      },
+      package_data=package_data,
+      package_dir=package_dir,
       scripts=paddle_bins,
       data_files=[(paddle_rt_lib_dir, paddle_rt_libs)]
 )