From 7620efdf1c1a6ec593c63ed017530e3fe8580f72 Mon Sep 17 00:00:00 2001
From: Yibing Liu
Date: Sat, 23 Sep 2017 13:55:47 +0800
Subject: [PATCH] combine gpu&cpu code in multiplex_op

---
 paddle/operators/multiplex_op.cc | 26 +++++-----
 paddle/operators/multiplex_op.cu | 70 +++------------------------
 paddle/operators/multiplex_op.h  | 81 +++++++++++++++++++++++++-------
 3 files changed, 81 insertions(+), 96 deletions(-)

diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc
index 03559d0643c..6b22c782fe2 100644
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -22,10 +22,7 @@ using LoDTensor = framework::LoDTensor;
 
 class MultiplexOp : public framework::OperatorWithKernel {
  public:
-  MultiplexOp(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
@@ -64,12 +61,12 @@ class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
 Multiplex multiple tensors according to the index provided by the first
 input tensor.
 
-ins[0]: the index of the tensor to output of size batchSize.
-ins[1:N]: the candidate output tensor.
+ins[0]: the index tensor.
+ins[1:N]: the candidate output tensors.
 
 For each index i from 0 to batchSize - 1, the output is the i-th row of
 the (index[i] + 1)-th tensor.
 
-For each i-th row of output:
+For i-th row of the output tensor:
 
 y[i][j] = x_{k}[i][j], j = 0,1, ... , (x_{1}.width - 1)
@@ -82,11 +79,7 @@ and `k = x_{0}[i] + 1`.
 
 class MultiplexGradOp : public framework::OperatorWithKernel {
  public:
-  MultiplexGradOp(const std::string &type,
-                  const framework::VariableNameMap &inputs,
-                  const framework::VariableNameMap &outputs,
-                  const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
@@ -98,7 +91,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
                             "Input(Out@GRAD) shouldn't be null.");
     auto d_ins = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
     auto ins = ctx.MultiInput<Tensor>("X");
-    // don;t compute gradient for index
+    // don't compute gradient for index (ins[0])
     for (size_t i = 1; i < ins.size(); i++) {
       if (d_ins[i]) {
         d_ins[i]->Resize(ins[i]->dims());
@@ -113,5 +106,8 @@
 namespace ops = paddle::operators;
 REGISTER_OP(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, multiplex_grad,
             ops::MultiplexGradOp);
-REGISTER_OP_CPU_KERNEL(multiplex, ops::MultiplexCPUKernel<float>);
-REGISTER_OP_CPU_KERNEL(multiplex_grad, ops::MultiplexGradCPUKernel<float>);
+REGISTER_OP_CPU_KERNEL(multiplex,
+                       ops::MultiplexKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    multiplex_grad,
+    ops::MultiplexGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu
index 055e13d1834..3d219389ba5 100644
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
@@ -13,70 +13,12 @@ limitations under the License.
*/ #include "paddle/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -class MultiplexGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto rows = ins[1]->dims()[0]; - auto cols = ins[1]->dims()[1]; - // copy index to cpu - Tensor index_t_cpu; - index_t_cpu.CopyFrom(*(ins[0]), paddle::platform::CPUPlace()); - auto index = index_t_cpu.data(); - for (auto i = 0; i < rows; i++) { - int k = (int)index[i] + 1; - cudaMemcpy(out->data() + i * cols, ins[k]->data() + i * cols, - cols * sizeof(T), cudaMemcpyDeviceToDevice); - } - } -}; - -template -class MultiplexGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto ins = ctx.MultiInput("X"); - auto d_ins = ctx.MultiOutput(framework::GradVarName("X")); - for (size_t i = 1; i < d_ins.size(); ++i) { - if (d_ins[i]) { - d_ins[i]->mutable_data(ctx.GetPlace()); - auto dims = d_ins[i]->dims(); - cudaMemset(d_ins[i]->data(), 0, - framework::product(dims) * sizeof(T)); - } - } - - auto rows = ins[1]->dims()[0]; - auto cols = ins[1]->dims()[1]; - // copy index to cpu - Tensor index_t_cpu; - index_t_cpu.CopyFrom(*(ins[0]), paddle::platform::CPUPlace()); - auto index = index_t_cpu.data(); - for (auto i = 0; i < rows; i++) { - int k = (int)index[i] + 1; - if (d_ins[k]) { - cudaMemcpy(d_ins[k]->data() + i * cols, d_out->data() + i * cols, - cols * sizeof(T), cudaMemcpyDeviceToDevice); - } - } - } -}; -} // namespace operators -} // namespace paddle +#include "paddle/operators/multiplex_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(multiplex, ops::MultiplexGPUKernel); -REGISTER_OP_GPU_KERNEL(multiplex_grad, ops::MultiplexGradGPUKernel); +REGISTER_OP_GPU_KERNEL(multiplex, + ops::MultiplexKernel); +REGISTER_OP_GPU_KERNEL( + multiplex_grad, + ops::MultiplexGradKernel); diff --git a/paddle/operators/multiplex_op.h b/paddle/operators/multiplex_op.h index 82b4a2c4c75..dcc01d0f981 100644 --- a/paddle/operators/multiplex_op.h +++ b/paddle/operators/multiplex_op.h @@ -17,31 +17,56 @@ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/memory/memcpy.h" namespace paddle { namespace operators { -template -class MultiplexCPUKernel : public framework::OpKernel { +template +class MultiplexKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { auto ins = ctx.MultiInput("X"); auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); - auto index = ins[0]->data(); auto rows = ins[1]->dims()[0]; auto cols = ins[1]->dims()[1]; - for (auto i = 0; i < rows; i++) { - int k = (int)index[i] + 1; - memcpy(out->data() + i * cols, ins[k]->data() + i * cols, - cols * sizeof(T)); + if (platform::is_cpu_place(ctx.GetPlace())) { + auto* index = ins[0]->data(); + platform::CPUPlace place = boost::get(ctx.GetPlace()); + for (auto i = 0; i < rows; i++) { + int k = (int)index[i] + 1; + PADDLE_ENFORCE_LT(k, ins.size(), + "index exceeds the number of candidate tensors."); + memory::Copy(place, out->data() + i * cols, place, + ins[k]->data() + i * cols, cols * sizeof(T)); + } + } else { +#ifndef PADDLE_ONLY_CPU + // copy index to cpu + framework::Tensor index_t_cpu; + 
+      index_t_cpu.CopyFrom<T>(*(ins[0]), platform::CPUPlace());
+      auto* index = index_t_cpu.data<T>();
+      auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                        ctx.device_context())
+                        .stream();
+      platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace());
+      for (auto i = 0; i < rows; i++) {
+        int k = (int)index[i] + 1;
+        PADDLE_ENFORCE_LT(k, ins.size(),
+                          "index exceeds the number of candidate tensors.");
+        memory::Copy(place, out->data<T>() + i * cols, place,
+                     ins[k]->data<T>() + i * cols, cols * sizeof(T), stream);
+      }
+#endif
     }
   }
 };
 
-template <typename T>
-class MultiplexGradCPUKernel : public framework::OpKernel {
+template <typename Place, typename T>
+class MultiplexGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
@@ -51,20 +76,42 @@ class MultiplexGradCPUKernel : public framework::OpKernel {
     for (size_t i = 1; i < d_ins.size(); i++) {
       if (d_ins[i]) {
         d_ins[i]->mutable_data<T>(ctx.GetPlace());
-        auto dims = d_ins[i]->dims();
-        memset(d_ins[i]->data<T>(), 0, framework::product(dims) * sizeof(T));
+        auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
+        t.device(ctx.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
       }
     }
 
-    auto index = ins[0]->data<T>();
     auto rows = ins[1]->dims()[0];
     auto cols = ins[1]->dims()[1];
-    for (auto i = 0; i < rows; i++) {
-      int k = (int)index[i] + 1;
-      if (d_ins[k]) {
-        memcpy(d_ins[k]->data<T>() + i * cols, d_out->data<T>() + i * cols,
-               cols * sizeof(T));
+    if (platform::is_cpu_place(ctx.GetPlace())) {
+      auto* index = ins[0]->data<T>();
+      platform::CPUPlace place = boost::get<platform::CPUPlace>(ctx.GetPlace());
+      for (auto i = 0; i < rows; i++) {
+        int k = (int)index[i] + 1;
+        if (d_ins[k]) {
+          memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
+                       d_out->data<T>() + i * cols, cols * sizeof(T));
+        }
+      }
+    } else {
+#ifndef PADDLE_ONLY_CPU
+      // copy index to cpu
+      framework::Tensor index_t_cpu;
+      index_t_cpu.CopyFrom<T>(*(ins[0]), platform::CPUPlace());
+      auto* index = index_t_cpu.data<T>();
+
+      auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                        ctx.device_context())
+                        .stream();
+      platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace());
+      for (auto i = 0; i < rows; i++) {
+        int k = (int)index[i] + 1;
+        if (d_ins[k]) {
+          memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
+                       d_out->data<T>() + i * cols, cols * sizeof(T), stream);
+        }
       }
+#endif
     }
   }
 };
-- 
GitLab
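Note (not part of the patch): the semantics the kernels above implement is that output row i is copied from candidate tensor ins[index[i] + 1], since ins[0] is the index tensor itself. Below is a minimal standalone C++ sketch of that row-selection logic; it is not Paddle code, and all names in it (candidates, index, rows, cols) are made up for illustration.

// multiplex_sketch.cc -- standalone illustration of the multiplex semantics.
#include <cassert>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  const int rows = 3, cols = 2;
  // Plays the role of ins[0]; candidate k below corresponds to ins[k + 1].
  std::vector<int> index = {1, 0, 1};
  // Two candidate tensors (ins[1] and ins[2]), stored row-major as rows x cols.
  std::vector<std::vector<float>> candidates = {
      {0.f, 1.f, 2.f, 3.f, 4.f, 5.f},         // ins[1]
      {10.f, 11.f, 12.f, 13.f, 14.f, 15.f}};  // ins[2]
  std::vector<float> out(rows * cols);

  for (int i = 0; i < rows; ++i) {
    const int k = index[i];  // 0-based here; the op uses index[i] + 1 into ins
    assert(k >= 0 && k < static_cast<int>(candidates.size()));
    // The same row-wise copy the CPU/GPU kernels perform with memory::Copy.
    std::memcpy(out.data() + i * cols, candidates[k].data() + i * cols,
                cols * sizeof(float));
  }

  // Prints: 10 11 / 2 3 / 14 15 (rows 0 and 2 from ins[2], row 1 from ins[1]).
  for (int i = 0; i < rows; ++i) {
    std::cout << out[i * cols] << " " << out[i * cols + 1] << "\n";
  }
  return 0;
}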