From fb52bc6e122d249b3e2d8168de81f9e52b980322 Mon Sep 17 00:00:00 2001
From: Yibing Liu
Date: Mon, 25 Sep 2017 11:18:38 +0800
Subject: [PATCH] revert code layout in multiplex_op

---
 paddle/operators/multiplex_op.cc |  6 +--
 paddle/operators/multiplex_op.cu | 77 ++++++++++++++++++++++++++++++--
 paddle/operators/multiplex_op.h  | 75 +++++++------------------------
 3 files changed, 94 insertions(+), 64 deletions(-)

diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc
index 6b22c782fe2..6e77b86b569 100644
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -106,8 +106,8 @@ namespace ops = paddle::operators;
 
 REGISTER_OP(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, multiplex_grad,
             ops::MultiplexGradOp);
-REGISTER_OP_CPU_KERNEL(multiplex,
-                       ops::MultiplexKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    multiplex, ops::MultiplexCPUKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
     multiplex_grad,
-    ops::MultiplexGradKernel<paddle::platform::CPUPlace, float>);
+    ops::MultiplexGradCPUKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu
index 3d219389ba5..4736f15bd59 100644
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
@@ -15,10 +15,81 @@
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/multiplex_op.h"
 
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class MultiplexGPUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+
+    out->mutable_data<T>(ctx.GetPlace());
+
+    auto rows = ins[1]->dims()[0];
+    auto cols = ins[1]->dims()[1];
+    // copy index to cpu
+    framework::Tensor index_t_cpu;
+    index_t_cpu.CopyFrom<T>(*(ins[0]), platform::CPUPlace());
+    auto* index = index_t_cpu.data<T>();
+    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                      ctx.device_context())
+                      .stream();
+    Place place = boost::get<Place>(ctx.GetPlace());
+    for (auto i = 0; i < rows; i++) {
+      int k = (int)index[i] + 1;
+      PADDLE_ENFORCE_LT(k, ins.size(),
+                        "index exceeds the number of candidate tensors.");
+      memory::Copy(place, out->data<T>() + i * cols, place,
+                   ins[k]->data<T>() + i * cols, cols * sizeof(T), stream);
+    }
+  }
+};
+
+template <typename Place, typename T>
+class MultiplexGradGPUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto d_ins =
+        ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
+    for (size_t i = 1; i < d_ins.size(); i++) {
+      if (d_ins[i]) {
+        d_ins[i]->mutable_data<T>(ctx.GetPlace());
+        auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
+        t.device(ctx.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+      }
+    }
+
+    auto rows = ins[1]->dims()[0];
+    auto cols = ins[1]->dims()[1];
+    // copy index to cpu
+    framework::Tensor index_t_cpu;
+    index_t_cpu.CopyFrom<T>(*(ins[0]), platform::CPUPlace());
+    auto* index = index_t_cpu.data<T>();
+
+    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                      ctx.device_context())
+                      .stream();
+    Place place = boost::get<Place>(ctx.GetPlace());
+    for (auto i = 0; i < rows; i++) {
+      int k = (int)index[i] + 1;
+      if (d_ins[k]) {
+        memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
+                     d_out->data<T>() + i * cols, cols * sizeof(T), stream);
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(multiplex,
-                       ops::MultiplexKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    multiplex, ops::MultiplexGPUKernel<paddle::platform::GPUPlace, float>);
 REGISTER_OP_GPU_KERNEL(
     multiplex_grad,
-    ops::MultiplexGradKernel<paddle::platform::GPUPlace, float>);
+    ops::MultiplexGradGPUKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/multiplex_op.h b/paddle/operators/multiplex_op.h
index dcc01d0f981..44e8e0c1998 100644
--- a/paddle/operators/multiplex_op.h
+++ b/paddle/operators/multiplex_op.h
@@ -23,7 +23,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class MultiplexKernel : public framework::OpKernel {
+class MultiplexCPUKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto ins = ctx.MultiInput<framework::Tensor>("X");
@@ -33,40 +33,20 @@ class MultiplexKernel : public framework::OpKernel {
 
     auto rows = ins[1]->dims()[0];
     auto cols = ins[1]->dims()[1];
-    if (platform::is_cpu_place(ctx.GetPlace())) {
-      auto* index = ins[0]->data<T>();
-      platform::CPUPlace place = boost::get<platform::CPUPlace>(ctx.GetPlace());
-      for (auto i = 0; i < rows; i++) {
-        int k = (int)index[i] + 1;
-        PADDLE_ENFORCE_LT(k, ins.size(),
-                          "index exceeds the number of candidate tensors.");
-        memory::Copy(place, out->data<T>() + i * cols, place,
-                     ins[k]->data<T>() + i * cols, cols * sizeof(T));
-      }
-    } else {
-#ifndef PADDLE_ONLY_CPU
-      // copy index to cpu
-      framework::Tensor index_t_cpu;
-      index_t_cpu.CopyFrom<T>(*(ins[0]), platform::CPUPlace());
-      auto* index = index_t_cpu.data<T>();
-      auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
-                        ctx.device_context())
-                        .stream();
-      platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace());
-      for (auto i = 0; i < rows; i++) {
-        int k = (int)index[i] + 1;
-        PADDLE_ENFORCE_LT(k, ins.size(),
-                          "index exceeds the number of candidate tensors.");
-        memory::Copy(place, out->data<T>() + i * cols, place,
-                     ins[k]->data<T>() + i * cols, cols * sizeof(T), stream);
-      }
-#endif
+    auto* index = ins[0]->data<T>();
+    Place place = boost::get<Place>(ctx.GetPlace());
+    for (auto i = 0; i < rows; i++) {
+      int k = (int)index[i] + 1;
+      PADDLE_ENFORCE_LT(k, ins.size(),
+                        "index exceeds the number of candidate tensors.");
+      memory::Copy(place, out->data<T>() + i * cols, place,
+                   ins[k]->data<T>() + i * cols, cols * sizeof(T));
     }
   }
 };
 
 template <typename Place, typename T>
-class MultiplexGradKernel : public framework::OpKernel {
+class MultiplexGradCPUKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
@@ -83,35 +63,14 @@ class MultiplexGradKernel : public framework::OpKernel {
 
     auto rows = ins[1]->dims()[0];
     auto cols = ins[1]->dims()[1];
-    if (platform::is_cpu_place(ctx.GetPlace())) {
-      auto* index = ins[0]->data<T>();
-      platform::CPUPlace place = boost::get<platform::CPUPlace>(ctx.GetPlace());
-      for (auto i = 0; i < rows; i++) {
-        int k = (int)index[i] + 1;
-        if (d_ins[k]) {
-          memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
-                       d_out->data<T>() + i * cols, cols * sizeof(T));
-        }
-      }
-    } else {
-#ifndef PADDLE_ONLY_CPU
-      // copy index to cpu
-      framework::Tensor index_t_cpu;
-      index_t_cpu.CopyFrom<T>(*(ins[0]), platform::CPUPlace());
-      auto* index = index_t_cpu.data<T>();
-
-      auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
-                        ctx.device_context())
-                        .stream();
-      platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace());
-      for (auto i = 0; i < rows; i++) {
-        int k = (int)index[i] + 1;
-        if (d_ins[k]) {
-          memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
-                       d_out->data<T>() + i * cols, cols * sizeof(T), stream);
-        }
+    auto* index = ins[0]->data<T>();
+    Place place = boost::get<Place>(ctx.GetPlace());
+    for (auto i = 0; i < rows; i++) {
+      int k = (int)index[i] + 1;
+      if (d_ins[k]) {
+        memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
+                     d_out->data<T>() + i * cols, cols * sizeof(T));
       }
-#endif
     }
   }
 };
--
GitLab
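
For readers unfamiliar with the operator, the row-selection logic that both the CPU and GPU kernels in this patch implement can be sketched in a few lines of standalone C++. This is a minimal illustration only: plain std::vector matrices stand in for framework::Tensor, std::memcpy stands in for memory::Copy, and the Multiplex helper below is hypothetical, not part of Paddle. As in the patch, index[i] selects which candidate matrix supplies output row i; the kernels compute k as (int)index[i] + 1 only because ins[0] holds the index column itself, while here the candidates are passed already separated from the index.

// Standalone sketch of the multiplex forward pass (hypothetical helper,
// not Paddle API). candidates[k] is a rows x cols matrix in row-major
// layout; index[i] picks the candidate that supplies output row i.
#include <cstdio>
#include <cstring>
#include <vector>

std::vector<float> Multiplex(const std::vector<float>& index,
                             const std::vector<std::vector<float>>& candidates,
                             int rows, int cols) {
  std::vector<float> out(rows * cols);
  for (int i = 0; i < rows; i++) {
    // The kernels above add 1 here because ins[0] is the index tensor.
    int k = (int)index[i];
    // Per-row copy, mirroring the memory::Copy loop in the kernels.
    std::memcpy(out.data() + i * cols, candidates[k].data() + i * cols,
                cols * sizeof(float));
  }
  return out;
}

int main() {
  // Two 2x3 candidates; row 0 comes from candidate 1, row 1 from candidate 0.
  std::vector<std::vector<float>> cands = {{1, 1, 1, 2, 2, 2},
                                           {7, 7, 7, 9, 9, 9}};
  std::vector<float> index = {1, 0};
  auto out = Multiplex(index, cands, 2, 3);
  for (float v : out) std::printf("%g ", v);  // prints: 7 7 7 2 2 2
  std::printf("\n");
  return 0;
}

The gradient kernels invert this routing: after zero-filling every d_ins[k] with the Eigen constant assignment, row i of d_out is copied back into row i of d_ins[(int)index[i] + 1], so each candidate receives gradient only for the rows it actually supplied.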