From 7620efdf1c1a6ec593c63ed017530e3fe8580f72 Mon Sep 17 00:00:00 2001
From: Yibing Liu
Date: Sat, 23 Sep 2017 13:55:47 +0800
Subject: [PATCH] combine gpu&cpu code in multiplex_op

---
 paddle/operators/multiplex_op.cc | 26 +++++-----
 paddle/operators/multiplex_op.cu | 70 +++------------------------
 paddle/operators/multiplex_op.h  | 81 +++++++++++++++++++++++++-------
 3 files changed, 81 insertions(+), 96 deletions(-)

diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc
index 03559d0643c..6b22c782fe2 100644
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -22,10 +22,7 @@ using LoDTensor = framework::LoDTensor;
 
 class MultiplexOp : public framework::OperatorWithKernel {
  public:
-  MultiplexOp(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
@@ -64,12 +61,12 @@ class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
 Multiplex multiple tensors according to the index provided by the first
 input tensor.
 
-ins[0]: the index of the tensor to output of size batchSize.
-ins[1:N]: the candidate output tensor.
+ins[0]: the index tensor.
+ins[1:N]: the candidate output tensors.
 
 For each index i from 0 to batchSize - 1, the output is the i-th row of
 the (index[i] + 1)-th tensor.
 
-For each i-th row of output:
+For i-th row of the output tensor:
 
 y[i][j] = x_{k}[i][j], j = 0,1, ... , (x_{1}.width - 1)
@@ -82,11 +79,7 @@ and `k = x_{0}[i] + 1`.
 
 class MultiplexGradOp : public framework::OperatorWithKernel {
  public:
-  MultiplexGradOp(const std::string &type,
-                  const framework::VariableNameMap &inputs,
-                  const framework::VariableNameMap &outputs,
-                  const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
@@ -98,7 +91,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
                             "Input(Out@GRAD) shouldn't be null.");
     auto d_ins = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
     auto ins = ctx.MultiInput<Tensor>("X");
-    // don;t compute gradient for index
+    // don't compute gradient for index (ins[0])
     for (size_t i = 1; i < ins.size(); i++) {
       if (d_ins[i]) {
         d_ins[i]->Resize(ins[i]->dims());
@@ -113,5 +106,8 @@
 namespace ops = paddle::operators;
 REGISTER_OP(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, multiplex_grad,
             ops::MultiplexGradOp);
-REGISTER_OP_CPU_KERNEL(multiplex, ops::MultiplexCPUKernel<float>);
-REGISTER_OP_CPU_KERNEL(multiplex_grad, ops::MultiplexGradCPUKernel<float>);
+REGISTER_OP_CPU_KERNEL(multiplex,
+                       ops::MultiplexKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    multiplex_grad,
+    ops::MultiplexGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu
index 055e13d1834..3d219389ba5 100644
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
@@ -13,70 +13,12 @@ limitations under the License.
*/ #include "paddle/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -class MultiplexGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto rows = ins[1]->dims()[0]; - auto cols = ins[1]->dims()[1]; - // copy index to cpu - Tensor index_t_cpu; - index_t_cpu.CopyFrom(*(ins[0]), paddle::platform::CPUPlace()); - auto index = index_t_cpu.data(); - for (auto i = 0; i < rows; i++) { - int k = (int)index[i] + 1; - cudaMemcpy(out->data() + i * cols, ins[k]->data() + i * cols, - cols * sizeof(T), cudaMemcpyDeviceToDevice); - } - } -}; - -template -class MultiplexGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto ins = ctx.MultiInput("X"); - auto d_ins = ctx.MultiOutput(framework::GradVarName("X")); - for (size_t i = 1; i < d_ins.size(); ++i) { - if (d_ins[i]) { - d_ins[i]->mutable_data(ctx.GetPlace()); - auto dims = d_ins[i]->dims(); - cudaMemset(d_ins[i]->data(), 0, - framework::product(dims) * sizeof(T)); - } - } - - auto rows = ins[1]->dims()[0]; - auto cols = ins[1]->dims()[1]; - // copy index to cpu - Tensor index_t_cpu; - index_t_cpu.CopyFrom(*(ins[0]), paddle::platform::CPUPlace()); - auto index = index_t_cpu.data(); - for (auto i = 0; i < rows; i++) { - int k = (int)index[i] + 1; - if (d_ins[k]) { - cudaMemcpy(d_ins[k]->data() + i * cols, d_out->data() + i * cols, - cols * sizeof(T), cudaMemcpyDeviceToDevice); - } - } - } -}; -} // namespace operators -} // namespace paddle +#include "paddle/operators/multiplex_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(multiplex, ops::MultiplexGPUKernel); -REGISTER_OP_GPU_KERNEL(multiplex_grad, ops::MultiplexGradGPUKernel); +REGISTER_OP_GPU_KERNEL(multiplex, + ops::MultiplexKernel); +REGISTER_OP_GPU_KERNEL( + multiplex_grad, + ops::MultiplexGradKernel); diff --git a/paddle/operators/multiplex_op.h b/paddle/operators/multiplex_op.h index 82b4a2c4c75..dcc01d0f981 100644 --- a/paddle/operators/multiplex_op.h +++ b/paddle/operators/multiplex_op.h @@ -17,31 +17,56 @@ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" +#include "paddle/memory/memcpy.h" namespace paddle { namespace operators { -template -class MultiplexCPUKernel : public framework::OpKernel { +template +class MultiplexKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { auto ins = ctx.MultiInput("X"); auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); - auto index = ins[0]->data(); auto rows = ins[1]->dims()[0]; auto cols = ins[1]->dims()[1]; - for (auto i = 0; i < rows; i++) { - int k = (int)index[i] + 1; - memcpy(out->data() + i * cols, ins[k]->data() + i * cols, - cols * sizeof(T)); + if (platform::is_cpu_place(ctx.GetPlace())) { + auto* index = ins[0]->data(); + platform::CPUPlace place = boost::get(ctx.GetPlace()); + for (auto i = 0; i < rows; i++) { + int k = (int)index[i] + 1; + PADDLE_ENFORCE_LT(k, ins.size(), + "index exceeds the number of candidate tensors."); + memory::Copy(place, out->data() + i * cols, place, + ins[k]->data() + i * cols, cols * sizeof(T)); + } + } else { +#ifndef PADDLE_ONLY_CPU + // copy index to cpu + framework::Tensor index_t_cpu; + 
+      index_t_cpu.CopyFrom<T>(*(ins[0]), platform::CPUPlace());
+      auto* index = index_t_cpu.data<T>();
+      auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                        ctx.device_context())
+                        .stream();
+      platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace());
+      for (auto i = 0; i < rows; i++) {
+        int k = (int)index[i] + 1;
+        PADDLE_ENFORCE_LT(k, ins.size(),
+                          "index exceeds the number of candidate tensors.");
+        memory::Copy(place, out->data<T>() + i * cols, place,
+                     ins[k]->data<T>() + i * cols, cols * sizeof(T), stream);
+      }
+#endif
     }
   }
 };
 
-template <typename T>
-class MultiplexGradCPUKernel : public framework::OpKernel {
+template <typename Place, typename T>
+class MultiplexGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
@@ -51,20 +76,42 @@ class MultiplexGradCPUKernel : public framework::OpKernel {
     for (size_t i = 1; i < d_ins.size(); i++) {
       if (d_ins[i]) {
         d_ins[i]->mutable_data<T>(ctx.GetPlace());
-        auto dims = d_ins[i]->dims();
-        memset(d_ins[i]->data<T>(), 0, framework::product(dims) * sizeof(T));
+        auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
+        t.device(ctx.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
       }
     }
 
-    auto index = ins[0]->data<T>();
     auto rows = ins[1]->dims()[0];
     auto cols = ins[1]->dims()[1];
-    for (auto i = 0; i < rows; i++) {
-      int k = (int)index[i] + 1;
-      if (d_ins[k]) {
-        memcpy(d_ins[k]->data<T>() + i * cols, d_out->data<T>() + i * cols,
-               cols * sizeof(T));
+    if (platform::is_cpu_place(ctx.GetPlace())) {
+      auto* index = ins[0]->data<T>();
+      platform::CPUPlace place = boost::get<platform::CPUPlace>(ctx.GetPlace());
+      for (auto i = 0; i < rows; i++) {
+        int k = (int)index[i] + 1;
+        if (d_ins[k]) {
+          memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
+                       d_out->data<T>() + i * cols, cols * sizeof(T));
+        }
+      }
+    } else {
+#ifndef PADDLE_ONLY_CPU
+      // copy index to cpu
+      framework::Tensor index_t_cpu;
+      index_t_cpu.CopyFrom<T>(*(ins[0]), platform::CPUPlace());
+      auto* index = index_t_cpu.data<T>();
+
+      auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                        ctx.device_context())
+                        .stream();
+      platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace());
+      for (auto i = 0; i < rows; i++) {
+        int k = (int)index[i] + 1;
+        if (d_ins[k]) {
+          memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
+                       d_out->data<T>() + i * cols, cols * sizeof(T), stream);
+        }
       }
+#endif
     }
   }
 };
-- 
GitLab
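Note (not part of the patch): the semantics the kernels above implement is that output row i is copied from candidate tensor ins[index[i] + 1], since ins[0] is the index tensor itself. Below is a minimal standalone C++ sketch of that row-selection logic; it is not Paddle code, and all names in it (candidates, index, rows, cols) are made up for illustration.

// multiplex_sketch.cc -- standalone illustration of the multiplex semantics.
#include <cassert>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  const int rows = 3, cols = 2;
  // Plays the role of ins[0]; candidate k below corresponds to ins[k + 1].
  std::vector<int> index = {1, 0, 1};
  // Two candidate tensors (ins[1] and ins[2]), stored row-major as rows x cols.
  std::vector<std::vector<float>> candidates = {
      {0.f, 1.f, 2.f, 3.f, 4.f, 5.f},         // ins[1]
      {10.f, 11.f, 12.f, 13.f, 14.f, 15.f}};  // ins[2]
  std::vector<float> out(rows * cols);

  for (int i = 0; i < rows; ++i) {
    const int k = index[i];  // 0-based here; the op uses index[i] + 1 into ins
    assert(k >= 0 && k < static_cast<int>(candidates.size()));
    // The same row-wise copy the CPU/GPU kernels perform with memory::Copy.
    std::memcpy(out.data() + i * cols, candidates[k].data() + i * cols,
                cols * sizeof(float));
  }

  // Prints: 10 11 / 2 3 / 14 15 (rows 0 and 2 from ins[2], row 1 from ins[1]).
  for (int i = 0; i < rows; ++i) {
    std::cout << out[i * cols] << " " << out[i * cols + 1] << "\n";
  }
  return 0;
}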