Commit 7620efdf authored by Yibing Liu

combine gpu&cpu code in multiplex_op

Parent 85a5d384
paddle/operators/multiplex_op.cc
@@ -22,10 +22,7 @@ using LoDTensor = framework::LoDTensor;
 
 class MultiplexOp : public framework::OperatorWithKernel {
  public:
-  MultiplexOp(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
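The forwarding constructors removed above (here and in MultiplexGradOp below) are replaced by an inheriting-constructor declaration. A minimal standalone sketch of that C++11 feature, with made-up class names rather than the real framework types:

#include <iostream>
#include <string>

// Hypothetical base, standing in for framework::OperatorWithKernel.
class BaseOp {
 public:
  BaseOp(const std::string& type, int num_inputs)
      : type_(type), num_inputs_(num_inputs) {}
  std::string type_;
  int num_inputs_;
};

class MyOp : public BaseOp {
 public:
  using BaseOp::BaseOp;  // inherit BaseOp's constructor instead of forwarding
};

int main() {
  MyOp op("multiplex", 3);  // resolves to the inherited BaseOp constructor
  std::cout << op.type_ << " " << op.num_inputs_ << "\n";
  return 0;
}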
@@ -64,12 +61,12 @@ class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
 Multiplex multiple tensors according to the index provided by the first
 input tensor.
 
-ins[0]: the index of the tensor to output of size batchSize.
-ins[1:N]: the candidate output tensor.
+ins[0]: the index tensor.
+ins[1:N]: the candidate output tensors.
 For each index i from 0 to batchSize - 1, the output is the i-th row of the
 the (index[i] + 1)-th tensor.
 
-For each i-th row of output:
+For i-th row of the output tensor:
 
 y[i][j] = x_{k}[i][j], j = 0,1, ... , (x_{1}.width - 1)
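For concreteness, a small worked example of the semantics described in this doc string (the values below are hypothetical, not taken from the operator's tests):

ins[0] (index) = [1, 0]          // k = index[i] + 1
ins[1] = [[1, 2],
          [3, 4]]
ins[2] = [[5, 6],
          [7, 8]]

row 0: k = 1 + 1 = 2  ->  Out[0] = ins[2][0] = [5, 6]
row 1: k = 0 + 1 = 1  ->  Out[1] = ins[1][1] = [3, 4]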
@@ -82,11 +79,7 @@ and `k = x{0}[i] + 1`.
 
 class MultiplexGradOp : public framework::OperatorWithKernel {
  public:
-  MultiplexGradOp(const std::string &type,
-                  const framework::VariableNameMap &inputs,
-                  const framework::VariableNameMap &outputs,
-                  const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
@@ -98,7 +91,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
                    "Input(Out@GRAD) shouldn't be null.");
     auto d_ins = ctx.MultiOutput<LoDTensor>(framework::GradVarName("X"));
     auto ins = ctx.MultiInput<Tensor>("X");
-    // don;t compute gradient for index
+    // don't compute gradient for index (ins[0])
     for (size_t i = 1; i < ins.size(); i++) {
       if (d_ins[i]) {
         d_ins[i]->Resize(ins[i]->dims());
@@ -113,5 +106,8 @@ namespace ops = paddle::operators;
 REGISTER_OP(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, multiplex_grad,
             ops::MultiplexGradOp);
-REGISTER_OP_CPU_KERNEL(multiplex, ops::MultiplexCPUKernel<float>);
-REGISTER_OP_CPU_KERNEL(multiplex_grad, ops::MultiplexGradCPUKernel<float>);
+REGISTER_OP_CPU_KERNEL(multiplex,
+                       ops::MultiplexKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    multiplex_grad,
+    ops::MultiplexGradKernel<paddle::platform::CPUPlace, float>);
paddle/operators/multiplex_op.cu
@@ -13,70 +13,12 @@
    limitations under the License. */
 
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/multiplex_op.h"
 
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-template <typename T>
-class MultiplexGPUKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto ins = ctx.MultiInput<Tensor>("X");
-    auto* out = ctx.Output<LoDTensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    auto rows = ins[1]->dims()[0];
-    auto cols = ins[1]->dims()[1];
-    // copy index to cpu
-    Tensor index_t_cpu;
-    index_t_cpu.CopyFrom<T>(*(ins[0]), paddle::platform::CPUPlace());
-    auto index = index_t_cpu.data<T>();
-    for (auto i = 0; i < rows; i++) {
-      int k = (int)index[i] + 1;
-      cudaMemcpy(out->data<T>() + i * cols, ins[k]->data<T>() + i * cols,
-                 cols * sizeof(T), cudaMemcpyDeviceToDevice);
-    }
-  }
-};
-
-template <typename T>
-class MultiplexGradGPUKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto ins = ctx.MultiInput<Tensor>("X");
-    auto d_ins = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
-    for (size_t i = 1; i < d_ins.size(); ++i) {
-      if (d_ins[i]) {
-        d_ins[i]->mutable_data<T>(ctx.GetPlace());
-        auto dims = d_ins[i]->dims();
-        cudaMemset(d_ins[i]->data<T>(), 0,
-                   framework::product(dims) * sizeof(T));
-      }
-    }
-
-    auto rows = ins[1]->dims()[0];
-    auto cols = ins[1]->dims()[1];
-    // copy index to cpu
-    Tensor index_t_cpu;
-    index_t_cpu.CopyFrom<T>(*(ins[0]), paddle::platform::CPUPlace());
-    auto index = index_t_cpu.data<T>();
-    for (auto i = 0; i < rows; i++) {
-      int k = (int)index[i] + 1;
-      if (d_ins[k]) {
-        cudaMemcpy(d_ins[k]->data<T>() + i * cols, d_out->data<T>() + i * cols,
-                   cols * sizeof(T), cudaMemcpyDeviceToDevice);
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(multiplex, ops::MultiplexGPUKernel<float>);
-REGISTER_OP_GPU_KERNEL(multiplex_grad, ops::MultiplexGradGPUKernel<float>);
+REGISTER_OP_GPU_KERNEL(multiplex,
+                       ops::MultiplexKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    multiplex_grad,
+    ops::MultiplexGradKernel<paddle::platform::GPUPlace, float>);
paddle/operators/multiplex_op.h
@@ -17,31 +17,56 @@
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
 
 namespace paddle {
 namespace operators {
 
-template <typename T>
-class MultiplexCPUKernel : public framework::OpKernel {
+template <typename Place, typename T>
+class MultiplexKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto ins = ctx.MultiInput<framework::Tensor>("X");
     auto* out = ctx.Output<framework::LoDTensor>("Out");
     out->mutable_data<T>(ctx.GetPlace());
 
-    auto index = ins[0]->data<T>();
     auto rows = ins[1]->dims()[0];
     auto cols = ins[1]->dims()[1];
-    for (auto i = 0; i < rows; i++) {
-      int k = (int)index[i] + 1;
-      memcpy(out->data<T>() + i * cols, ins[k]->data<T>() + i * cols,
-             cols * sizeof(T));
+    if (platform::is_cpu_place(ctx.GetPlace())) {
+      auto* index = ins[0]->data<T>();
+      platform::CPUPlace place = boost::get<platform::CPUPlace>(ctx.GetPlace());
+      for (auto i = 0; i < rows; i++) {
+        int k = (int)index[i] + 1;
+        PADDLE_ENFORCE_LT(k, ins.size(),
+                          "index exceeds the number of candidate tensors.");
+        memory::Copy(place, out->data<T>() + i * cols, place,
+                     ins[k]->data<T>() + i * cols, cols * sizeof(T));
+      }
+    } else {
+#ifndef PADDLE_ONLY_CPU
+      // copy index to cpu
+      framework::Tensor index_t_cpu;
+      index_t_cpu.CopyFrom<T>(*(ins[0]), platform::CPUPlace());
+      auto* index = index_t_cpu.data<T>();
+      auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                        ctx.device_context())
+                        .stream();
+      platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace());
+      for (auto i = 0; i < rows; i++) {
+        int k = (int)index[i] + 1;
+        PADDLE_ENFORCE_LT(k, ins.size(),
+                          "index exceeds the number of candidate tensors.");
+        memory::Copy(place, out->data<T>() + i * cols, place,
+                     ins[k]->data<T>() + i * cols, cols * sizeof(T), stream);
+      }
+#endif
     }
   }
 };
 
-template <typename T>
-class MultiplexGradCPUKernel : public framework::OpKernel {
+template <typename Place, typename T>
+class MultiplexGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
@@ -51,20 +76,42 @@ class MultiplexGradCPUKernel : public framework::OpKernel {
     for (size_t i = 1; i < d_ins.size(); i++) {
       if (d_ins[i]) {
         d_ins[i]->mutable_data<T>(ctx.GetPlace());
-        auto dims = d_ins[i]->dims();
-        memset(d_ins[i]->data<T>(), 0, framework::product(dims) * sizeof(T));
+        auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
+        t.device(ctx.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
       }
     }
 
-    auto index = ins[0]->data<T>();
     auto rows = ins[1]->dims()[0];
     auto cols = ins[1]->dims()[1];
-    for (auto i = 0; i < rows; i++) {
-      int k = (int)index[i] + 1;
-      if (d_ins[k]) {
-        memcpy(d_ins[k]->data<T>() + i * cols, d_out->data<T>() + i * cols,
-               cols * sizeof(T));
-      }
+    if (platform::is_cpu_place(ctx.GetPlace())) {
+      auto* index = ins[0]->data<T>();
+      platform::CPUPlace place = boost::get<platform::CPUPlace>(ctx.GetPlace());
+      for (auto i = 0; i < rows; i++) {
+        int k = (int)index[i] + 1;
+        if (d_ins[k]) {
+          memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
+                       d_out->data<T>() + i * cols, cols * sizeof(T));
+        }
+      }
+    } else {
+#ifndef PADDLE_ONLY_CPU
+      // copy index to cpu
+      framework::Tensor index_t_cpu;
+      index_t_cpu.CopyFrom<T>(*(ins[0]), platform::CPUPlace());
+      auto* index = index_t_cpu.data<T>();
+      auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                        ctx.device_context())
+                        .stream();
+      platform::GPUPlace place = boost::get<platform::GPUPlace>(ctx.GetPlace());
+      for (auto i = 0; i < rows; i++) {
+        int k = (int)index[i] + 1;
+        if (d_ins[k]) {
+          memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
+                       d_out->data<T>() + i * cols, cols * sizeof(T), stream);
+        }
+      }
+#endif
     }
   }
 };
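Taken together, the change replaces the separate CPU and GPU kernels with one kernel template parameterized on Place, plus a runtime branch that picks a plain host copy or a stream-ordered device copy. A minimal, framework-free sketch of that dispatch idea follows; CPUPlace, GPUPlace, is_cpu_place, and Copy below are invented stand-ins, not the real paddle::platform or paddle::memory API.

#include <cstddef>
#include <cstring>
#include <iostream>
#include <variant>
#include <vector>

// Hypothetical place tags, mirroring the role of paddle::platform places.
struct CPUPlace {};
struct GPUPlace {};
using Place = std::variant<CPUPlace, GPUPlace>;

bool is_cpu_place(const Place& p) {
  return std::holds_alternative<CPUPlace>(p);
}

// Synchronous host copy.
void Copy(float* dst, const float* src, std::size_t bytes) {
  std::memcpy(dst, src, bytes);
}
// Stream-ordered "device" copy; a real GPU build would issue an async copy.
void Copy(float* dst, const float* src, std::size_t bytes, void* /*stream*/) {
  std::memcpy(dst, src, bytes);
}

// One routine serves both places; the branch selects the right copy overload,
// just as the combined kernel above branches on is_cpu_place.
void CopyRow(const Place& place, float* dst, const float* src,
             std::size_t bytes, void* stream) {
  if (is_cpu_place(place)) {
    Copy(dst, src, bytes);
  } else {
    Copy(dst, src, bytes, stream);
  }
}

int main() {
  std::vector<float> src = {5.f, 6.f};
  std::vector<float> dst(2, 0.f);
  CopyRow(CPUPlace{}, dst.data(), src.data(), src.size() * sizeof(float),
          nullptr);
  std::cout << dst[0] << " " << dst[1] << "\n";  // prints: 5 6
  return 0;
}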