提交 273f7375 编写于 作者: Q qiaolongfei

optimize code

上级 5d718a58
...@@ -88,35 +88,6 @@ class ReduceGradKernel : public framework::OpKernel<T> { ...@@ -88,35 +88,6 @@ class ReduceGradKernel : public framework::OpKernel<T> {
auto* output = context.Output<Tensor>(framework::GradVarName("X")); auto* output = context.Output<Tensor>(framework::GradVarName("X"));
output->mutable_data<T>(context.GetPlace()); output->mutable_data<T>(context.GetPlace());
if (context.GetPlace().type() == typeid(platform::CPUPlace)) {
const auto* input2_d = input2->data<T>();
auto* output_d = output->data<T>();
// CPU reduce_all_grad
if (reduce_all) {
PADDLE_ENFORCE(input2->dims().size() == 1 && input2->dims()[0] == 1,
"output should be a scalar");
for (int64_t i = 0; i < framework::product(input0->dims()); ++i) {
output_d[i] = input2_d[0];
}
return;
}
if (input0->dims().size() == 2 && dims.size() == 1) {
auto& input_dim = input0->dims();
for (int64_t i = 0; i < input_dim[0]; ++i) {
for (int64_t j = 0; j < input_dim[1]; ++j) {
if (dims[0] == 0) {
output_d[i * input_dim[1] + j] = input2_d[j];
} else {
output_d[i * input_dim[1] + j] = input2_d[i];
}
}
}
return;
}
}
if (reduce_all) { if (reduce_all) {
auto x = EigenVector<T>::Flatten(*input0); auto x = EigenVector<T>::Flatten(*input0);
auto x_reduce = EigenVector<T>::From(*input1); auto x_reduce = EigenVector<T>::From(*input1);
......
...@@ -23,12 +23,13 @@ REGISTER_OP_CPU_KERNEL( ...@@ -23,12 +23,13 @@ REGISTER_OP_CPU_KERNEL(
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>, ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t, ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::SumFunctor>); ops::SumFunctor>);
REGISTER_OP_CPU_KERNEL(reduce_sum_grad, REGISTER_OP_CPU_KERNEL(
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, reduce_sum_grad,
float, ops::SumGradFunctor>, ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, float,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, ops::SumGradFunctor>,
double, ops::SumGradFunctor>, ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, double,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, ops::SumGradFunctor>,
int, ops::SumGradFunctor>, ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, ops::SumGradFunctor>,
int64_t, ops::SumGradFunctor>); ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::SumGradFunctor>);
...@@ -14,11 +14,69 @@ ...@@ -14,11 +14,69 @@
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/operators/reduce_op.h" #include "paddle/fluid/operators/reduce_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
// Gradient kernel for reduce_sum.
//
// When running on CPUPlace with exactly one entry in the "dim" attribute, the
// gradient is broadcast with plain for loops instead of Eigen broadcast (the
// original author measured this as about 4 times faster than broadcast).
// Every other case is delegated to the generic ReduceGradKernel.
template <typename DeviceContext, typename T, typename Functor>
class ReduceSumGradKernel : public framework::OpKernel<T> {
 public:
  // Reads "X" and GradVarName("Out") from `context` and fills GradVarName("X").
  void Compute(const framework::ExecutionContext& context) const override {
    auto dims = context.Attr<std::vector<int>>("dim");
    if (context.GetPlace().type() == typeid(platform::CPUPlace) &&
        dims.size() == 1) {
      auto* input0 = context.Input<Tensor>("X");
      auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
      auto* output = context.Output<Tensor>(framework::GradVarName("X"));
      output->mutable_data<T>(context.GetPlace());
      const auto* input2_d = input2->data<T>();
      auto* output_d = output->data<T>();

      // Out@GRAD holds a single element: every element of X was summed into
      // it, so every element of X@GRAD receives that one value. Checking the
      // element count (rather than shape == [1]) also covers one-element
      // gradients of higher rank such as [1, 1, 1]; those would otherwise
      // reach the indexed loop below and read past the end of input2_d.
      if (framework::product(input2->dims()) == 1) {
        // Hoisted: the element count does not change while we fill.
        const int64_t numel = framework::product(input0->dims());
        for (int64_t i = 0; i < numel; ++i) {
          output_d[i] = input2_d[0];
        }
        return;
      }

      // Reduce along exactly one dimension; a negative index counts from the
      // last dimension.
      // NOTE(review): reduce_dim_index is not range-checked here - presumably
      // validated before the kernel runs; confirm.
      int reduce_dim_index = dims[0];
      if (reduce_dim_index < 0) {
        reduce_dim_index += input0->dims().size();
      }

      // View X as [before_dim, reduce_dim, after_dim] and Out@GRAD as
      // [before_dim, after_dim].
      auto& input_dim = input0->dims();
      int64_t before_dim = 1;
      for (int i = 0; i < reduce_dim_index; ++i) {
        before_dim *= input_dim[i];
      }
      int64_t reduce_dim = input_dim[reduce_dim_index];
      int64_t after_dim = 1;
      for (int i = reduce_dim_index + 1; i < input_dim.size(); ++i) {
        after_dim *= input_dim[i];
      }

      // Copy each after_dim-long gradient row into all reduce_dim slots of
      // the matching block of X@GRAD.
      for (int64_t i = 0; i < before_dim; ++i) {
        const auto* grad_row = input2_d + i * after_dim;
        auto* out_block = output_d + i * reduce_dim * after_dim;
        for (int64_t j = 0; j < reduce_dim; ++j) {
          for (int64_t k = 0; k < after_dim; ++k) {
            out_block[j * after_dim + k] = grad_row[k];
          }
        }
      }
      return;
    }

    // default use Eigen broadcast
    ReduceGradKernel<DeviceContext, T, Functor> kernel;
    kernel.Compute(context);
  }
};
struct SumFunctor { struct SumFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim> template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
......
...@@ -115,14 +115,56 @@ class Test2DReduce1(Test1DReduce): ...@@ -115,14 +115,56 @@ class Test2DReduce1(Test1DReduce):
self.op_type = "reduce_sum" self.op_type = "reduce_sum"
self.attrs = {'dim': [1]} self.attrs = {'dim': [1]}
self.inputs = {'X': np.random.random((20, 10)).astype("float64")} self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
self.outputs = {'Out': self.inputs['X'].sum(axis=1)} self.outputs = {
'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
}
class Test3DReduce0(Test1DReduce):
    """reduce_sum over dim 1 of a random (5, 6, 7) float64 input."""

    def setUp(self):
        # Build the input once and derive the expected output from it.
        x = np.random.random((5, 6, 7)).astype("float64")
        reduce_dims = [1]
        self.op_type = "reduce_sum"
        self.attrs = {'dim': reduce_dims}
        self.inputs = {'X': x}
        self.outputs = {'Out': x.sum(axis=tuple(reduce_dims))}
class Test3DReduce1(Test1DReduce):
    """reduce_sum over dim 2 (the last one) of a random (5, 6, 7) float64 input."""

    def setUp(self):
        # Build the input once and derive the expected output from it.
        x = np.random.random((5, 6, 7)).astype("float64")
        reduce_dims = [2]
        self.op_type = "reduce_sum"
        self.attrs = {'dim': reduce_dims}
        self.inputs = {'X': x}
        self.outputs = {'Out': x.sum(axis=tuple(reduce_dims))}
class Test3DReduce2(Test1DReduce):
    """reduce_sum with a negative dim (-2) on a random (5, 6, 7) float64 input."""

    def setUp(self):
        # Build the input once and derive the expected output from it.
        x = np.random.random((5, 6, 7)).astype("float64")
        reduce_dims = [-2]
        self.op_type = "reduce_sum"
        self.attrs = {'dim': reduce_dims}
        self.inputs = {'X': x}
        self.outputs = {'Out': x.sum(axis=tuple(reduce_dims))}
class Test3DReduce3(Test1DReduce):
    """reduce_sum over two dims (1 and 2) of a random (5, 6, 7) float64 input."""

    def setUp(self):
        # Build the input once and derive the expected output from it.
        x = np.random.random((5, 6, 7)).astype("float64")
        reduce_dims = [1, 2]
        self.op_type = "reduce_sum"
        self.attrs = {'dim': reduce_dims}
        self.inputs = {'X': x}
        self.outputs = {'Out': x.sum(axis=tuple(reduce_dims))}
class TestKeepDimReduce(Test1DReduce): class TestKeepDimReduce(Test1DReduce):
def setUp(self): def setUp(self):
self.op_type = "reduce_sum" self.op_type = "reduce_sum"
self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
self.attrs = {'dim': [-2], 'keep_dim': True} self.attrs = {'dim': [1], 'keep_dim': True}
self.outputs = { self.outputs = {
'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']), 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
keepdims=self.attrs['keep_dim']) keepdims=self.attrs['keep_dim'])
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册