Unverified commit 0a890050, authored by Yiqun Liu, committed by GitHub

[X86] Optimize gru and softmax (#2877)

* Optimize softmax. When the input tensor is 2-D and axis is 1, there is no need to resize. (Illustrative C++ sketches of these four changes follow the commit header.)

* Optimize the GRU kernel to avoid calling Tensor::Slice.
test=develop

* Remove a std::vector in softmax.
test=develop

* Define CalculateSeqWidth to get the width of a sequence.
test=develop
Parent 31f1a852
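The 2-D softmax fast path: a minimal standalone sketch, with hypothetical helper names that only mirror the SizeToAxis/SizeFromAxis pattern (this is an illustration, not the kernel code). When the tensor is already 2-D and softmax runs over axis 1, the flattened shape [n, d] coincides with the original shape, so the reshape round-trip can be skipped.

#include <cstdint>
#include <vector>

// Hypothetical helpers mirroring the SizeToAxis/SizeFromAxis pattern:
// product of the dims before `axis`, and from `axis` to the end.
static int64_t ProductToAxis(int axis, const std::vector<int64_t>& dims) {
  int64_t p = 1;
  for (int i = 0; i < axis; ++i) p *= dims[i];
  return p;
}
static int64_t ProductFromAxis(int axis, const std::vector<int64_t>& dims) {
  int64_t p = 1;
  for (size_t i = axis; i < dims.size(); ++i) p *= dims[i];
  return p;
}

// For a 2-D tensor with axis == 1, (n, d) is exactly (dims[0], dims[1]),
// so no flatten/resize to [n, d] is needed before the 2-D softmax routine.
static bool NeedsFlattenTo2D(const std::vector<int64_t>& dims, int axis) {
  return !(dims.size() == 2 && axis == 1);
}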
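The Tensor::Slice removal in the GRU kernel comes down to pointer arithmetic on a contiguous row-major buffer, which is what the batched GRU tensors provide. A sketch with an illustrative helper name:

#include <cstdint>

// For a contiguous row-major buffer of shape [total_rows, width], the rows
// [bstart, bend) start at base + bstart * width. Offsetting the pointer
// exposes the same memory as a temporary tensor from Slice(bstart, bend),
// without constructing that temporary at every time step.
// e.g. RowsBegin(batch_gate_ptr, bstart, batch_gate_width)
template <typename T>
T* RowsBegin(T* base, int64_t bstart, int64_t width) {
  return base + bstart * width;
}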
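The std::vector removal in the jit Softmax keeps the algorithm identical and only drops the temporary buffer: each row's maximum is consumed immediately, so a scalar per row replaces the std::vector<T> of size bs. A standalone sketch of that max-subtraction step (the full jit Softmax additionally calls VExp and normalizes afterwards):

template <typename T>
void SubtractRowMax(const T* x, T* y, int n, int bs) {
  for (int i = 0; i < bs; ++i) {
    // Running maximum of row i, held in a scalar instead of entities[i].
    T row_max = x[i * n];
    for (int c = 1; c < n; ++c) {
      row_max = x[i * n + c] > row_max ? x[i * n + c] : row_max;
    }
    // Subtract the row maximum for numerical stability before exponentiation.
    for (int c = 0; c < n; ++c) {
      y[i * n + c] = x[i * n + c] - row_max;
    }
  }
}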
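The new CalculateSeqWidth helper, written out over a plain shape vector for clarity (the committed version uses DDim::count(1, dims.size())): the width of a sequence tensor is the product of every dimension after the first, i.e. the number of elements per row. Multiplying this width by bstart converts a batch-row offset into an element offset, which is how the per-step slices are replaced.

#include <cstdint>
#include <vector>

// Same quantity as CalculateSeqWidth in the diff, spelled out explicitly:
// dims.count(1, dims.size()) == product of dims[1..rank).
static int64_t SeqWidth(const std::vector<int64_t>& dims) {
  int64_t width = 1;
  for (size_t i = 1; i < dims.size(); ++i) width *= dims[i];
  return width;
}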
@@ -142,14 +142,13 @@ void StrideScal(const T* a, const T* x, T* y, int n, int stride);
 // remain is the product of dimension shapes after the axis dimension
 template <typename T>
 void Softmax(const T* x, T* y, int n, int bs, int remain = 1) {
-  std::vector<T> entities(bs);
   for (int i = 0; i < bs; ++i) {
-    entities[i] = x[i * n];
+    T entity = x[i * n];
     for (int c = 1; c < n; ++c) {
-      entities[i] = x[i * n + c] > entities[i] ? x[i * n + c] : entities[i];
+      entity = x[i * n + c] > entity ? x[i * n + c] : entity;
     }
     for (int c = 0; c < n; ++c) {
-      y[i * n + c] = x[i * n + c] - entities[i];
+      y[i * n + c] = x[i * n + c] - entity;
     }
   }
   VExp(y, y, n * bs);
...
@@ -110,11 +110,7 @@ void set_constant(const lite::Context<Target>& context,
                   lite::Tensor* tensor,
                   float value) {
   TensorSetConstantWithTarget<Target> func(context, tensor, value);
-  // #ifdef PADDLE_WITH_CUDA
-  //   tensor->target().apply_visitor(func);
-  // #else
   func();
-  // #endif
 }
 
 template <typename T>
@@ -123,7 +119,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
                   const lite::Tensor& input,
                   const lite::Tensor& vector,
                   lite::Tensor* output) {
-    auto in_dims = input.dims();
+    const auto& in_dims = input.dims();
     auto size = input.numel() / in_dims[0];
     PADDLE_ENFORCE_EQ(vector.numel(), size);
     PADDLE_ENFORCE_EQ(output->dims(), in_dims);
...
@@ -48,6 +48,10 @@ inline void ReorderInitState(const lite::Context<TARGET(kX86)>& context,
   row_shuffle(context, src, index_lod, dst, indexed_src);
 }
 
+static inline int64_t CalculateSeqWidth(const DDim& dims) {
+  return dims.count(1, dims.size());
+}
+
 template <typename T>
 class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
  public:
@@ -65,15 +69,16 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
     auto* bias = param.bias;
 
     auto* batch_gate = param.batch_gate;
-    batch_gate->mutable_data<T>();
     auto* batch_reset_hidden_prev = param.batch_reset_hidden_prev;
-    batch_reset_hidden_prev->mutable_data<T>();
     auto* batch_hidden = param.batch_hidden;
-    batch_hidden->mutable_data<T>();
+    T* batch_gate_ptr = batch_gate->mutable_data<T>();
+    T* batch_reset_hidden_prev_ptr =
+        batch_reset_hidden_prev->mutable_data<T>();
+    T* batch_hidden_ptr = batch_hidden->mutable_data<T>();
     auto* hidden = param.hidden;
     hidden->mutable_data<T>();
 
-    auto hidden_dims = hidden->dims();
+    const auto& hidden_dims = hidden->dims();
 
     lite::x86::math::LoDTensor2BatchFunctor<TARGET(kX86), T> to_batch;
     to_batch(context, *input, batch_gate, true, is_reverse);
@@ -90,19 +95,23 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
         const_cast<T*>(weight_data + 2 * frame_size * frame_size);
 
     Tensor ordered_h0;
-    std::vector<size_t> order(batch_gate->lod()[2]);
     if (h0) {
       // Since the batch computing for GRU reorders the input sequences
      // according to their length. The initialized cell state also needs
       // to reorder.
+      const std::vector<size_t>& order(batch_gate->lod()[2]);
       ReorderInitState<T>(context, *h0, order, &ordered_h0, true);
       gru_value.prev_out_value = ordered_h0.mutable_data<T>();
     } else {
       gru_value.prev_out_value = nullptr;
     }
 
-    auto batch_starts = batch_gate->lod()[0];
+    const auto& batch_starts = batch_gate->lod()[0];
     size_t seq_len = batch_starts.size() - 1;
+    int64_t batch_gate_width = CalculateSeqWidth(batch_gate->dims());
+    int64_t batch_reset_hidden_prev_width =
+        CalculateSeqWidth(batch_reset_hidden_prev->dims());
+    int64_t batch_hidden_width = CalculateSeqWidth(batch_hidden->dims());
     auto active_node =
         lite::x86::math::detail::GetActivationType(param.activation);
     auto active_gate =
@@ -145,13 +154,10 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
       int64_t bend = static_cast<int64_t>(batch_starts[n + 1]);
       int64_t cur_batch_size = bend - bstart;
 
-      Tensor gate_t = batch_gate->Slice<T>(bstart, bend);
-      Tensor reset_hidden_prev_t =
-          batch_reset_hidden_prev->Slice<T>(bstart, bend);
-      Tensor hidden_t = batch_hidden->Slice<T>(bstart, bend);
-      gru_value.output_value = hidden_t.mutable_data<T>();
-      gru_value.gate_value = gate_t.mutable_data<T>();
-      gru_value.reset_output_value = reset_hidden_prev_t.mutable_data<T>();
+      gru_value.output_value = batch_hidden_ptr + bstart * batch_hidden_width;
+      gru_value.gate_value = batch_gate_ptr + bstart * batch_gate_width;
+      gru_value.reset_output_value = batch_reset_hidden_prev_ptr +
+                                     bstart * batch_reset_hidden_prev_width;
 
       if (gru_value.prev_out_value) {
         blas.GEMM_COMPUTE(CblasNoTrans,
@@ -188,13 +194,10 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
       int64_t bend = static_cast<int64_t>(batch_starts[n + 1]);
       int64_t cur_batch_size = bend - bstart;
 
-      Tensor gate_t = batch_gate->Slice<T>(bstart, bend);
-      Tensor reset_hidden_prev_t =
-          batch_reset_hidden_prev->Slice<T>(bstart, bend);
-      Tensor hidden_t = batch_hidden->Slice<T>(bstart, bend);
-      gru_value.output_value = hidden_t.mutable_data<T>();
-      gru_value.gate_value = gate_t.mutable_data<T>();
-      gru_value.reset_output_value = reset_hidden_prev_t.mutable_data<T>();
+      gru_value.output_value = batch_hidden_ptr + bstart * batch_hidden_width;
+      gru_value.gate_value = batch_gate_ptr + bstart * batch_gate_width;
+      gru_value.reset_output_value = batch_reset_hidden_prev_ptr +
+                                     bstart * batch_reset_hidden_prev_width;
 
       lite::x86::math::GRUUnitFunctor<TARGET(kX86), T>::compute(
           context,
...
@@ -55,24 +55,33 @@ class SoftmaxCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
     auto& context = ctx_->As<X86Context>();
     CHECK(param.output);
     CHECK(param.x);
-    param.output->mutable_data<T>();
-    const int rank = param.x->dims().size();
+    auto* x = param.x;
+    auto* output = param.output;
+    output->mutable_data<T>();
+
+    const int rank = x->dims().size();
     const int axis = CanonicalAxis(param.axis, rank);
-    int axis_dim = param.x->dims()[axis];
-    const int n = SizeToAxis(axis, param.x->dims());
-    const int d = SizeFromAxis(axis, param.x->dims());
-
-    DDim shape(std::vector<DDim::value_type>{n, d});
-
-    Tensor input_2d;
-    Tensor out_2d;
-    input_2d.ShareDataWith(*param.x);
-    input_2d.Resize(shape);
-    out_2d.ShareDataWith(*param.output);
-    out_2d.Resize(shape);
-
-    lite::x86::math::SoftmaxFunctor<lite::TargetType::kX86, T, true>()(
-        context, axis_dim, &input_2d, &out_2d);
+    int axis_dim = x->dims()[axis];
+
+    if (rank == 2 && axis == 1) {
+      lite::x86::math::SoftmaxFunctor<lite::TargetType::kX86, T, true>()(
+          context, axis_dim, x, output);
+    } else {
+      const int n = SizeToAxis(axis, x->dims());
+      const int d = SizeFromAxis(axis, x->dims());
+
+      DDim x_dims = x->dims();
+      DDim out_dims = output->dims();
+      DDim shape_2d(std::vector<DDim::value_type>{n, d});
+      x->Resize(shape_2d);
+      output->Resize(shape_2d);
+
+      lite::x86::math::SoftmaxFunctor<lite::TargetType::kX86, T, true>()(
+          context, axis_dim, x, output);
+
+      x->Resize(x_dims);
+      output->Resize(out_dims);
+    }
   }
 
   virtual ~SoftmaxCompute() = default;
...