From 6b30c58a711d8268b208578e74a6581a3341d843 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Fri, 14 Feb 2020 14:53:34 +0800 Subject: [PATCH] [X86] Optimize gru and softmax (#2877) * Optimize softmax. When the input tensor is 2-D and axis is 1, there is no need to resize. * Optimize the gru, avoid calling Tensor::Slice. test=develop * Remove a std::vector in softmax. test=develop * Define CalculateSeqWidth to get the width of a sequence. test=develop --- lite/backends/x86/jit/more/mkl/mkl.h | 7 ++-- lite/backends/x86/math/math_function.cc | 6 +--- lite/kernels/x86/gru_compute.h | 45 +++++++++++++------------ lite/kernels/x86/softmax_compute.h | 37 ++++++++++++-------- 4 files changed, 51 insertions(+), 44 deletions(-) diff --git a/lite/backends/x86/jit/more/mkl/mkl.h b/lite/backends/x86/jit/more/mkl/mkl.h index 8b713e537e..6bc791e645 100644 --- a/lite/backends/x86/jit/more/mkl/mkl.h +++ b/lite/backends/x86/jit/more/mkl/mkl.h @@ -142,14 +142,13 @@ void StrideScal(const T* a, const T* x, T* y, int n, int stride); // remain is the product of dimension shapes after the axis dimension template void Softmax(const T* x, T* y, int n, int bs, int remain = 1) { - std::vector entities(bs); for (int i = 0; i < bs; ++i) { - entities[i] = x[i * n]; + T entity = x[i * n]; for (int c = 1; c < n; ++c) { - entities[i] = x[i * n + c] > entities[i] ? x[i * n + c] : entities[i]; + entity = x[i * n + c] > entity ? x[i * n + c] : entity; } for (int c = 0; c < n; ++c) { - y[i * n + c] = x[i * n + c] - entities[i]; + y[i * n + c] = x[i * n + c] - entity; } } VExp(y, y, n * bs); diff --git a/lite/backends/x86/math/math_function.cc b/lite/backends/x86/math/math_function.cc index f242e14ad1..a17807e8a9 100644 --- a/lite/backends/x86/math/math_function.cc +++ b/lite/backends/x86/math/math_function.cc @@ -110,11 +110,7 @@ void set_constant(const lite::Context& context, lite::Tensor* tensor, float value) { TensorSetConstantWithTarget func(context, tensor, value); - // #ifdef PADDLE_WITH_CUDA - // tensor->target().apply_visitor(func); - // #else func(); - // #endif } template @@ -123,7 +119,7 @@ struct RowwiseAdd { const lite::Tensor& input, const lite::Tensor& vector, lite::Tensor* output) { - auto in_dims = input.dims(); + const auto& in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector.numel(), size); PADDLE_ENFORCE_EQ(output->dims(), in_dims); diff --git a/lite/kernels/x86/gru_compute.h b/lite/kernels/x86/gru_compute.h index 948485105a..89076b51da 100644 --- a/lite/kernels/x86/gru_compute.h +++ b/lite/kernels/x86/gru_compute.h @@ -48,6 +48,10 @@ inline void ReorderInitState(const lite::Context& context, row_shuffle(context, src, index_lod, dst, indexed_src); } +static inline int64_t CalculateSeqWidth(const DDim& dims) { + return dims.count(1, dims.size()); +} + template class GRUCompute : public KernelLite { public: @@ -65,15 +69,16 @@ class GRUCompute : public KernelLite { auto* bias = param.bias; auto* batch_gate = param.batch_gate; - batch_gate->mutable_data(); auto* batch_reset_hidden_prev = param.batch_reset_hidden_prev; - batch_reset_hidden_prev->mutable_data(); auto* batch_hidden = param.batch_hidden; - batch_hidden->mutable_data(); + T* batch_gate_ptr = batch_gate->mutable_data(); + T* batch_reset_hidden_prev_ptr = batch_reset_hidden_prev->mutable_data(); + T* batch_hidden_ptr = batch_hidden->mutable_data(); + auto* hidden = param.hidden; hidden->mutable_data(); - auto hidden_dims = hidden->dims(); + const auto& hidden_dims = hidden->dims(); lite::x86::math::LoDTensor2BatchFunctor to_batch; to_batch(context, *input, batch_gate, true, is_reverse); @@ -90,19 +95,23 @@ class GRUCompute : public KernelLite { const_cast(weight_data + 2 * frame_size * frame_size); Tensor ordered_h0; - std::vector order(batch_gate->lod()[2]); - if (h0) { // Since the batch computing for GRU reorders the input sequences // according to their length. The initialized cell state also needs // to reorder. + const std::vector& order(batch_gate->lod()[2]); ReorderInitState(context, *h0, order, &ordered_h0, true); gru_value.prev_out_value = ordered_h0.mutable_data(); } else { gru_value.prev_out_value = nullptr; } - auto batch_starts = batch_gate->lod()[0]; + + const auto& batch_starts = batch_gate->lod()[0]; size_t seq_len = batch_starts.size() - 1; + int64_t batch_gate_width = CalculateSeqWidth(batch_gate->dims()); + int64_t batch_reset_hidden_prev_width = + CalculateSeqWidth(batch_reset_hidden_prev->dims()); + int64_t batch_hidden_width = CalculateSeqWidth(batch_hidden->dims()); auto active_node = lite::x86::math::detail::GetActivationType(param.activation); auto active_gate = @@ -145,13 +154,10 @@ class GRUCompute : public KernelLite { int64_t bend = static_cast(batch_starts[n + 1]); int64_t cur_batch_size = bend - bstart; - Tensor gate_t = batch_gate->Slice(bstart, bend); - Tensor reset_hidden_prev_t = - batch_reset_hidden_prev->Slice(bstart, bend); - Tensor hidden_t = batch_hidden->Slice(bstart, bend); - gru_value.output_value = hidden_t.mutable_data(); - gru_value.gate_value = gate_t.mutable_data(); - gru_value.reset_output_value = reset_hidden_prev_t.mutable_data(); + gru_value.output_value = batch_hidden_ptr + bstart * batch_hidden_width; + gru_value.gate_value = batch_gate_ptr + bstart * batch_gate_width; + gru_value.reset_output_value = batch_reset_hidden_prev_ptr + + bstart * batch_reset_hidden_prev_width; if (gru_value.prev_out_value) { blas.GEMM_COMPUTE(CblasNoTrans, @@ -188,13 +194,10 @@ class GRUCompute : public KernelLite { int64_t bend = static_cast(batch_starts[n + 1]); int64_t cur_batch_size = bend - bstart; - Tensor gate_t = batch_gate->Slice(bstart, bend); - Tensor reset_hidden_prev_t = - batch_reset_hidden_prev->Slice(bstart, bend); - Tensor hidden_t = batch_hidden->Slice(bstart, bend); - gru_value.output_value = hidden_t.mutable_data(); - gru_value.gate_value = gate_t.mutable_data(); - gru_value.reset_output_value = reset_hidden_prev_t.mutable_data(); + gru_value.output_value = batch_hidden_ptr + bstart * batch_hidden_width; + gru_value.gate_value = batch_gate_ptr + bstart * batch_gate_width; + gru_value.reset_output_value = batch_reset_hidden_prev_ptr + + bstart * batch_reset_hidden_prev_width; lite::x86::math::GRUUnitFunctor::compute( context, diff --git a/lite/kernels/x86/softmax_compute.h b/lite/kernels/x86/softmax_compute.h index 8063cf6566..5a18a80227 100644 --- a/lite/kernels/x86/softmax_compute.h +++ b/lite/kernels/x86/softmax_compute.h @@ -55,24 +55,33 @@ class SoftmaxCompute : public KernelLite { auto& context = ctx_->As(); CHECK(param.output); CHECK(param.x); - param.output->mutable_data(); - const int rank = param.x->dims().size(); + + auto* x = param.x; + auto* output = param.output; + output->mutable_data(); + + const int rank = x->dims().size(); const int axis = CanonicalAxis(param.axis, rank); - int axis_dim = param.x->dims()[axis]; - const int n = SizeToAxis(axis, param.x->dims()); - const int d = SizeFromAxis(axis, param.x->dims()); + int axis_dim = x->dims()[axis]; + if (rank == 2 && axis == 1) { + lite::x86::math::SoftmaxFunctor()( + context, axis_dim, x, output); + } else { + const int n = SizeToAxis(axis, x->dims()); + const int d = SizeFromAxis(axis, x->dims()); - DDim shape(std::vector{n, d}); + DDim x_dims = x->dims(); + DDim out_dims = output->dims(); - Tensor input_2d; - Tensor out_2d; - input_2d.ShareDataWith(*param.x); - input_2d.Resize(shape); - out_2d.ShareDataWith(*param.output); - out_2d.Resize(shape); + DDim shape_2d(std::vector{n, d}); + x->Resize(shape_2d); + output->Resize(shape_2d); - lite::x86::math::SoftmaxFunctor()( - context, axis_dim, &input_2d, &out_2d); + lite::x86::math::SoftmaxFunctor()( + context, axis_dim, x, output); + x->Resize(x_dims); + output->Resize(out_dims); + } } virtual ~SoftmaxCompute() = default; -- GitLab