diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index 4847eb362699053be59bfab9726a10b037c39c87..2b5094925c1ab7a7cca5c8f04b5f6c5d9e7e29f1 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -14,6 +14,11 @@ limitations under the License. */ #include "paddle/fluid/operators/gru_op.h" #include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" +#include "paddle/fluid/operators/math/detail/gru_kernel.h" + +DECLARE_int32(paddle_num_threads); namespace paddle { namespace operators { @@ -264,76 +269,94 @@ class GRUCPUKernel : public framework::OpKernel { gru_value.prev_out_value = nullptr; } auto batch_starts = batch_gate->lod()[0]; - size_t num_batch = batch_starts.size() - 1; + size_t seq_len = batch_starts.size() - 1; auto active_node = math::detail::GetActivationType( context.Attr("activation")); auto active_gate = math::detail::GetActivationType( context.Attr("gate_activation")); #ifdef PADDLE_WITH_MKLML - auto blas = math::GetBlas(dev_ctx); - // TODO(TJ): make a class - T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/, - frame_size * 2 /*width of weight*/, - frame_size /*height of height*/); - PADDLE_ENFORCE(packed_gate); - blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size * 2, - frame_size, T(1.0), gru_value.gate_weight, frame_size * 2, - packed_gate); - T* packed_state = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/, - frame_size /*width of weight*/, - frame_size /*height of height*/); - PADDLE_ENFORCE(packed_state); - blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size, - frame_size, T(1.0), gru_value.state_weight, frame_size, - packed_state); -#endif - for (size_t n = 0; n < num_batch; n++) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - int cur_batch_size = bend - bstart; - - Tensor gate_t = batch_gate->Slice(bstart, bend); - Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); - Tensor hidden_t = batch_hidden->Slice(bstart, bend); - gru_value.output_value = hidden_t.data(); - gru_value.gate_value = gate_t.data(); - gru_value.reset_output_value = reset_hidden_prev_t.data(); + if (FLAGS_paddle_num_threads >= 4) { + auto blas = math::GetBlas(dev_ctx); + T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/, + frame_size * 2 /*width of weight*/, + frame_size /*height of height*/); + PADDLE_ENFORCE(packed_gate); + blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size * 2, + frame_size, T(1.0), gru_value.gate_weight, frame_size * 2, + packed_gate); + T* packed_state = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/, + frame_size /*width of weight*/, + frame_size /*height of height*/); + PADDLE_ENFORCE(packed_state); + blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size, + frame_size, T(1.0), gru_value.state_weight, frame_size, + packed_state); + for (size_t n = 0; n < seq_len; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; -#ifdef PADDLE_WITH_MKLML - if (gru_value.prev_out_value) { - blas.GEMM_COMPUTE(CblasNoTrans, CblasPacked, cur_batch_size, - frame_size * 2, frame_size, gru_value.prev_out_value, - frame_size, packed_gate, frame_size * 2, T(1), - gru_value.gate_value, frame_size * 3); - } + Tensor gate_t = batch_gate->Slice(bstart, bend); + Tensor reset_hidden_prev_t = + batch_reset_hidden_prev->Slice(bstart, bend); + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + gru_value.output_value = hidden_t.data(); + gru_value.gate_value = gate_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); - math::detail::forward_reset_output( - math::detail::forward::gru_resetOutput(), gru_value, frame_size, - cur_batch_size, active_gate); + if (gru_value.prev_out_value) { + blas.GEMM_COMPUTE( + CblasNoTrans, CblasPacked, cur_batch_size, frame_size * 2, + frame_size, gru_value.prev_out_value, frame_size, packed_gate, + frame_size * 2, T(1), gru_value.gate_value, frame_size * 3); + } - if (gru_value.prev_out_value) { - blas.GEMM_COMPUTE( - CblasNoTrans, CblasPacked, cur_batch_size, frame_size, frame_size, - gru_value.reset_output_value, frame_size, packed_state, frame_size, - T(1), gru_value.gate_value + frame_size * 2, frame_size * 3); + math::detail::forward_reset_output( + math::detail::forward::gru_resetOutput(), gru_value, frame_size, + cur_batch_size, active_gate); + + if (gru_value.prev_out_value) { + blas.GEMM_COMPUTE( + CblasNoTrans, CblasPacked, cur_batch_size, frame_size, frame_size, + gru_value.reset_output_value, frame_size, packed_state, + frame_size, T(1), gru_value.gate_value + frame_size * 2, + frame_size * 3); + } + + math::detail::forward_final_output( + math::detail::forward::gru_finalOutput(), gru_value, frame_size, + cur_batch_size, active_node); + + gru_value.prev_out_value = gru_value.output_value; } - math::detail::forward_final_output( - math::detail::forward::gru_finalOutput(), gru_value, frame_size, - cur_batch_size, active_node); -#else - math::GRUUnitFunctor::compute( - dev_ctx, gru_value, frame_size, cur_batch_size, active_node, - active_gate); + blas.GEMM_FREE(packed_gate); + blas.GEMM_FREE(packed_state); + } else { #endif - gru_value.prev_out_value = gru_value.output_value; - } + for (size_t n = 0; n < seq_len; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + Tensor gate_t = batch_gate->Slice(bstart, bend); + Tensor reset_hidden_prev_t = + batch_reset_hidden_prev->Slice(bstart, bend); + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + gru_value.output_value = hidden_t.data(); + gru_value.gate_value = gate_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); + + math::GRUUnitFunctor::compute( + dev_ctx, gru_value, frame_size, cur_batch_size, active_node, + active_gate); + + gru_value.prev_out_value = gru_value.output_value; + } #ifdef PADDLE_WITH_MKLML - blas.GEMM_FREE(packed_gate); - blas.GEMM_FREE(packed_state); + } #endif - math::Batch2LoDTensorFunctor to_seq; batch_hidden->set_lod(batch_gate->lod()); to_seq(dev_ctx, *batch_hidden, hidden); diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h index 0bf4e6bc447ad02cc621dd6c45afafcf4aa992e5..0b551e8046be16c95f7d6b10b68b32a9af594f73 100644 --- a/paddle/fluid/operators/gru_op.h +++ b/paddle/fluid/operators/gru_op.h @@ -16,10 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" -#include "paddle/fluid/operators/math/detail/gru_kernel.h" #include "paddle/fluid/operators/math/gru_compute.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence2batch.h"