diff --git a/lite/backends/x86/math/CMakeLists.txt b/lite/backends/x86/math/CMakeLists.txt
index 5f440947fe39c4b4dac921d1f9114b8fb65c8f3c..5cc4a9f0770e79683197bce0bb83336b8c79c364 100644
--- a/lite/backends/x86/math/CMakeLists.txt
+++ b/lite/backends/x86/math/CMakeLists.txt
@@ -30,13 +30,13 @@
 math_library(sample_prob)
 math_library(sampler)
 math_library(gru_compute DEPS activation_functions math_function)
-## math_library(lstm_compute DEPS activation_functions)
+math_library(lstm_compute DEPS activation_functions)
 lite_cc_library(blas SRCS blas.cc DEPS cblas framework_proto eigen3)
 math_library(math_function DEPS blas)
 math_library(maxouting)
 math_library(pooling)
-# math_library(selected_rows_functor DEPS selected_rows math_function blas)
+math_library(selected_rows_functor DEPS selected_rows math_function blas)
 math_library(sequence2batch)
 math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function jit_kernel_helper)
diff --git a/lite/backends/x86/math/detail/lstm_cpu_kernel.h b/lite/backends/x86/math/detail/lstm_cpu_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..3091cc5679e053d0d93855822b14abc1f412b753
--- /dev/null
+++ b/lite/backends/x86/math/detail/lstm_cpu_kernel.h
@@ -0,0 +1,431 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <type_traits>
+#include "lite/backends/x86/math/detail/activation_functions.h"
+#include "lite/backends/x86/math/lstm_compute.h"
+
+#if defined(_WIN32)
+#if defined(__AVX2__) || defined(__AVX__)
+inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); }
+#endif
+#endif
+
+namespace paddle {
+namespace lite {
+namespace x86 {
+namespace math {
+namespace detail {
+
+#ifndef __NVCC__
+
+template <class T, class Op>
+void naive_lstm_forward_one_sequence(Op op,
+                                     LstmMetaValue<T> value,
+                                     int frame_size,
+                                     T cell_clip,
+                                     ActivationType active_node,
+                                     ActivationType active_gate,
+                                     ActivationType active_state) {
+  T r_value_in;
+  T r_value_ig;
+  T r_value_fg;
+  T r_value_og;
+  T r_checkI;
+  T r_checkF;
+  T r_checkO;
+  T r_state;
+  T r_prev_state = 0;
+  T r_state_atv;
+  T r_out;
+
+  T *value_in = value.gate_value;
+  T *value_ig = value.gate_value + frame_size;
+  T *value_fg = value.gate_value + frame_size * 2;
+  T *value_og = value.gate_value + frame_size * 3;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_value_in = value_in[i];
+    r_value_ig = value_ig[i];
+    r_value_fg = value_fg[i];
+    r_value_og = value_og[i];
+    r_checkI = value.check_ig ? value.check_ig[i] : 0;
+    r_checkF = value.check_fg ? value.check_fg[i] : 0;
+    r_checkO = value.check_og ? value.check_og[i] : 0;
+
+    if (value.prev_state_value) {
+      r_prev_state = value.prev_state_value[i];
+    }
+
+    op(&r_value_in,
+       &r_value_ig,
+       &r_value_fg,
+       &r_value_og,
+       &r_prev_state,
+       &r_state,
+       &r_state_atv,
+       &r_out,
+       &r_checkI,
+       &r_checkF,
+       &r_checkO,
+       &cell_clip,
+       active_node,
+       active_gate,
+       active_state);
+
+    value_in[i] = r_value_in;
+    value_ig[i] = r_value_ig;
+    value_fg[i] = r_value_fg;
+    value_og[i] = r_value_og;
+    value.state_value[i] = r_state;
+    value.state_active_value[i] = r_state_atv;
+    value.output_value[i] = r_out;
+  }
+}
+
+template <class T, class Op>
+void naive_lstm_backward_one_sequence(Op op,
+                                      LstmMetaValue<T> value,
+                                      LstmMetaGrad<T> grad,
+                                      int frame_size,
+                                      T cell_clip,
+                                      ActivationType active_node,
+                                      ActivationType active_gate,
+                                      ActivationType active_state) {
+  T r_value_in;
+  T r_value_ig;
+  T r_value_fg;
+  T r_value_og;
+  T r_grad_in;
+  T r_grad_ig;
+  T r_grad_fg;
+  T r_grad_og;
+  T r_prev_state = 0;
+  T r_prev_state_grad;
+  T r_state;
+  T r_state_grad;
+  T r_state_atv;
+  T r_output_grad;
+  T r_checkI;
+  T r_checkF;
+  T r_checkO;
+  T r_checkIGrad;
+  T r_checkFGrad;
+  T r_checkOGrad;
+
+  T *value_in = value.gate_value;
+  T *value_ig = value.gate_value + frame_size;
+  T *value_fg = value.gate_value + frame_size * 2;
+  T *value_og = value.gate_value + frame_size * 3;
+  T *grad_in = grad.gate_grad;
+  T *grad_ig = grad.gate_grad + frame_size;
+  T *grad_fg = grad.gate_grad + frame_size * 2;
+  T *grad_og = grad.gate_grad + frame_size * 3;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_value_in = value_in[i];
+    r_value_ig = value_ig[i];
+    r_value_fg = value_fg[i];
+    r_value_og = value_og[i];
+    r_checkI = value.check_ig ? value.check_ig[i] : 0;
+    r_checkF = value.check_fg ? value.check_fg[i] : 0;
+    r_checkO = value.check_og ? value.check_og[i] : 0;
+    r_state = value.state_value[i];
+    r_state_atv = value.state_active_value[i];
+    r_output_grad = grad.output_grad[i];
+    r_state_grad = grad.state_grad[i];
+    if (value.prev_state_value) {
+      r_prev_state = value.prev_state_value[i];
+    }
+
+    op(&r_value_in,
+       &r_value_ig,
+       &r_value_fg,
+       &r_value_og,
+       &r_grad_in,
+       &r_grad_ig,
+       &r_grad_fg,
+       &r_grad_og,
+       &r_prev_state,
+       &r_prev_state_grad,
+       &r_state,
+       &r_state_grad,
+       &r_state_atv,
+       &r_output_grad,
+       &r_checkI,
+       &r_checkF,
+       &r_checkO,
+       &r_checkIGrad,
+       &r_checkFGrad,
+       &r_checkOGrad,
+       &cell_clip,
+       active_node,
+       active_gate,
+       active_state);
+
+    grad_in[i] = r_grad_in;
+    grad_ig[i] = r_grad_ig;
+    grad_fg[i] = r_grad_fg;
+    grad_og[i] = r_grad_og;
+    grad.state_grad[i] = r_state_grad;
+
+    if (grad.prev_state_grad) grad.prev_state_grad[i] = r_prev_state_grad;
+    if (value.prev_state_value) {
+      if (grad.check_ig_grad) grad.check_ig_grad[i] += r_checkIGrad;
+      if (grad.check_fg_grad) grad.check_fg_grad[i] += r_checkFGrad;
+    }
+    if (grad.check_og_grad) grad.check_og_grad[i] += r_checkOGrad;
+  }
+}
+
+template <class T, class Op>
+void avx_lstm_forward_one_sequence(Op op,
+                                   LstmMetaValue<T> value,
+                                   int frame_size,
+                                   T cell_clip,
+                                   ActivationType active_node,
+                                   ActivationType active_gate,
+                                   ActivationType active_state) {
+#ifdef __AVX__
+  __m256 r_value_in;
+  __m256 r_value_ig;
+  __m256 r_value_fg;
+  __m256 r_value_og;
+  __m256 r_checkI = _mm256_set1_ps(0.0f);
+  __m256 r_checkF = _mm256_set1_ps(0.0f);
+  __m256 r_checkO = _mm256_set1_ps(0.0f);
+  __m256 r_state;
+  __m256 r_prev_state = _mm256_set1_ps(0.0f);
+  __m256 r_state_atv;
+  __m256 r_out;
+
+  __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
+  __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
+  __m256 *value_fg =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
+  __m256 *value_og =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_value_in = value_in[i];
+    r_value_ig = value_ig[i];
+    r_value_fg = value_fg[i];
+    r_value_og = value_og[i];
+    if (value.check_ig) {
+      r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i];
+      r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i];
+      r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i];
+    }
+
+    if (value.prev_state_value) {
+      r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
+    }
+
+    op(&r_value_in,
+       &r_value_ig,
+       &r_value_fg,
+       &r_value_og,
+       &r_prev_state,
+       &r_state,
+       &r_state_atv,
+       &r_out,
+       &r_checkI,
+       &r_checkF,
+       &r_checkO,
+       &cell_clip,
+       active_node,
+       active_gate,
+       active_state);
+
+    value_in[i] = r_value_in;
+    value_ig[i] = r_value_ig;
+    value_fg[i] = r_value_fg;
+    value_og[i] = r_value_og;
+    (reinterpret_cast<__m256 *>(value.state_value))[i] = r_state;
+    (reinterpret_cast<__m256 *>(value.state_active_value))[i] = r_state_atv;
+    (reinterpret_cast<__m256 *>(value.output_value))[i] = r_out;
+  }
+#endif
+}
+
+template <class T, class Op>
+void avx_lstm_backward_one_sequence(Op op,
+                                    LstmMetaValue<T> value,
+                                    LstmMetaGrad<T> grad,
+                                    int frame_size,
+                                    T cell_clip,
+                                    ActivationType active_node,
+                                    ActivationType active_gate,
+                                    ActivationType active_state) {
+#ifdef __AVX__
+  __m256 r_value_in;
+  __m256 r_value_ig;
+  __m256 r_value_fg;
+  __m256 r_value_og;
+  __m256 r_grad_in;
+  __m256 r_grad_ig;
+  __m256 r_grad_fg;
+  __m256 r_grad_og;
+  __m256 r_prev_state = _mm256_set1_ps(0.0f);
+  __m256 r_prev_state_grad;
+  __m256 r_state_grad;
+  __m256 r_state;
+  __m256 r_state_atv;
+  __m256 r_output_grad;
+  __m256 r_checkI = _mm256_set1_ps(0.0f);
+  __m256 r_checkF = _mm256_set1_ps(0.0f);
+  __m256 r_checkO = _mm256_set1_ps(0.0f);
+  __m256 r_checkIGrad;
+  __m256 r_checkFGrad;
+  __m256 r_checkOGrad;
+
+  __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
+  __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
+  __m256 *value_fg =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
+  __m256 *value_og =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);
+  __m256 *grad_in = reinterpret_cast<__m256 *>(grad.gate_grad);
+  __m256 *grad_ig = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size);
+  __m256 *grad_fg = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 2);
+  __m256 *grad_og = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 3);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_value_in = value_in[i];
+    r_value_ig = value_ig[i];
+    r_value_fg = value_fg[i];
+    r_value_og = value_og[i];
+    if (value.check_ig) {
+      r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i];
+      r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i];
+      r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i];
+    }
+    r_state = (reinterpret_cast<__m256 *>(value.state_value))[i];
+    r_state_atv = (reinterpret_cast<__m256 *>(value.state_active_value))[i];
+    r_output_grad = (reinterpret_cast<__m256 *>(grad.output_grad))[i];
+    r_state_grad = (reinterpret_cast<__m256 *>(grad.state_grad))[i];
+    if (value.prev_state_value) {
+      r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
+    }
+
+    op(&r_value_in,
+       &r_value_ig,
+       &r_value_fg,
+       &r_value_og,
+       &r_grad_in,
+       &r_grad_ig,
+       &r_grad_fg,
+       &r_grad_og,
+       &r_prev_state,
+       &r_prev_state_grad,
+       &r_state,
+       &r_state_grad,
+       &r_state_atv,
+       &r_output_grad,
+       &r_checkI,
+       &r_checkF,
+       &r_checkO,
+       &r_checkIGrad,
+       &r_checkFGrad,
+       &r_checkOGrad,
+       &cell_clip,
+       active_node,
+       active_gate,
+       active_state);
+
+    grad_in[i] = r_grad_in;
+    grad_ig[i] = r_grad_ig;
+    grad_fg[i] = r_grad_fg;
+    grad_og[i] = r_grad_og;
+    (reinterpret_cast<__m256 *>(grad.state_grad))[i] = r_state_grad;
+
+    if (grad.prev_state_grad)
+      (reinterpret_cast<__m256 *>(grad.prev_state_grad))[i] = r_prev_state_grad;
+    if (value.prev_state_value) {
+      if (grad.check_ig_grad)
+        (reinterpret_cast<__m256 *>(grad.check_ig_grad))[i] += r_checkIGrad;
+      if (grad.check_fg_grad)
+        (reinterpret_cast<__m256 *>(grad.check_fg_grad))[i] += r_checkFGrad;
+    }
+    if (grad.check_og_grad)
+      (reinterpret_cast<__m256 *>(grad.check_og_grad))[i] += r_checkOGrad;
+  }
+#endif
+}
+
+template <class T, class Op>
+void cpu_lstm_forward(Op op,
+                      LstmMetaValue<T> value,
+                      int frame_size,
+                      T cell_clip,
+                      ActivationType active_node,
+                      ActivationType active_gate,
+                      ActivationType active_state) {
+  if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
+    avx_lstm_forward_one_sequence<T>(op,
+                                     value,
+                                     frame_size,
+                                     cell_clip,
+                                     active_node,
+                                     active_gate,
+                                     active_state);
+  } else {
+    naive_lstm_forward_one_sequence<T>(op,
+                                       value,
+                                       frame_size,
+                                       cell_clip,
+                                       active_node,
+                                       active_gate,
+                                       active_state);
+  }
+}
+
+template <class T, class Op>
+void cpu_lstm_backward(Op op,
+                       LstmMetaValue<T> value,
+                       LstmMetaGrad<T> grad,
+                       int frame_size,
+                       T cell_clip,
+                       ActivationType active_node,
+                       ActivationType active_gate,
+                       ActivationType active_state) {
+  if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
+    avx_lstm_backward_one_sequence<T>(op,
+                                      value,
+                                      grad,
+                                      frame_size,
+                                      cell_clip,
+                                      active_node,
+                                      active_gate,
+                                      active_state);
+  } else {
+    naive_lstm_backward_one_sequence<T>(op,
+                                        value,
+                                        grad,
+                                        frame_size,
+                                        cell_clip,
+                                        active_node,
+                                        active_gate,
+                                        active_state);
+  }
+}
+
+#endif
+
+}  // namespace detail
+}  // namespace math
+}  // namespace x86
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/x86/math/detail/lstm_kernel.h b/lite/backends/x86/math/detail/lstm_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..1286f2e8b70b85c0a5aea709b99c854771fea72f
--- /dev/null
+++ b/lite/backends/x86/math/detail/lstm_kernel.h
@@ -0,0 +1,236 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <type_traits>
+#include "lite/backends/x86/math/detail/activation_functions.h"
+
+namespace paddle {
+namespace lite {
+namespace x86 {
+namespace math {
+namespace detail {
+
+namespace forward {
+
+template <class T>
+class lstm {
+ public:
+  HOSTDEVICE void operator()(T *value_in,
+                             T *value_ig,
+                             T *value_fg,
+                             T *value_og,
+                             T *prev_state,
+                             T *state,
+                             T *state_atv,
+                             T *output,
+                             T *checkI,
+                             T *checkF,
+                             T *checkO,
+                             T *cell_clip,
+                             ActivationType active_node,
+                             ActivationType active_gate,
+                             ActivationType active_state) {
+    *value_in = activation(*value_in, active_node);
+    *value_ig = activation(*value_ig + (*prev_state) * (*checkI), active_gate);
+    *value_fg = activation(*value_fg + (*prev_state) * (*checkF), active_gate);
+    *state = (*value_in) * (*value_ig) + (*prev_state) * (*value_fg);
+
+    if (*cell_clip > 0.0) {
+      if (*state < -1.0 * (*cell_clip)) {
+        *state = -1.0 * (*cell_clip);
+      }
+      if (*state > *cell_clip) {
+        *state = *cell_clip;
+      }
+    }
+    *value_og = activation(*value_og + (*state) * (*checkO), active_gate);
+    *state_atv = activation(*state, active_state);
+    *output = (*value_og) * (*state_atv);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__  // If not compiled with AVX instructions, disable AVX by default
+  static const bool avx = false;
+#else
+  // Only float supports the AVX optimization
+  static const bool avx = std::is_same<T, float>::value;
+
+  HOSTDEVICE void operator()(__m256 *value_in,
+                             __m256 *value_ig,
+                             __m256 *value_fg,
+                             __m256 *value_og,
+                             __m256 *prev_state,
+                             __m256 *state,
+                             __m256 *state_atv,
+                             __m256 *output,
+                             __m256 *checkI,
+                             __m256 *checkF,
+                             __m256 *checkO,
+                             T *cell_clip,
+                             ActivationType active_node,
+                             ActivationType active_gate,
+                             ActivationType active_state) {
+    *value_in = activation(*value_in, active_node);
+    *value_ig = activation(
+        _mm256_add_ps(*value_ig, _mm256_mul_ps(*prev_state, *checkI)),
+        active_gate);
+    *value_fg = activation(
+        _mm256_add_ps(*value_fg, _mm256_mul_ps(*prev_state, *checkF)),
+        active_gate);
+    *state = _mm256_add_ps(_mm256_mul_ps(*value_in, *value_ig),
+                           _mm256_mul_ps(*prev_state, *value_fg));
+
+    if (*cell_clip > 0.0f) {
+      __m256 min = _mm256_set1_ps(0.0f - *cell_clip);
+      __m256 max = _mm256_set1_ps(*cell_clip);
+      *state = _mm256_min_ps(max, *state);
+      *state = _mm256_max_ps(min, *state);
+    }
+    *value_og = activation(
+        _mm256_add_ps(*value_og, _mm256_mul_ps(*state, *checkO)), active_gate);
+    *state_atv = activation(*state, active_state);
+    *output = _mm256_mul_ps(*value_og, *state_atv);
+  }
+#endif
+#endif
+};
+
+}  // namespace forward
+
+namespace backward {
+
+template <class T>
+class lstm {
+ public:
+  HOSTDEVICE void operator()(T *value_in,
+                             T *value_ig,
+                             T *value_fg,
+                             T *value_og,
+                             T *grad_in,
+                             T *grad_ig,
+                             T *grad_fg,
+                             T *grad_og,
+                             T *prev_state,
+                             T *prev_state_grad,
+                             T *state,
+                             T *state_grad,
+                             T *state_atv,
+                             T *output_grad,
+                             T *checkI,
+                             T *checkF,
+                             T *checkO,
+                             T *checkIGrad,
+                             T *checkFGrad,
+                             T *checkOGrad,
+                             T *cell_clip,
+                             ActivationType active_node,
+                             ActivationType active_gate,
+                             ActivationType active_state) {
+    *grad_og =
+        activation((*output_grad) * (*state_atv), *value_og, active_gate);
+    if (*cell_clip > 0.0f) {
+      if (*state >= (*cell_clip) || *state <= (0.0f - (*cell_clip))) {
+        *state_grad = 0.0f;
+      } else {
+        *state_grad +=
+            activation((*output_grad) * (*value_og), *state_atv, active_state) +
+            (*grad_og) * (*checkO);
+      }
+    } else {
+      *state_grad +=
+          activation((*output_grad) * (*value_og), *state_atv, active_state) +
+          (*grad_og) * (*checkO);
+    }
+
+    *grad_in = activation((*state_grad) * (*value_ig), *value_in, active_node);
+    *grad_ig = activation((*state_grad) * (*value_in), *value_ig, active_gate);
+    *grad_fg =
+        activation((*state_grad) * (*prev_state), *value_fg, active_gate);
+    *prev_state_grad = (*grad_ig) * (*checkI) + (*grad_fg) * (*checkF) +
+                       (*state_grad) * (*value_fg);
+    *checkIGrad = (*grad_ig) * (*prev_state);
+    *checkFGrad = (*grad_fg) * (*prev_state);
+    *checkOGrad = (*grad_og) * (*state);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__  // If not compiled with AVX instructions, disable AVX by default
+  static const bool avx = false;
+#else
+  // Only float supports the AVX optimization
+  static const bool avx = std::is_same<T, float>::value;
+  HOSTDEVICE void operator()(__m256 *value_in,
+                             __m256 *value_ig,
+                             __m256 *value_fg,
+                             __m256 *value_og,
+                             __m256 *grad_in,
+                             __m256 *grad_ig,
+                             __m256 *grad_fg,
+                             __m256 *grad_og,
+                             __m256 *prev_state,
+                             __m256 *prev_state_grad,
+                             __m256 *state,
+                             __m256 *state_grad,
+                             __m256 *state_atv,
+                             __m256 *output_grad,
+                             __m256 *checkI,
+                             __m256 *checkF,
+                             __m256 *checkO,
+                             __m256 *checkIGrad,
+                             __m256 *checkFGrad,
+                             __m256 *checkOGrad,
+                             T *cell_clip,
+                             ActivationType active_node,
+                             ActivationType active_gate,
+                             ActivationType active_state) {
+    *grad_og = activation(
+        _mm256_mul_ps(*output_grad, *state_atv), *value_og, active_gate);
+    if (*cell_clip > 0.0f) {
+      T *state_ = reinterpret_cast<T *>(state);
+      if (*state_ >= (*cell_clip) || *state_ <= (0.0f - (*cell_clip))) {
+        *state_grad = _mm256_set1_ps(0.0f);
+      } else {
+        *state_grad =
+            _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og),
+                                     *state_atv,
+                                     active_state),
+                          *state_grad);
+        *state_grad =
+            _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
+      }
+    }
+    *grad_in = activation(
+        _mm256_mul_ps(*state_grad, *value_ig), *value_in, active_node);
+    *grad_ig = activation(
+        _mm256_mul_ps(*state_grad, *value_in), *value_ig, active_gate);
+    *grad_fg = activation(
+        _mm256_mul_ps(*state_grad, *prev_state), *value_fg, active_gate);
+    *prev_state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_ig, *checkI),
+                                     _mm256_mul_ps(*grad_fg, *checkF));
+    *prev_state_grad =
+        _mm256_add_ps(_mm256_mul_ps(*state_grad, *value_fg), *prev_state_grad);
+    *checkIGrad = _mm256_mul_ps(*grad_ig, *prev_state);
+    *checkFGrad = _mm256_mul_ps(*grad_fg, *prev_state);
+    *checkOGrad = _mm256_mul_ps(*grad_og, *state);
+  }
+#endif
+#endif
+};
+
+}  // namespace backward
+
+}  // namespace detail
+}  // namespace math
+}  // namespace x86
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/x86/math/lstm_compute.cc b/lite/backends/x86/math/lstm_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..639aff02fa84f2d3a3acc726915a2349365bb0f2
--- /dev/null
+++ b/lite/backends/x86/math/lstm_compute.cc
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "lite/backends/x86/math/lstm_compute.h"
+#include "lite/backends/x86/math/detail/lstm_cpu_kernel.h"
+#include "lite/backends/x86/math/detail/lstm_kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace x86 {
+namespace math {
+
+template <typename T>
+struct LstmUnitFunctor<lite::TargetType::kX86, T> {
+  static void compute(const lite::X86Context& context,
+                      LstmMetaValue<T> value,
+                      int frame_size,
+                      int batch_size,
+                      T cell_clip,
+                      const detail::ActivationType& gate_act,
+                      const detail::ActivationType& cell_act,
+                      const detail::ActivationType& cand_act) {
+    for (int b = 0; b < batch_size; b++) {
+      detail::cpu_lstm_forward(detail::forward::lstm<T>(),
+                               value,
+                               frame_size,
+                               cell_clip,
+                               cand_act,
+                               gate_act,
+                               cell_act);
+      value.gate_value += frame_size * 4;
+      value.state_value += frame_size;
+      value.state_active_value += frame_size;
+      value.output_value += frame_size;
+      if (value.prev_state_value) {
+        value.prev_state_value += frame_size;
+      }
+    }
+  }
+};
+
+template <typename T>
+struct LstmUnitGradFunctor<lite::TargetType::kX86, T> {
+  static void compute(const lite::X86Context& context,
+                      LstmMetaValue<T> value,
+                      LstmMetaGrad<T> grad,
+                      int frame_size,
+                      int batch_size,
+                      T cell_clip,
+                      const detail::ActivationType& gate_act,
+                      const detail::ActivationType& cell_act,
+                      const detail::ActivationType& cand_act) {
+    for (int b = 0; b < batch_size; b++) {
+      detail::cpu_lstm_backward(detail::backward::lstm<T>(),
+                                value,
+                                grad,
+                                frame_size,
+                                cell_clip,
+                                cand_act,
+                                gate_act,
+                                cell_act);
+
+      value.gate_value += frame_size * 4;
+      value.state_value += frame_size;
+      value.state_active_value += frame_size;
+      value.output_value += frame_size;
+      if (value.prev_state_value) {
+        value.prev_state_value += frame_size;
+      }
+
+      grad.gate_grad += frame_size * 4;
+      grad.state_grad += frame_size;
+      grad.state_active_grad += frame_size;
+      grad.output_grad += frame_size;
+      if (grad.prev_state_grad) {
+        grad.prev_state_grad += frame_size;
+      }
+    }
+  }
+};
+
+template class LstmUnitFunctor<lite::TargetType::kX86, float>;
+template class LstmUnitFunctor<lite::TargetType::kX86, double>;
+template class LstmUnitGradFunctor<lite::TargetType::kX86, float>;
+template class LstmUnitGradFunctor<lite::TargetType::kX86, double>;
+
+}  // namespace math
+}  // namespace x86
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/x86/math/lstm_compute.h b/lite/backends/x86/math/lstm_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..ddb7bea9995ebcca978be97f8295eb07b0e4e17e
--- /dev/null
+++ b/lite/backends/x86/math/lstm_compute.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "lite/backends/x86/math/detail/activation_functions.h"
+#include "lite/core/context.h"
+#include "lite/utils/paddle_enforce.h"
+
+namespace paddle {
+namespace lite {
+namespace x86 {
+namespace math {
+
+template <class T>
+struct LstmMetaValue {
+  T *gate_value;
+  T *prev_state_value;
+  T *state_value;
+  T *state_active_value;
+  T *output_value;
+  T *check_ig;
+  T *check_fg;
+  T *check_og;
+};
+
+template <class T>
+struct LstmMetaGrad {
+  T *gate_grad;
+  T *prev_state_grad;
+  T *state_grad;
+  T *state_active_grad;
+  T *output_grad;
+  T *check_ig_grad;
+  T *check_fg_grad;
+  T *check_og_grad;
+};
+
+template <lite::TargetType Target, typename T>
+class LstmUnitFunctor {
+ public:
+  static void compute(const lite::Context<Target> &context,
+                      LstmMetaValue<T> value,
+                      int frame_size,
+                      int batch_size,
+                      T cell_clip,
+                      const detail::ActivationType &gate_act,
+                      const detail::ActivationType &cell_act,
+                      const detail::ActivationType &cand_act);
+};
+
+template <lite::TargetType Target, typename T>
+class LstmUnitGradFunctor {
+ public:
+  static void compute(const lite::Context<Target> &context,
+                      LstmMetaValue<T> value,
+                      LstmMetaGrad<T> grad,
+                      int frame_size,
+                      int batch_size,
+                      T cell_clip,
+                      const detail::ActivationType &gate_act,
+                      const detail::ActivationType &cell_act,
+                      const detail::ActivationType &cand_act);
+};
+
+}  // namespace math
+}  // namespace x86
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/x86/math/selected_rows_functor.cc b/lite/backends/x86/math/selected_rows_functor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f8f1b42361832771ba04d1bdc8b3e2e05f954e29
--- /dev/null
+++ b/lite/backends/x86/math/selected_rows_functor.cc
@@ -0,0 +1,437 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+#include "lite/backends/x86/math/blas.h"
+#include "lite/backends/x86/math/selected_rows_functor.h"
+
+namespace paddle {
+namespace lite {
+namespace x86 {
+namespace math {
+
+template <typename T>
+struct SelectedRowsAdd<lite::TargetType::kX86, T> {
+  void operator()(const lite::X86Context& context,
+                  const fluid::SelectedRows& input1,
+                  const fluid::SelectedRows& input2,
+                  fluid::SelectedRows* output) {
+    auto in1_height = input1.height();
+    PADDLE_ENFORCE_EQ(in1_height, input2.height());
+    output->set_height(in1_height);
+
+    auto& in1_rows = input1.rows();
+    auto& in2_rows = input2.rows();
+    std::vector<int64_t> out_rows;
+    out_rows.reserve(in1_rows.size() + in2_rows.size());
+
+    // concat rows
+    out_rows.insert(out_rows.end(), in1_rows.begin(), in1_rows.end());
+    out_rows.insert(out_rows.end(), in2_rows.begin(), in2_rows.end());
+    output->set_rows(out_rows);
+
+    auto* out_value = output->mutable_value();
+    auto& in1_value = input1.value();
+    auto& in2_value = input2.value();
+
+    auto in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
+    PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
+
+    auto* out_data = out_value->mutable_data<T>();
+    auto* in1_data = in1_value.data<T>();
+    std::copy_n(in1_data, in1_value.numel(), out_data);
+
+    auto* in2_data = in2_value.data<T>();
+    std::copy_n(in2_data, in2_value.numel(), out_data + in1_value.numel());
+  }
+};
+
+template struct SelectedRowsAdd<lite::TargetType::kX86, float>;
+template struct SelectedRowsAdd<lite::TargetType::kX86, double>;
+
+template <typename T>
+struct SelectedRowsAddTensor<lite::TargetType::kX86, T> {
+  void operator()(const lite::X86Context& context,
+                  const fluid::SelectedRows& input1,
+                  const lite::Tensor& input2,
+                  lite::Tensor* output) {
+    auto in1_height = input1.height();
+    auto in2_dims = input2.dims();
+    auto out_dims = output->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+    PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
+
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
+    PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
+
+    SetConstant<lite::TargetType::kX86, T> functor;
+    functor(context, output, 0.0);
+
+    auto* in1_data = in1_value.data<T>();
+    auto* out_data = output->mutable_data<T>();
+
+    for (size_t i = 0; i < in1_rows.size(); i++) {
+      for (int64_t j = 0; j < in1_row_numel; j++) {
+        out_data[in1_rows[i] * in1_row_numel + j] +=
+            in1_data[i * in1_row_numel + j];
+      }
+    }
+
+    auto out_eigen = fluid::EigenVector<T>::Flatten(*output);
+    auto in2_eigen = fluid::EigenVector<T>::Flatten(input2);
+    out_eigen.device(lite::fluid::EigenDeviceType<lite::TargetType::kX86>()) =
+        out_eigen + in2_eigen;
+  }
+};
+
+template struct SelectedRowsAddTensor<lite::TargetType::kX86, float>;
+template struct SelectedRowsAddTensor<lite::TargetType::kX86, double>;
+
+template <typename T>
+struct SelectedRowsAddTo<lite::TargetType::kX86, T> {
+  void operator()(const lite::X86Context& context,
+                  const fluid::SelectedRows& input1,
+                  const int64_t input2_offset,
+                  fluid::SelectedRows* input2) {
+    auto in1_height = input1.height();
+    PADDLE_ENFORCE_EQ(in1_height, input2->height());
+
+    auto& in1_rows = input1.rows();
+    auto& in2_rows = *(input2->mutable_rows());
+
+    auto& in1_value = input1.value();
+    auto* in2_value = input2->mutable_value();
+
+    // concat rows
+    in2_rows.reserve(in2_rows.size() +
+                     size_t(in1_rows.end() - in1_rows.begin()));
+    in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end());
+
+    auto* in1_data = in1_value.data<T>();
+    auto* in2_data = in2_value->mutable_data<T>();
+    std::copy_n(in1_data, in1_value.numel(), in2_data + input2_offset);
+  }
+};
+
+template struct SelectedRowsAddTo<lite::TargetType::kX86, float>;
+template struct SelectedRowsAddTo<lite::TargetType::kX86, double>;
+template struct SelectedRowsAddTo<lite::TargetType::kX86, int>;
+template struct SelectedRowsAddTo<lite::TargetType::kX86, int64_t>;
+
+template <typename T>
+struct SelectedRowsSumTo<lite::TargetType::kX86, T> {
+  void operator()(const lite::X86Context& context,
+                  const std::vector<fluid::SelectedRows*>& input1,
+                  const std::vector<int64_t>& input2_offsets,
+                  fluid::SelectedRows* input2) {
+    // Ensure all selected rows have the same height
+    size_t size = 0u;
+    for (auto iter = input1.begin(); iter != input1.end(); ++iter) {
+      auto& in_rows = (*iter)->rows();
+      size += in_rows.end() - in_rows.begin();
+      auto in1_height = (*iter)->height();
+      PADDLE_ENFORCE_EQ(in1_height, input2->height());
+    }
+    // concat rows
+    std::vector<int64_t> in2_rows;
+    in2_rows.reserve(in2_rows.size() + size);
+    for (auto iter = input1.begin(); iter != input1.end(); ++iter) {
+      const std::vector<int64_t>& in_rows = (*iter)->rows();
+      in2_rows.insert(in2_rows.end(), in_rows.begin(), in_rows.end());
+    }
+    input2->set_rows(in2_rows);
+
+    auto* in2_value = input2->mutable_value();
+    T* in2_data = in2_value->mutable_data<T>();
+    auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
+    size_t offset = 0u;
+    for (size_t i = 0u; i != input1.size(); ++i) {
+      auto& in_value = input1[i]->value();
+      const T* in_data = in_value.data<T>();
+      offset += input2_offsets[i];
+      blas.VCOPY(in_value.numel(), in_data, in2_data + offset);
+    }
+  }
+};
+
+template struct SelectedRowsSumTo<lite::TargetType::kX86, float>;
+template struct SelectedRowsSumTo<lite::TargetType::kX86, double>;
+
+template <typename T>
+struct SelectedRowsAddToTensor<lite::TargetType::kX86, T> {
+  void operator()(const lite::X86Context& context,
+                  const fluid::SelectedRows& input1,
+                  lite::Tensor* input2) {
+    CHECK(input1.rows().size() != 0) << "input selected rows is empty!";
+
+    auto in1_height = input1.height();
+    auto in2_dims = input2->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+
+    auto* in1_data = in1_value.data<T>();
+    auto* input2_data = input2->mutable_data<T>();
+
+    for (size_t i = 0; i < in1_rows.size(); i++) {
+      for (int64_t j = 0; j < in1_row_numel; j++) {
+        input2_data[in1_rows[i] * in1_row_numel + j] +=
+            in1_data[i * in1_row_numel + j];
+      }
+    }
+  }
+};
+
+template struct SelectedRowsAddToTensor<lite::TargetType::kX86, float>;
+template struct SelectedRowsAddToTensor<lite::TargetType::kX86, double>;
+template struct SelectedRowsAddToTensor<lite::TargetType::kX86, int>;
+template struct SelectedRowsAddToTensor<lite::TargetType::kX86, int64_t>;
+
+// This is a separate namespace for manipulating SelectedRows-typed
+// data, e.g. merging duplicated rows or adding two SelectedRows together.
+//
+// Another group of functors is called "scatter updates", which means
+// using SelectedRows to update a dense tensor with different ops, like
+// add or mul.
+
+namespace scatter {
+
+template <typename T, typename DeviceContext>
+typename std::enable_if<
+    std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, lite::X86Context>::value>::type
+elementwise_add_to(const DeviceContext& ctx,
+                   BlasT<lite::TargetType::kX86, T>* blas,
+                   size_t data_len,
+                   const T* in,
+                   T* out) {
+  blas->AXPY(data_len, 1., in, out);
+}
+
+template <typename T, typename DeviceContext>
+typename std::enable_if<
+    !std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, lite::X86Context>::value>::type
+elementwise_add_to(const DeviceContext& ctx,
+                   BlasT<lite::TargetType::kX86, T>* blas,
+                   size_t data_len,
+                   const T* in,
+                   T* out) {
+  for (size_t i = 0; i < data_len; i++) {
+    out[i] += in[i];
+  }
+}
+
+template <typename T>
+struct MergeAdd<lite::TargetType::kX86, T> {
+  fluid::SelectedRows operator()(const lite::X86Context& context,
+                                 const fluid::SelectedRows& input,
+                                 const bool sorted_result = false) {
+    fluid::SelectedRows out;
+    (*this)(context, input, &out, sorted_result);
+    return out;
+  }
+
+  void operator()(const lite::X86Context& context,
+                  const fluid::SelectedRows& input,
+                  fluid::SelectedRows* output,
+                  const bool sorted_result = false) {
+    std::vector<const fluid::SelectedRows*> inputs;
+    inputs.push_back(&input);
+    (*this)(context, inputs, output, sorted_result);
+  }
+
+  void operator()(const lite::X86Context& context,
+                  const std::vector<const fluid::SelectedRows*>& inputs,
+                  fluid::SelectedRows* output,
+                  const bool sorted_result = false) {
+    if (inputs.size() == 0) {
+      VLOG(3) << "no input! return";
+      return;
+    }
+    const fluid::SelectedRows* has_value_input = nullptr;
+    for (auto* in : inputs) {
+      if (in->rows().size() > 0) {
+        has_value_input = in;
+        break;
+      }
+    }
+    if (has_value_input == nullptr) {
+      VLOG(3) << "no input has value! just return" << std::endl;
+      return;
+    }
+    auto input_width = has_value_input->value().dims()[1];
+    auto input_height = has_value_input->height();
+    fluid::SelectedRows& out = *output;
+    std::set<int64_t> merged_row_set;
+    size_t row_num = 0;
+    for (auto* input : inputs) {
+      if (input->rows().size() == 0) {
+        continue;
+      }
+      PADDLE_ENFORCE_EQ(input_width,
+                        input->value().dims()[1],
+                        "all input should have same "
+                        "dimension except for the first one");
+      PADDLE_ENFORCE_EQ(
+          input_height, input->height(), "all input should have same height");
+      row_num += input->rows().size();
+      merged_row_set.insert(input->rows().begin(), input->rows().end());
+    }
+
+    out.set_height(input_height);
+    lite::DDim dims(std::vector<int64_t>(
+        {static_cast<int64_t>(merged_row_set.size()), input_width}));
+    out.mutable_value()->Resize(dims);
+    auto* out_data = out.mutable_value()->mutable_data<T>();
+
+    if (merged_row_set.size() == row_num && !sorted_result) {
+      // no duplicated ids, just concat the result together
+      std::vector<int64_t> merge_rows;
+      merge_rows.reserve(row_num);
+      // concat rows
+      for (auto* in : inputs) {
+        merge_rows.insert(
+            merge_rows.end(), in->rows().begin(), in->rows().end());
+      }
+      out.set_rows(merge_rows);
+      int64_t copied_numel = 0;
+      for (auto* in : inputs) {
+        auto* in_data = in->value().data<T>();
+        auto in_numel = in->value().numel();
+        std::copy_n(in_data, in_numel, out_data + copied_numel);
+        copied_numel += in_numel;
+      }
+    } else {
+      std::vector<int64_t> merge_rows(merged_row_set.begin(),
+                                      merged_row_set.end());
+
+      if (sorted_result) {
+        std::sort(merge_rows.begin(), merge_rows.end());
+      }
+
+      out.set_rows(merge_rows);
+      math::SetConstant<lite::TargetType::kX86, T> constant_functor;
+      constant_functor(context, out.mutable_value(), 0.0);
+
+      std::unordered_map<int64_t, size_t> rows_to_id;
+      for (size_t i = 0; i < merge_rows.size(); ++i) {
+        rows_to_id[merge_rows[i]] = i;
+      }
+
+      auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
+      for (auto* input : inputs) {
+        if (input->rows().size() == 0) {
+          continue;
+        }
+        auto* input_data = input->value().data<T>();
+        auto& input_rows = input->rows();
+
+        for (size_t i = 0; i < input_rows.size(); i++) {
+          size_t out_i = rows_to_id[input_rows[i]];
+          elementwise_add_to<T>(
+              context,
+              &blas,
+              static_cast<size_t>(input_width),
+              &input_data[i * input_width],
+              &out_data[out_i * input_width]);
+        }
+      }
+    }
+  }
+};
+
+template struct MergeAdd<lite::TargetType::kX86, float>;
+template struct MergeAdd<lite::TargetType::kX86, double>;
+template struct MergeAdd<lite::TargetType::kX86, int>;
+template struct MergeAdd<lite::TargetType::kX86, int64_t>;
+
+template <typename T>
+struct UpdateToTensor<lite::TargetType::kX86, T> {
+  void operator()(const lite::X86Context& context,
+                  const ScatterOps& op,
+                  const fluid::SelectedRows& input1,
+                  lite::Tensor* input2) {
+    auto in1_height = input1.height();
+    auto in2_dims = input2->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+
+    auto* in1_data = in1_value.data<T>();
+    auto* input2_data = input2->data<T>();
+
+    // FIXME(typhoonzero): use macro fix the below messy code.
+    switch (op) {
+      case ScatterOps::ASSIGN:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] =
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::ADD:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] +=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::SUB:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] -=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::SUBBY:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] =
+            in1_data[i * in1_row_numel + j] -
+            input2_data[in1_rows[i] * in1_row_numel + j];
+        break;
+      case ScatterOps::MUL:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] *=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::DIV:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] /=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::DIVBY:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] =
+            in1_data[i * in1_row_numel + j] /
+            input2_data[in1_rows[i] * in1_row_numel + j];
+        break;
+    }
+  }
+};
+
+}  // namespace scatter
+}  // namespace math
+}  // namespace x86
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/x86/math/selected_rows_functor.h b/lite/backends/x86/math/selected_rows_functor.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc3636e1e6adb4aaf812deba4131a9cbff5cbdc4
--- /dev/null
+++ b/lite/backends/x86/math/selected_rows_functor.h
@@ -0,0 +1,112 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <map>
+#include <vector>
+
+#include "lite/backends/x86/math/blas.h"
+#include "lite/backends/x86/math/math_function.h"
+#include "lite/core/context.h"
+#include "lite/fluid/eigen.h"
+#include "lite/fluid/selected_rows.h"
+
+#define INLINE_FOR2(sizei, sizej)     \
+  for (int64_t i = 0; i < sizei; i++) \
+    for (int64_t j = 0; j < sizej; j++)
+
+namespace paddle {
+namespace lite {
+namespace x86 {
+namespace math {
+
+template <lite::TargetType Target, typename T>
+struct SelectedRowsAdd {
+  void operator()(const lite::Context<Target>& context,
+                  const fluid::SelectedRows& input1,
+                  const fluid::SelectedRows& input2,
+                  fluid::SelectedRows* output);
+};
+
+template <lite::TargetType Target, typename T>
+struct SelectedRowsAddTensor {
+  void operator()(const lite::Context<Target>& context,
+                  const fluid::SelectedRows& input1,
+                  const lite::Tensor& input2,
+                  lite::Tensor* output);
+};
+
+// input2 = input1 + input2
+template <lite::TargetType Target, typename T>
+struct SelectedRowsAddTo {
+  void operator()(const lite::Context<Target>& context,
+                  const fluid::SelectedRows& input1,
+                  const int64_t input2_offset,
+                  fluid::SelectedRows* input2);
+};
+
+// input2 = [all input in input1] + input2
+template <lite::TargetType Target, typename T>
+struct SelectedRowsSumTo {
+  void operator()(const lite::Context<Target>& context,
+                  const std::vector<fluid::SelectedRows*>& input1,
+                  const std::vector<int64_t>& input2_offsets,
+                  fluid::SelectedRows* input2);
+};
+
+// FIXME: The result of SelectedRowsAddToTensor may be non-deterministic,
+// because it uses CudaAtomicAdd.
+// input2 = input1 + input2
+template <lite::TargetType Target, typename T>
+struct SelectedRowsAddToTensor {
+  void operator()(const lite::Context<Target>& context,
+                  const fluid::SelectedRows& input1,
+                  lite::Tensor* input2);
+};
+
+namespace scatter {
+// functors for manipulating SelectedRows data
+template <lite::TargetType Target, typename T>
+struct MergeAdd {
+  // unary functor, merge by adding duplicated rows in
+  // the input SelectedRows object.
+  fluid::SelectedRows operator()(const lite::Context<Target>& context,
+                                 const fluid::SelectedRows& input,
+                                 const bool sorted_result = false);
+  void operator()(const lite::Context<Target>& context,
+                  const fluid::SelectedRows& input,
+                  fluid::SelectedRows* output,
+                  const bool sorted_result = false);
+  void operator()(const lite::Context<Target>& context,
+                  const std::vector<const fluid::SelectedRows*>& inputs,
+                  fluid::SelectedRows* output,
+                  const bool sorted_result = false);
+};
+
+enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY };
+
+// out = selected_rows_in / tensor
+template <lite::TargetType Target, typename T>
+struct UpdateToTensor {
+  void operator()(const lite::Context<Target>& context,
+                  const ScatterOps& op,
+                  const fluid::SelectedRows& input1,
+                  lite::Tensor* input2);
+};
+
+}  // namespace scatter
+}  // namespace math
+}  // namespace x86
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/fluid/CMakeLists.txt b/lite/fluid/CMakeLists.txt
index 308dcb2c3052a338c52cf888cec789e66cb8e887..ceb1f7d982392cfbf130719fc04cbd2337fad28c 100644
--- a/lite/fluid/CMakeLists.txt
+++ b/lite/fluid/CMakeLists.txt
@@ -1,4 +1,4 @@
 if (LITE_WITH_X86)
   lite_cc_library(fluid_data_type SRCS data_type.cc DEPS framework_proto eigen3)
-# lite_cc_library(selected_rows SRCS selected_rows.cc)
+lite_cc_library(selected_rows SRCS selected_rows.cc DEPS tensor model_parser)
 endif()
diff --git a/lite/fluid/rw_lock.h b/lite/fluid/rw_lock.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb9829425eca9d8bd363a45961302a7f3818e513
--- /dev/null
+++ b/lite/fluid/rw_lock.h
@@ -0,0 +1,101 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#if !defined(_WIN32)
+#include <pthread.h>
+#else
+#include <mutex>  // NOLINT
+#endif  // !_WIN32
+
+#include "lite/utils/paddle_enforce.h"
+
+namespace paddle {
+namespace lite {
+namespace fluid {
+
+#if !defined(_WIN32)
+struct RWLock {
+  RWLock() { pthread_rwlock_init(&lock_, nullptr); }
+
+  ~RWLock() { pthread_rwlock_destroy(&lock_); }
+
+  inline void RDLock() {
+    PADDLE_ENFORCE_EQ(
+        pthread_rwlock_rdlock(&lock_), 0, "acquire read lock failed");
+  }
+
+  inline void WRLock() {
+    PADDLE_ENFORCE_EQ(
+        pthread_rwlock_wrlock(&lock_), 0, "acquire write lock failed");
+  }
+
+  inline void UNLock() {
+    PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
+  }
+
+ private:
+  pthread_rwlock_t lock_;
+};
+// TODO(paddle-dev): Support RWLock for WIN32 for correctness.
+#else
+// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
+// On Windows, rw_lock seems like a hack. Use an empty object and do nothing.
+struct RWLock {
+  // FIXME(minqiyang): use a mutex here to do a fake lock
+  inline void RDLock() { mutex_.lock(); }
+
+  inline void WRLock() { mutex_.lock(); }
+
+  inline void UNLock() { mutex_.unlock(); }
+
+ private:
+  std::mutex mutex_;
+};
+#endif
+
+class AutoWRLock {
+ public:
+  explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
+
+  ~AutoWRLock() { UnLock(); }
+
+ private:
+  inline void Lock() { lock_->WRLock(); }
+
+  inline void UnLock() { lock_->UNLock(); }
+
+ private:
+  RWLock* lock_;
+};
+
+class AutoRDLock {
+ public:
+  explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
+
+  ~AutoRDLock() { UnLock(); }
+
+ private:
+  inline void Lock() { lock_->RDLock(); }
+
+  inline void UnLock() { lock_->UNLock(); }
+
+ private:
+  RWLock* lock_;
+};
+
+}  // namespace fluid
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/fluid/selected_rows.cc b/lite/fluid/selected_rows.cc
new file mode 100644
index 0000000000000000000000000000000000000000..18221d498deb2804294a1a9b9241450ac603611a
--- /dev/null
+++ b/lite/fluid/selected_rows.cc
@@ -0,0 +1,247 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "lite/fluid/selected_rows.h"
+namespace paddle {
+namespace lite {
+namespace fluid {
+
+struct ReAllocateVisitor {
+  ReAllocateVisitor(const lite::DDim& dims, lite::Tensor* tensor)
+      : dims_(dims), tensor_(tensor) {}
+
+  template <typename T>
+  void operator()() const {
+    lite::Tensor cpu_tensor;
+    T* ptr = cpu_tensor.mutable_data<T>(lite::TargetType::kX86, dims_);
+    const T* old_ptr =
+        tensor_->memory_size() == 0 ? nullptr : tensor_->mutable_data<T>();
+    if (old_ptr != nullptr) {
+      std::copy(old_ptr, old_ptr + tensor_->numel(), ptr);
+    }
+    tensor_->ShareDataWith(cpu_tensor);
+  }
+
+  lite::DDim dims_;
+  lite::Tensor* tensor_;
+};
+
+struct TensorCopyVisitor {
+  TensorCopyVisitor(lite::Tensor* dst,
+                    int64_t dst_offset,
+                    const lite::Tensor src,
+                    int64_t src_offset,
+                    int64_t size)
+      : dst_(dst),
+        dst_offset_(dst_offset),
+        src_(src),
+        src_offset_(src_offset),
+        size_(size) {}
+
+  template <typename T>
+  void apply() const {
+    // TODO(Yancey1989): support other place
+    std::copy_n(src_.data<T>() + src_offset_,
+                size_,
+                dst_->mutable_data<T>(lite::TargetType::kX86) + dst_offset_);
+  }
+
+  lite::Tensor* dst_;
+  int64_t dst_offset_;
+  lite::Tensor src_;
+  int64_t src_offset_;
+  int64_t size_;
+};
+
+struct TensorFillVisitor {
+  TensorFillVisitor(lite::Tensor* dst,
+                    int64_t dst_offset,
+                    int64_t size,
+                    float value)
+      : dst_(dst), dst_offset_(dst_offset), size_(size) {}
+
+  template <typename T>
+  void apply() const {
+    // TODO(qiao): support other place
+    // paddle::platform::CPUPlace cpu;
+    auto* tensor_data = dst_->mutable_data<T>(lite::TargetType::kX86);
+    auto* start = tensor_data + dst_offset_;
+    auto* end = start + size_;
+    std::fill(start, end, static_cast<T>(0.0));
+  }
+
+  lite::Tensor* dst_;
+  int64_t dst_offset_;
+  int64_t size_;
+};
+
+void SerializeToStream(std::ostream& os,
+                       const SelectedRows& selected_rows,
+                       const lite::Context<TargetType::kX86>& dev_ctx) {
+  {  // the 1st field, uint32_t version
+    constexpr uint32_t version = 0;
+    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
+  }
+  {
+    // the 2nd field, rows information
+    auto& rows = selected_rows.rows();
+    uint64_t size = rows.size();
+    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
+    for (uint64_t i = 0; i < size; ++i) {
+      os.write(reinterpret_cast<const char*>(&rows[i]), sizeof(rows[i]));
+    }
+  }
+  {
+    // the 3rd field, the height of SelectedRows
+    int64_t height = selected_rows.height();
+    os.write(reinterpret_cast<const char*>(&height), sizeof(height));
+  }
+  // the 4th field, Tensor data
+  TensorToStream(os, selected_rows.value());
+}
+
+void DeserializeFromStream(
+    std::istream& is,
+    SelectedRows* selected_rows,
+    const lite::Context<TargetType::kX86>& dev_ctx) {
+  {
+    // the 1st field, uint32_t version for SelectedRows
+    uint32_t version;
+    is.read(reinterpret_cast<char*>(&version), sizeof(version));
+    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  }
+  {
+    // the 2nd field, rows information
+    uint64_t size;
+    is.read(reinterpret_cast<char*>(&size), sizeof(size));
+    auto& rows = *selected_rows->mutable_rows();
+    rows.resize(size);
+    for (uint64_t i = 0; i < size; ++i) {
+      is.read(reinterpret_cast<char*>(&rows[i]), sizeof(int64_t));
+    }
+  }
+  {
+    // the 3rd field, the height of the SelectedRows
+    int64_t height;
+    is.read(reinterpret_cast<char*>(&height), sizeof(int64_t));
+    selected_rows->set_height(height);
+  }
+  // the 4th field, the tensor which contains the data
+  TensorFromStream(is, selected_rows->mutable_value());
+}
+
+bool SelectedRows::HasKey(int64_t key) const {
+  return std::find(rows_.begin(), rows_.end(), key) == rows_.end() ? false
+                                                                   : true;
+}
+
+int64_t SelectedRows::AutoGrownIndex(int64_t key,
+                                     bool auto_grown,
+                                     bool is_test) {
+  if (is_test) {
+    auto iter = id_to_index_.find(key);
+    if (iter == id_to_index_.end()) {
+      return -1;
+    } else {
+      return iter->second;
+    }
+  }
+
+  rwlock_->RDLock();
+  auto iter = id_to_index_.find(key);
+  if (iter == id_to_index_.end()) {
+    rwlock_->UNLock();
+    if (!auto_grown) {
+      PADDLE_THROW("key %d not found", key);
+    }
+    rwlock_->WRLock();
+    auto map_size = id_to_index_.size();
+    auto vector_size = rows_.size();
+    if (map_size != vector_size) {
+      rwlock_->UNLock();
+      PADDLE_THROW(
+          "id_to_index_ size %d should have the same size with rows_ %d",
+          map_size,
+          vector_size);
+    }
+    auto write_iter = id_to_index_.find(key);
+    if (write_iter == id_to_index_.end()) {
+      int row_num = rows_.size();
+      if (row_num == value_->dims()[0]) {
+        rwlock_->UNLock();
+        PADDLE_THROW("selected rows is full, then length exceed %d", row_num);
+      }
+      // key logic to put a key into id_to_index_
+      rows_.push_back(key);
+      auto index = static_cast<int64_t>(rows_.size() - 1);
+      id_to_index_[key] = index;
+      rwlock_->UNLock();
+      return index;
+    } else {
+      auto index = write_iter->second;
+      rwlock_->UNLock();
+      return index;
+    }
+  } else {
+    auto index = iter->second;
+    rwlock_->UNLock();
+    return index;
+  }
+}
+
+void SelectedRows::SyncIndex() {
+  rwlock_->WRLock();
+  id_to_index_.clear();
+  for (size_t i = 0; i < rows_.size(); ++i) {
+    id_to_index_[rows_[i]] = i;
+  }
+  rwlock_->UNLock();
+}
+
+void SelectedRows::Get(const lite::Tensor& ids,
+                       lite::Tensor* value,
+                       bool auto_grown,
+                       bool is_test) {
+  PADDLE_ENFORCE(value->IsInitialized(),
+                 "The value tensor should be initialized.");
+  if (ids.numel() == 0) {
+    VLOG(3) << "keys is empty, please check data!";
+  } else {
+    int64_t value_width = value_->numel() / value_->dims()[0];
+    PADDLE_ENFORCE_EQ(value_width,
+                      value->numel() / value->dims()[0],
+                      "output tensor should have the same shape with table "
+                      "except the dims[0].");
+    for (int i = 0; i < ids.numel(); ++i) {
+      auto id = ids.data<int64_t>()[i];
+      int64_t index = AutoGrownIndex(id, auto_grown, is_test);
+      if (index < 0) {
+        VLOG(5) << "id " << id << " not in the table, return 0";
+        TensorFillVisitor(value, i * value_width, value_width, 0.0)
+            .apply<float>();
+      } else {
+        TensorCopyVisitor(value,
+                          i * value_width,
+                          *value_.get(),
+                          index * value_width,
+                          value_width)
+            .apply<float>();
+      }
+    }
+  }
+}
+
+}  // namespace fluid
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/fluid/selected_rows.h b/lite/fluid/selected_rows.h
new file mode 100644
index 0000000000000000000000000000000000000000..16f7cbc1788c00728fe62a416068348a04eb39cc
--- /dev/null
+++ b/lite/fluid/selected_rows.h
@@ -0,0 +1,173 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "lite/core/context.h"
+#include "lite/core/tensor.h"
+#include "lite/fluid/rw_lock.h"
+#include "lite/model_parser/model_parser.h"
+namespace paddle {
+namespace lite {
+namespace fluid {
+
+class SelectedRows {
+  /*
+   * @brief We can use the SelectedRows structure to reproduce a sparse table.
+   *  A sparse table is a key-value structure whose key is an `int64_t` and
+   *  whose value is a Tensor whose first dimension is 0.
+   *  You can use the following interface to operate the sparse table, and
+   *  you can find more detailed information in the comments of each
+   *  interface:
+   *
+   *  HasKey(key), whether the sparse table has the specified key.
+   *  Set(key, value), set a key-value pair into the sparse table.
+   *  Get(keys, value*), get value by given key list and apply it to the
+   *  given value pointer with the specified offset.
+   *
+   */
+ public:
+  SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
+      : rows_(rows), height_(height) {
+    value_.reset(new Tensor());
+    rwlock_.reset(new RWLock);
+  }
+
+  SelectedRows() {
+    height_ = 0;
+    value_.reset(new Tensor());
+    rwlock_.reset(new RWLock);
+  }
+
+  TargetType target() const { return value_->target(); }
+
+  const Tensor& value() const { return *value_; }
+
+  Tensor* mutable_value() { return value_.get(); }
+
+  int64_t height() const { return height_; }
+
+  void set_height(int64_t height) { height_ = height; }
+
+  const std::vector<int64_t>& rows() const { return rows_; }
+
+  std::vector<int64_t>* mutable_rows() { return &rows_; }
+
+  void set_rows(const std::vector<int64_t>& rows) { rows_ = rows; }
+
+  /*
+   * @brief Get the index of key in rows
+   *
+   * @return -1 if the key does not exist.
+   */
+  int64_t Index(int64_t key) const {
+    auto it = std::find(rows_.begin(), rows_.end(), key);
+    if (it == rows_.end()) {
+      PADDLE_THROW("id %s not in table", key);
+    }
+    return static_cast<int64_t>(std::distance(rows_.begin(), it));
+  }
+
+  /*
+   * @brief whether the table has the specified key.
+   *
+   * @return true if the key exists.
+   */
+  bool HasKey(int64_t key) const;
+
+  /*
+   * @brief Get value by the key list.
+   *  Note!!! this interface is only used when selected_rows is used as
+   *  parameters for a distributed lookup table.
+   *
+   * @return a list of pairs containing the non-existent keys and their
+   *  indices in the value
+   */
+  void Get(const lite::Tensor& ids,
+           lite::Tensor* value,
+           bool auto_grown = false,
+           bool is_test = false);
+
+  /*
+   * @brief Get the index of the key from the id_to_index_ map. If the key
+   *  does not exist, add the key into id_to_index_.
+   *
+   *  Note!!! this interface is only used when selected_rows is used as
+   *  parameters for a distributed lookup table.
+   *
+   * @return index of the key.
+   */
+  int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false);
+
+  /*
+   * @brief Get the index of the key from the id_to_index_ map.
+   */
+  inline int64_t GetIndexFromId(int64_t key) {
+    auto iter = id_to_index_.find(key);
+    if (iter == id_to_index_.end()) {
+      return -1;
+    } else {
+      return iter->second;
+    }
+  }
+
+  void SyncIndex();
+  /*
+   * @brief Get complete Dims before
+   */
+  DDim GetCompleteDims() const {
+    DDim dims = value_->dims();
+    dims[0] = height_;
+    return dims;
+  }
+
+ private:
+  // Notice: rows can be duplicated. We can have {0, 4, 7, 0, 5, 7, 9} here.
+  // SelectedRows are simply concatenated when added together; duplicate
+  // rows are not handled until a SelectedRows is added to a Tensor.
+  std::vector<int64_t> rows_;
+  std::unordered_map<int64_t, size_t>
+      id_to_index_;  // should not be used when rows_ has duplicate members
+  std::unique_ptr<Tensor> value_{nullptr};
+  int64_t height_;  // height indicates the underlying tensor's height
+  std::unique_ptr<RWLock> rwlock_{nullptr};
+};
+
+/*
+ * Serialize/Deserialize SelectedRows to std::ostream
+ * You can pass an ofstream or ostringstream to serialize to a file or to an
+ * in-memory string. GPU tensors will be copied to CPU.
+ */
+void SerializeToStream(std::ostream& os,
+                       const SelectedRows& selected_rows,
+                       const lite::Context<TargetType::kX86>& dev_ctx);
+void DeserializeFromStream(
+    std::istream& is,
+    SelectedRows* selected_rows,
+    const lite::Context<TargetType::kX86>& dev_ctx);
+
+}  // namespace fluid
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/model_parser/model_parser.h b/lite/model_parser/model_parser.h
index 81be2579e3932d7165480afd5bb89f567155cf36..bca7533c24af517994dae677c7b63e088f2ef1ca 100644
--- a/lite/model_parser/model_parser.h
+++ b/lite/model_parser/model_parser.h
@@ -72,7 +72,7 @@ void SerializeTensor(std::ostream& os,
 
 // LoDTensor to ostream
 void TensorToStream(std::ostream& os, const lite::Tensor& tensor);
-
+void TensorFromStream(std::istream& is, lite::Tensor* tensor);
 void ReadBinaryFile(const std::string& filename, std::string* contents);
 
 // For naive buffer
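
For reference, a minimal usage sketch of the scatter::MergeAdd functor this patch enables. This is hypothetical driver code written for illustration, not part of the diff: it assumes a Paddle-Lite x86 build where lite::X86Context can be default-constructed, and every name used follows the declarations in selected_rows_functor.h and selected_rows.h above.

// merge_add_sketch.cc -- illustrative only, under the assumptions stated above.
#include <algorithm>
#include <vector>

#include "lite/backends/x86/math/selected_rows_functor.h"
#include "lite/fluid/selected_rows.h"

int main() {
  namespace fluid = paddle::lite::fluid;
  namespace math = paddle::lite::x86::math;

  // Assumption: an X86Context can be default-constructed in this build.
  paddle::lite::X86Context ctx;

  // Two sparse slices over an 8-row table; both touch row 4.
  fluid::SelectedRows a({0, 4}, /*height=*/8);
  a.mutable_value()->Resize(paddle::lite::DDim(std::vector<int64_t>({2, 3})));
  std::fill_n(a.mutable_value()->mutable_data<float>(), 6, 1.0f);

  fluid::SelectedRows b({4, 7}, /*height=*/8);
  b.mutable_value()->Resize(paddle::lite::DDim(std::vector<int64_t>({2, 3})));
  std::fill_n(b.mutable_value()->mutable_data<float>(), 6, 2.0f);

  // MergeAdd sums duplicated rows, so the output holds three unique rows
  // (0, 4, 7) and row 4 contains 1.0f + 2.0f = 3.0f in every column.
  fluid::SelectedRows out;
  math::scatter::MergeAdd<paddle::lite::TargetType::kX86, float> merge_add;
  std::vector<const fluid::SelectedRows*> inputs = {&a, &b};
  merge_add(ctx, inputs, &out);
  return 0;
}

The duplicated-row case is the one the rest of the patch depends on: when row indices repeat, MergeAdd zeroes the merged buffer with SetConstant and accumulates each input row via elementwise_add_to (AXPY for floating-point types); when no index repeats, it simply concatenates the inputs.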