From 1c8a0c4bd466aa2accbc6fa257142dbe76a01f6d Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Tue, 31 Oct 2017 17:26:52 +0800 Subject: [PATCH] Refine activation function pointer for LSTM operator. --- paddle/framework/CMakeLists.txt | 3 +- paddle/operators/math/detail/CMakeLists.txt | 4 +- .../math/detail/activation_functions.h | 170 ++++++++++++++++ .../{hl_avx_functions.cc => avx_functions.cc} | 22 +- .../math/detail/hl_activation_functions.h | 188 ------------------ .../operators/math/detail/hl_avx_functions.h | 32 --- .../operators/math/detail/hl_cpu_functions.cc | 89 --------- paddle/operators/math/detail/hl_functions.h | 71 ------- .../operators/math/detail/hl_gpu_functions.h | 93 --------- .../operators/math/detail/lstm_cpu_kernel.h | 28 ++- .../operators/math/detail/lstm_gpu_kernel.h | 30 ++- paddle/operators/math/detail/lstm_kernel.h | 135 +++++-------- .../paddle/v2/framework/tests/test_lstm_op.py | 4 +- 13 files changed, 279 insertions(+), 590 deletions(-) create mode 100644 paddle/operators/math/detail/activation_functions.h rename paddle/operators/math/detail/{hl_avx_functions.cc => avx_functions.cc} (84%) delete mode 100644 paddle/operators/math/detail/hl_activation_functions.h delete mode 100644 paddle/operators/math/detail/hl_avx_functions.h delete mode 100644 paddle/operators/math/detail/hl_cpu_functions.cc delete mode 100644 paddle/operators/math/detail/hl_functions.h delete mode 100644 paddle/operators/math/detail/hl_gpu_functions.h diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index f4fef055daf..2be21e825ae 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -20,7 +20,8 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_library(attribute SRCS attribute.cc DEPS framework_proto) -cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc) +cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc +device_context) cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute) cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) diff --git a/paddle/operators/math/detail/CMakeLists.txt b/paddle/operators/math/detail/CMakeLists.txt index 49cf228de22..92eac9d3623 100644 --- a/paddle/operators/math/detail/CMakeLists.txt +++ b/paddle/operators/math/detail/CMakeLists.txt @@ -1,5 +1,3 @@ if(WITH_AVX) - cc_library(activation_functions SRCS hl_cpu_functions.cc hl_avx_functions.cc) -else() - cc_library(activation_functions SRCS hl_cpu_functions.cc) + cc_library(activation_functions SRCS avx_functions.cc) endif() diff --git a/paddle/operators/math/detail/activation_functions.h b/paddle/operators/math/detail/activation_functions.h new file mode 100644 index 00000000000..8a186a51d6b --- /dev/null +++ b/paddle/operators/math/detail/activation_functions.h @@ -0,0 +1,170 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/platform/hostdevice.h" + +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { +namespace detail { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 + +namespace forward { + +template +DEVICE T linear(const T a) { + return a; +} + +template +DEVICE T relu(const T a) { + return a > static_cast(0.0) ? a : static_cast(0.0); +} + +template +DEVICE T sigmoid(const T a) { + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + T tmp = (a < min) ? min : ((a > max) ? max : a); + return static_cast(1.0) / (static_cast(1.0) + exp(-tmp)); +} + +template +DEVICE T tanh(const T a) { + T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + return (2.0 / (1.0 + exp(tmp))) - 1.0; +} + +} // namespace forward + +namespace backward { + +template +DEVICE T linear(const T a, const T b) { + return a; +} + +template +DEVICE T relu(const T a, const T b) { + return a * (b > 0.0 ? 1.0 : 0.0); +} + +template +DEVICE T sigmoid(const T a, const T b) { + return a * b * (1.0 - b); +} + +template +DEVICE T tanh(const T a, const T b) { + return a * (1.0 - b * b); +} + +} // namespace backward + +template +struct Active { + typedef T (*Act)(T); + typedef T (*ActGrad)(T, T); +}; + +static DEVICE Active::Act kActFloat[] = { + &forward::sigmoid, &forward::relu, &forward::tanh, + &forward::linear}; + +static DEVICE Active::ActGrad kActGradFloat[] = { + &backward::sigmoid, &backward::relu, &backward::tanh, + &backward::linear}; + +static DEVICE Active::Act kActDouble[] = { + &forward::sigmoid, &forward::relu, &forward::tanh, + &forward::linear}; + +static DEVICE Active::ActGrad kActGradDouble[] = { + &backward::sigmoid, &backward::relu, + &backward::tanh, &backward::linear}; + +namespace forward { +inline DEVICE float activation(float a, int index) { + return kActFloat[index](a); +} + +inline DEVICE double activation(double a, int index) { + return kActDouble[index](a); +} + +} // namespace forward + +namespace backward { +inline DEVICE float activation(float a, float b, int index) { + return kActGradFloat[index](a, b); +} + +inline DEVICE double activation(double a, double b, int index) { + return kActGradDouble[index](a, b); +} +} // namespace backward + +#ifdef __AVX__ +namespace forward { +namespace avx { +__m256 relu(const __m256 a); +__m256 sigmoid(const __m256 a); +__m256 tanh(const __m256 a); +__m256 linear(const __m256 a); +} // namespace avx +} // namespace forward + +namespace backward { +namespace avx { +__m256 relu(const __m256 a, const __m256 b); +__m256 sigmoid(const __m256 a, const __m256 b); +__m256 tanh(const __m256 a, const __m256 b); +__m256 linear(const __m256 a, const __m256 b); +} // namespace avx +} // namespace backward + +static Active<__m256>::Act kActAvx[] = { + &forward::avx::sigmoid, &forward::avx::relu, &forward::avx::tanh, + &forward::avx::linear}; + +static Active<__m256>::ActGrad kActGradAvx[] = { + &backward::avx::sigmoid, &backward::avx::relu, &backward::avx::tanh, + &backward::avx::linear}; + +namespace forward { +inline __m256 activation(__m256 a, int index) { return kActAvx[index](a); } +} // namespace forward + +namespace backward { +inline __m256 activation(__m256 a, __m256 b, int index) { + return kActGradAvx[index](a, b); +} +} // namespace backward + +#endif + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/detail/hl_avx_functions.cc b/paddle/operators/math/detail/avx_functions.cc similarity index 84% rename from paddle/operators/math/detail/hl_avx_functions.cc rename to paddle/operators/math/detail/avx_functions.cc index 415bac5d93e..b8f014d30eb 100644 --- a/paddle/operators/math/detail/hl_avx_functions.cc +++ b/paddle/operators/math/detail/avx_functions.cc @@ -13,14 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include "hl_functions.h" +#include "paddle/operators/math/detail/activation_functions.h" // TODO(qingqing) refine this dependence #include "paddle/cuda/src/avx_mathfun.h" -namespace hppl { +namespace paddle { +namespace operators { +namespace math { +namespace detail { __m256 exp(__m256 a) { return exp256_ps(a); } +namespace forward { +namespace avx { __m256 relu(const __m256 a) { __m256 tmp = _mm256_set1_ps(0.0f); return _mm256_max_ps(a, tmp); @@ -50,6 +55,11 @@ __m256 tanh(const __m256 a) { __m256 linear(const __m256 a) { return a; } +} // namespace avx +} // namespace forward + +namespace backward { +namespace avx { __m256 relu(const __m256 a, const __m256 b) { return _mm256_mul_ps( a, _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS), @@ -67,4 +77,10 @@ __m256 tanh(const __m256 a, const __m256 b) { } __m256 linear(const __m256 a, const __m256 b) { return a; } -} // namespace hppl +} // namespace avx +} // namespace backward + +} // namespace detail +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/detail/hl_activation_functions.h b/paddle/operators/math/detail/hl_activation_functions.h deleted file mode 100644 index 9d7d9914f00..00000000000 --- a/paddle/operators/math/detail/hl_activation_functions.h +++ /dev/null @@ -1,188 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef HL_ACTIVATION_FUNCTIONS_H_ -#define HL_ACTIVATION_FUNCTIONS_H_ - -#include "hl_functions.h" -#include "paddle/operators/math/lstm_compute.h" - -/** - * Active functions: sigmoid, relu, tanh and linear. - */ -#define FLOAT_ACTIVE_FUNCTION \ - { \ - hppl::typef::sigmoid, hppl::typef::relu, hppl::typef::tanh, \ - hppl::typef::linear \ - } - -#define DOUBLE_ACTIVE_FUNCTION \ - { \ - hppl::typed::sigmoid, hppl::typed::relu, hppl::typed::tanh, \ - hppl::typed::linear \ - } - -#define AVX_ACTIVE_FUNCTION \ - { hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear } - -namespace hppl { - -using activation_mode_t = paddle::operators::math::activation_mode_t; - -/** - * Hppl supports sigmoid, relu, tanh, linear active functions - * for neural networks' forward and backward activation. - */ -template -class Active { - public: - typedef T (*forward)(T); - typedef T (*backward)(T, T); -}; - -template -struct ForwardActType; - -template <> -struct ForwardActType { - using type = Active::forward; -}; - -template <> -struct ForwardActType { - using type = Active::forward; -}; - -template -struct BackwardActType; - -template <> -struct BackwardActType { - using type = Active::backward; -}; - -template <> -struct BackwardActType { - using type = Active::backward; -}; - -#ifdef __NVCC__ -namespace gpu { -static __device__ Active::forward forward[] = FLOAT_ACTIVE_FUNCTION; -static __device__ Active::backward backward[] = FLOAT_ACTIVE_FUNCTION; - -static __device__ Active::forward forward_d[] = DOUBLE_ACTIVE_FUNCTION; -static __device__ Active::backward backward_d[] = - DOUBLE_ACTIVE_FUNCTION; - -template -struct ForwardAct { - __device__ typename ForwardActType::type operator()( - activation_mode_t type); -}; - -template <> -struct ForwardAct { - __device__ ForwardActType::type operator()(activation_mode_t type) { - return forward[type]; - } -}; - -template <> -struct ForwardAct { - __device__ ForwardActType::type operator()(activation_mode_t type) { - return forward_d[type]; - } -}; - -template -struct BackwardAct { - __device__ typename BackwardActType::type operator()( - activation_mode_t type); -}; - -template <> -struct BackwardAct { - __device__ BackwardActType::type operator()(activation_mode_t type) { - return backward[type]; - } -}; - -template <> -struct BackwardAct { - __device__ BackwardActType::type operator()(activation_mode_t type) { - return backward_d[type]; - } -}; - -} // namespace gpu -#else -namespace cpu { -static Active::forward forward[] = FLOAT_ACTIVE_FUNCTION; -static Active::backward backward[] = FLOAT_ACTIVE_FUNCTION; - -static Active::forward forward_d[] = DOUBLE_ACTIVE_FUNCTION; -static Active::backward backward_d[] = DOUBLE_ACTIVE_FUNCTION; - -template -struct ForwardAct { - typename ForwardActType::type operator()(activation_mode_t type); -}; - -template <> -struct ForwardAct { - ForwardActType::type operator()(activation_mode_t type) { - return forward[type]; - } -}; - -template <> -struct ForwardAct { - ForwardActType::type operator()(activation_mode_t type) { - return forward_d[type]; - } -}; - -template -struct BackwardAct { - typename BackwardActType::type operator()(activation_mode_t type); -}; - -template <> -struct BackwardAct { - BackwardActType::type operator()(activation_mode_t type) { - return backward[type]; - } -}; - -template <> -struct BackwardAct { - BackwardActType::type operator()(activation_mode_t type) { - return backward_d[type]; - } -}; - -} // namespace cpu - -#ifdef __AVX__ -namespace avx { -static Active<__m256>::forward forward[] = AVX_ACTIVE_FUNCTION; -static Active<__m256>::backward backward[] = AVX_ACTIVE_FUNCTION; -} // namespace avx -#endif -#endif - -} // namespace hppl - -#endif // HL_ACTIVATION_FUNCTIONS_H_ diff --git a/paddle/operators/math/detail/hl_avx_functions.h b/paddle/operators/math/detail/hl_avx_functions.h deleted file mode 100644 index 35f4eabb4c0..00000000000 --- a/paddle/operators/math/detail/hl_avx_functions.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef HL_AVX_FUNCTIONS_H_ -#define HL_AVX_FUNCTIONS_H_ - -#include - -namespace hppl { -__m256 relu(const __m256 a); -__m256 sigmoid(const __m256 a); -__m256 tanh(const __m256 a); -__m256 linear(const __m256 a); - -__m256 relu(const __m256 a, const __m256 b); -__m256 sigmoid(const __m256 a, const __m256 b); -__m256 tanh(const __m256 a, const __m256 b); -__m256 linear(const __m256 a, const __m256 b); -} // namespace hppl - -#endif // HL_AVX_FUNCTIONS_H_ diff --git a/paddle/operators/math/detail/hl_cpu_functions.cc b/paddle/operators/math/detail/hl_cpu_functions.cc deleted file mode 100644 index 21ec78f9629..00000000000 --- a/paddle/operators/math/detail/hl_cpu_functions.cc +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "hl_functions.h" - -namespace hppl { -namespace typef { - -float relu(const float a) { - return a > static_cast(0.0) ? a : static_cast(0.0); -} - -float sigmoid(const float a) { - const float min = SIGMOID_THRESHOLD_MIN; - const float max = SIGMOID_THRESHOLD_MAX; - float tmp = (a < min) ? min : ((a > max) ? max : a); - return static_cast(1.0) / (static_cast(1.0) + exp(-tmp)); -} - -float tanh(const float a) { - float tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - return (2.0 / (1.0 + exp(tmp))) - 1.0; -} - -float linear(const float a) { return a; } - -float relu(const float a, const float b) { return a * (b > 0.0 ? 1.0 : 0.0); } - -float sigmoid(const float a, const float b) { - return a * b * (static_cast(1) - b); -} - -float tanh(const float a, const float b) { - return a * (static_cast(1) - b * b); -} - -float linear(const float a, const float b) { return a; } - -} // namespace typef - -namespace typed { -double relu(const double a) { - return a > static_cast(0.0) ? a : static_cast(0.0); -} - -double sigmoid(const double a) { - const double min = SIGMOID_THRESHOLD_MIN; - const double max = SIGMOID_THRESHOLD_MAX; - double tmp = (a < min) ? min : ((a > max) ? max : a); - return static_cast(1.0) / (static_cast(1.0) + exp(-tmp)); -} - -double tanh(const double a) { - double tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - return (2.0 / (1.0 + exp(tmp))) - 1.0; -} - -double linear(const double a) { return a; } - -double relu(const double a, const double b) { - return a * (b > 0.0 ? 1.0 : 0.0); -} - -double sigmoid(const double a, const double b) { - return a * b * (static_cast(1) - b); -} - -double tanh(const double a, const double b) { - return a * (static_cast(1) - b * b); -} - -double linear(const double a, const double b) { return a; } - -} // namespace typed -} // namespace hppl diff --git a/paddle/operators/math/detail/hl_functions.h b/paddle/operators/math/detail/hl_functions.h deleted file mode 100644 index 3e2f0c9ee6d..00000000000 --- a/paddle/operators/math/detail/hl_functions.h +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef HL_FUNCTIONS_H_ -#define HL_FUNCTIONS_H_ - -/** - * sigmoid threshold maximum - */ -#define SIGMOID_THRESHOLD_MIN -40.0 - -/** - * sigmoid threshold minimum - */ -#define SIGMOID_THRESHOLD_MAX 13.0 - -/** - * The maximum input value for exp, used to avoid overflow problem. - * currently only used for tanh function. - */ -#define EXP_MAX_INPUT 40.0 - -#ifndef __NVCC__ -namespace hppl { -namespace typef { -float relu(const float a); -float sigmoid(const float a); -float tanh(const float a); -float linear(const float a); - -float relu(const float a, const float b); -float sigmoid(const float a, const float b); -float tanh(const float a, const float b); -float linear(const float a, const float b); - -} // namespace typef - -namespace typed { -double relu(const double a); -double sigmoid(const double a); -double tanh(const double a); -double linear(const double a); - -double relu(const double a, const double b); -double sigmoid(const double a, const double b); -double tanh(const double a, const double b); -double linear(const double a, const double b); -} // namespace typed - -} // namespace hppl - -#ifdef __AVX__ -#include "hl_avx_functions.h" -#endif - -#else -#include "hl_gpu_functions.h" -#endif - -#endif // HL_FUNCTIONS_H_ diff --git a/paddle/operators/math/detail/hl_gpu_functions.h b/paddle/operators/math/detail/hl_gpu_functions.h deleted file mode 100644 index 72f2204e7b2..00000000000 --- a/paddle/operators/math/detail/hl_gpu_functions.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef HL_GPU_FUNCTIONS_CUH_ -#define HL_GPU_FUNCTIONS_CUH_ - -#include "hl_base.h" - -namespace hppl { -namespace typef { - -__device__ static float relu(const float a) { return a > 0.0f ? a : 0.0f; } - -__device__ static float sigmoid(const float a) { - const float min = SIGMOID_THRESHOLD_MIN; - const float max = SIGMOID_THRESHOLD_MAX; - float tmp = (a < min) ? min : ((a > max) ? max : a); - return __fdividef(1.0f, 1.0f + __expf(-tmp)); -} - -__device__ static float tanh(const float a) { - float tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - return __fdividef(2.0f, (1.0f + __expf(-2.0f * tmp))) - 1.0f; -} - -__device__ static float linear(const float a) { return a; } - -__device__ static float relu(const float a, const float b) { - return a * (b > 0.0f ? 1.0f : 0.0f); -} - -__device__ static float sigmoid(const float a, const float b) { - return a * b * (1.0f - b); -} - -__device__ static float tanh(const float a, const float b) { - return a * (1.0f - b * b); -} - -__device__ static float linear(const float a, const float b) { return a; } - -} // namespace typef - -namespace typed { - -__device__ static double relu(const double a) { return a > 0.0 ? a : 0.0; } - -__device__ static double sigmoid(const double a) { - const double min = SIGMOID_THRESHOLD_MIN; - const double max = SIGMOID_THRESHOLD_MAX; - double tmp = (a < min) ? min : ((a > max) ? max : a); - return 1.0 / (1.0 + exp(-tmp)); -} - -__device__ static double tanh(const double a) { - double tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - return (2.0 / (1.0 + exp(-2.0 * a))) - 1.0; -} - -__device__ static double linear(const double a) { return a; } - -__device__ static double relu(const double a, const double b) { - return a * (b > 0.0 ? 1.0 : 0.0); -} - -__device__ static double sigmoid(const double a, const double b) { - return a * b * (1 - b); -} - -__device__ static double tanh(const double a, const double b) { - return a * (1.0 - b * b); -} - -__device__ static double linear(const double a, const double b) { return a; } - -} // namespace typef - -} // namespace hppl - -#endif // HL_GPU_FUNCTIONS_CUH_ diff --git a/paddle/operators/math/detail/lstm_cpu_kernel.h b/paddle/operators/math/detail/lstm_cpu_kernel.h index d0ed55ea168..f5b0dd85c9d 100644 --- a/paddle/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/operators/math/detail/lstm_cpu_kernel.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once #include -#include "paddle/operators/math/detail/hl_activation_functions.h" +#include "paddle/operators/math/detail/activation_functions.h" #include "paddle/operators/math/lstm_compute.h" namespace paddle { @@ -26,7 +26,10 @@ namespace detail { template void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, - int frameSize) { + int frameSize, + activation_mode_t active_node, + activation_mode_t active_gate, + activation_mode_t active_state) { T rValueIn; T rValueIg; T rValueFg; @@ -58,7 +61,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, } op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv, - rOut, rCheckI, rCheckF, rCheckO); + rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state); valueIn[i] = rValueIn; valueIg[i] = rValueIg; @@ -72,7 +75,10 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, template void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, - LstmMetaGrad grad, int frameSize) { + LstmMetaGrad grad, int frameSize, + activation_mode_t active_node, + activation_mode_t active_gate, + activation_mode_t active_state) { T rValueIn; T rValueIg; T rValueFg; @@ -122,7 +128,7 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, - rCheckOGrad); + rCheckOGrad, active_node, active_gate, active_state); gradIn[i] = rGradIn; gradIg[i] = rGradIg; @@ -176,8 +182,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, int frameSize, } op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv, - rOut, rCheckI, rCheckF, rCheckO, hppl::avx::forward[active_node], - hppl::avx::forward[active_gate], hppl::avx::forward[active_state]); + rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state); valueIn[i] = rValueIn; valueIg[i] = rValueIg; @@ -246,8 +251,7 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, - rCheckOGrad, hppl::avx::backward[active_node], - hppl::avx::backward[active_gate], hppl::avx::backward[active_state]); + rCheckOGrad, active_node, active_gate, active_state); gradIn[i] = rGradIn; gradIg[i] = rGradIg; @@ -274,7 +278,8 @@ void cpu_lstm_forward(Op op, LstmMetaValue value, int frameSize, avx_lstm_forward_one_sequence(op, value, frameSize, active_node, active_gate, active_state); } else { - naive_lstm_forward_one_sequence(op, value, frameSize); + naive_lstm_forward_one_sequence(op, value, frameSize, active_node, + active_gate, active_state); } } @@ -287,7 +292,8 @@ void cpu_lstm_backward(Op op, LstmMetaValue value, LstmMetaGrad grad, avx_lstm_backward_one_sequence(op, value, grad, frameSize, active_node, active_gate, active_state); } else { - naive_lstm_backward_one_sequence(op, value, grad, frameSize); + naive_lstm_backward_one_sequence(op, value, grad, frameSize, active_node, + active_gate, active_state); } } diff --git a/paddle/operators/math/detail/lstm_gpu_kernel.h b/paddle/operators/math/detail/lstm_gpu_kernel.h index c06f164f84a..d3e5e381a50 100644 --- a/paddle/operators/math/detail/lstm_gpu_kernel.h +++ b/paddle/operators/math/detail/lstm_gpu_kernel.h @@ -13,13 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include -#include "paddle/operators/math/detail/hl_activation_functions.h" +#include "paddle/operators/math/detail/activation_functions.h" #include "paddle/operators/math/lstm_compute.h" #include "paddle/platform/cuda_helper.h" #include "paddle/platform/device_context.h" -#include +#include namespace paddle { namespace operators { @@ -32,7 +31,9 @@ namespace detail { */ template __global__ void KeLstmForward(Op op, LstmMetaValue value, int frameSize, - int batchSize) { + int batchSize, activation_mode_t active_node, + activation_mode_t active_gate, + activation_mode_t active_state) { const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; if (frameIdx >= frameSize) return; @@ -69,7 +70,7 @@ __global__ void KeLstmForward(Op op, LstmMetaValue value, int frameSize, } op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv, - rOut, rCheckI, rCheckF, rCheckO); + rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state); value.gateValue[frameIdx] = rValueIn; value.gateValue[frameIdx + frameSize] = rValueIg; @@ -88,7 +89,9 @@ __global__ void KeLstmForward(Op op, LstmMetaValue value, int frameSize, template __global__ void KeLstmBackward(Op op, LstmMetaValue value, LstmMetaGrad grad, int frameSize, - int batchSize) { + int batchSize, activation_mode_t active_node, + activation_mode_t active_gate, + activation_mode_t active_state) { const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; if (frameIdx >= frameSize) return; @@ -141,7 +144,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue value, op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, rOutputGrad, - rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, rCheckOGrad); + rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, rCheckOGrad, + active_node, active_gate, active_state); grad.gateGrad[frameIdx] = rGradIn; grad.gateGrad[frameIdx + frameSize] = rGradIg; @@ -197,11 +201,13 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op, if (batchSize == 1) { KeLstmForward<<>>( - op, value, frameSize, batchSize); + op, value, frameSize, batchSize, active_node, active_gate, + active_state); } else { KeLstmForward<<>>( - op, value, frameSize, batchSize); + op, value, frameSize, batchSize, active_node, active_gate, + active_state); } } @@ -230,11 +236,13 @@ void gpu_lstm_backward(const platform::DeviceContext& context, Op op, if (batchSize == 1) { KeLstmBackward<<>>( - op, value, grad, frameSize, batchSize); + op, value, grad, frameSize, batchSize, active_node, active_gate, + active_state); } else { KeLstmBackward<<>>( - op, value, grad, frameSize, batchSize); + op, value, grad, frameSize, batchSize, active_node, active_gate, + active_state); } } diff --git a/paddle/operators/math/detail/lstm_kernel.h b/paddle/operators/math/detail/lstm_kernel.h index 461039a4d51..9daaf91981a 100644 --- a/paddle/operators/math/detail/lstm_kernel.h +++ b/paddle/operators/math/detail/lstm_kernel.h @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/operators/math/detail/hl_activation_functions.h" +#include "paddle/operators/math/detail/activation_functions.h" #include "paddle/platform/hostdevice.h" #include @@ -24,45 +24,22 @@ namespace detail { namespace forward { -template -DEVICE inline T sigmoid(const T a) { - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - T tmp = (a < min) ? min : ((a > max) ? max : a); - return static_cast(1.0) / (static_cast(1.0) + exp(-tmp)); -} - -template -DEVICE inline T tanh(const T a) { - T tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - return (2.0 / (1.0 + exp(tmp))) - 1.0; -} - template class lstm { public: HOSTDEVICE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg, T &prevState, T &state, T &stateAtv, T &output, - T &checkI, T &checkF, T &checkO) { -#if 0 - // TODO(qingqing) support to activation speficed by users - valueIn = actInput(valueIn); - valueIg = actGate(valueIg + prevState * checkI); - valueFg = actGate(valueFg + prevState * checkF); - state = valueIn * valueIg + prevState * valueFg; - valueOg = actGate(valueOg + state * checkO); - stateAtv = actState(state); - output = valueOg * stateAtv; -#else - valueIn = tanh(valueIn); - valueIg = sigmoid(valueIg + prevState * checkI); - valueFg = sigmoid(valueFg + prevState * checkF); + T &checkI, T &checkF, T &checkO, + activation_mode_t active_node, + activation_mode_t active_gate, + activation_mode_t active_state) { + valueIn = activation(valueIn, active_node); + valueIg = activation(valueIg + prevState * checkI, active_gate); + valueFg = activation(valueFg + prevState * checkF, active_gate); state = valueIn * valueIg + prevState * valueFg; - valueOg = sigmoid(valueOg + state * checkO); - stateAtv = tanh(state); + valueOg = activation(valueOg + state * checkO, active_gate); + stateAtv = activation(state, active_state); output = valueOg * stateAtv; -#endif } #ifndef __NVCC__ #ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default @@ -75,16 +52,19 @@ class lstm { __m256 &valueOg, __m256 &prevState, __m256 &state, __m256 &stateAtv, __m256 &output, __m256 &checkI, __m256 &checkF, __m256 &checkO, - hppl::Active<__m256>::forward actInput, - hppl::Active<__m256>::forward actGate, - hppl::Active<__m256>::forward actState) { - valueIn = actInput(valueIn); - valueIg = actGate(_mm256_add_ps(valueIg, _mm256_mul_ps(prevState, checkI))); - valueFg = actGate(_mm256_add_ps(valueFg, _mm256_mul_ps(prevState, checkF))); + activation_mode_t active_node, + activation_mode_t active_gate, + activation_mode_t active_state) { + valueIn = activation(valueIn, active_node); + valueIg = activation( + _mm256_add_ps(valueIg, _mm256_mul_ps(prevState, checkI)), active_gate); + valueFg = activation( + _mm256_add_ps(valueFg, _mm256_mul_ps(prevState, checkF)), active_gate); state = _mm256_add_ps(_mm256_mul_ps(valueIn, valueIg), _mm256_mul_ps(prevState, valueFg)); - valueOg = actGate(_mm256_add_ps(valueOg, _mm256_mul_ps(state, checkO))); - stateAtv = actState(state); + valueOg = activation(_mm256_add_ps(valueOg, _mm256_mul_ps(state, checkO)), + active_gate); + stateAtv = activation(state, active_state); output = _mm256_mul_ps(valueOg, stateAtv); } #endif @@ -95,16 +75,6 @@ class lstm { namespace backward { -template -DEVICE inline T sigmoid(const T a, const T b) { - return a * b * (1.0 - b); -} - -template -DEVICE inline T tanh(const T a, const T b) { - return a * (1.0 - b * b); -} - template class lstm { public: @@ -113,29 +83,20 @@ class lstm { T &prevState, T &prevStateGrad, T &state, T &stateGrad, T &stateAtv, T &outputGrad, T &checkI, T &checkF, T &checkO, T &checkIGrad, - T &checkFGrad, T &checkOGrad) { -#if 0 - // TODO(qingqing) support to activation speficed by users - gradOg = actGate(outputGrad * stateAtv, valueOg); - stateGrad += actState(outputGrad * valueOg, stateAtv) + gradOg * checkO; - gradIn = actInput(stateGrad * valueIg, valueIn); - gradIg = actGate(stateGrad * valueIn, valueIg); - gradFg = actGate(stateGrad * prevState, valueFg); + T &checkFGrad, T &checkOGrad, + activation_mode_t active_node, + activation_mode_t active_gate, + activation_mode_t active_state) { + gradOg = activation(outputGrad * stateAtv, valueOg, active_gate); + stateGrad += activation(outputGrad * valueOg, stateAtv, active_state) + + gradOg * checkO; + gradIn = activation(stateGrad * valueIg, valueIn, active_node); + gradIg = activation(stateGrad * valueIn, valueIg, active_gate); + gradFg = activation(stateGrad * prevState, valueFg, active_gate); prevStateGrad = gradIg * checkI + gradFg * checkF + stateGrad * valueFg; checkIGrad = gradIg * prevState; checkFGrad = gradFg * prevState; checkOGrad = gradOg * state; -#else - gradOg = sigmoid(outputGrad * stateAtv, valueOg); - stateGrad += tanh(outputGrad * valueOg, stateAtv) + gradOg * checkO; - gradIn = tanh(stateGrad * valueIg, valueIn); - gradIg = sigmoid(stateGrad * valueIn, valueIg); - gradFg = sigmoid(stateGrad * prevState, valueFg); - prevStateGrad = gradIg * checkI + gradFg * checkF + stateGrad * valueFg; - checkIGrad = gradIg * prevState; - checkFGrad = gradFg * prevState; - checkOGrad = gradOg * state; -#endif } #ifndef __NVCC__ #ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default @@ -143,24 +104,26 @@ class lstm { #else // Only float support AVX optimization static const bool avx = std::is_same::value; - HOSTDEVICE void operator()(__m256 &valueIn, __m256 &valueIg, __m256 &valueFg, - __m256 &valueOg, __m256 &gradIn, __m256 &gradIg, - __m256 &gradFg, __m256 &gradOg, __m256 &prevState, - __m256 &prevStateGrad, __m256 &state, - __m256 &stateGrad, __m256 &stateAtv, - __m256 &outputGrad, __m256 &checkI, __m256 &checkF, - __m256 &checkO, __m256 &checkIGrad, - __m256 &checkFGrad, __m256 &checkOGrad, - hppl::Active<__m256>::backward actInput, - hppl::Active<__m256>::backward actGate, - hppl::Active<__m256>::backward actState) { - gradOg = actGate(_mm256_mul_ps(outputGrad, stateAtv), valueOg); + HOSTDEVICE void operator()( + __m256 &valueIn, __m256 &valueIg, __m256 &valueFg, __m256 &valueOg, + __m256 &gradIn, __m256 &gradIg, __m256 &gradFg, __m256 &gradOg, + __m256 &prevState, __m256 &prevStateGrad, __m256 &state, + __m256 &stateGrad, __m256 &stateAtv, __m256 &outputGrad, __m256 &checkI, + __m256 &checkF, __m256 &checkO, __m256 &checkIGrad, __m256 &checkFGrad, + __m256 &checkOGrad, activation_mode_t active_node, + activation_mode_t active_gate, activation_mode_t active_state) { + gradOg = + activation(_mm256_mul_ps(outputGrad, stateAtv), valueOg, active_gate); stateGrad = _mm256_add_ps( - actState(_mm256_mul_ps(outputGrad, valueOg), stateAtv), stateGrad); + activation(_mm256_mul_ps(outputGrad, valueOg), stateAtv, active_state), + stateGrad); stateGrad = _mm256_add_ps(_mm256_mul_ps(gradOg, checkO), stateGrad); - gradIn = actInput(_mm256_mul_ps(stateGrad, valueIg), valueIn); - gradIg = actGate(_mm256_mul_ps(stateGrad, valueIn), valueIg); - gradFg = actGate(_mm256_mul_ps(stateGrad, prevState), valueFg); + gradIn = + activation(_mm256_mul_ps(stateGrad, valueIg), valueIn, active_node); + gradIg = + activation(_mm256_mul_ps(stateGrad, valueIn), valueIg, active_gate); + gradFg = + activation(_mm256_mul_ps(stateGrad, prevState), valueFg, active_gate); prevStateGrad = _mm256_add_ps(_mm256_mul_ps(gradIg, checkI), _mm256_mul_ps(gradFg, checkF)); prevStateGrad = diff --git a/python/paddle/v2/framework/tests/test_lstm_op.py b/python/paddle/v2/framework/tests/test_lstm_op.py index fe7f9783e46..ff75160083f 100644 --- a/python/paddle/v2/framework/tests/test_lstm_op.py +++ b/python/paddle/v2/framework/tests/test_lstm_op.py @@ -157,7 +157,7 @@ class TestLstmOp(OpTest): } def test_check_output(self): - self.check_output() + self.check_output(atol=1e-8) #TODO(qingqing) add more unit testing case def test_check_grad(self): @@ -167,7 +167,7 @@ class TestLstmOp(OpTest): self.outputs['BatchCellPreAct'] = np.zeros( (N, self.D)).astype('float64') self.check_grad( - ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=0.02) + ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4) class TestLstmOpHasNoInitial(TestLstmOp): -- GitLab