diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h
index 1fe2774cc5a291dbafb61b50d63553b086512e4d..02fa6bc3ace32ff5ffe51dcce9c49757a990a9b2 100644
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -209,6 +209,15 @@ typedef struct {
 #define HL_FLOAT_MIN 2.2250738585072014e-308
 #endif
 
+
+/**
+ * The maximum input value for exp, used to avoid overflow problem.
+ *
+ * Currently only used for tanh function.
+ */
+#define EXP_MAX_INPUT 40.0
+
+
 /**
  * @brief DIVUP(x, y) is similar to ceil(x / y).
  * @note For CUDA, DIVUP will be used to specify
diff --git a/paddle/cuda/src/hl_avx_functions.cc b/paddle/cuda/src/hl_avx_functions.cc
index 2d471206f61f281eebf6939443a2b28470ecf808..08976180fff5b099475b1406b16f967655867e5b 100644
--- a/paddle/cuda/src/hl_avx_functions.cc
+++ b/paddle/cuda/src/hl_avx_functions.cc
@@ -38,7 +38,9 @@ namespace hppl {
   }
 
   __m256 tanh(const __m256 a) {
+    __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
     __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
+    tmp = _mm256_min_ps(tmp, max);
     tmp = exp(tmp);
     return _mm256_sub_ps(
         _mm256_div_ps(_mm256_set1_ps(2.0f),
diff --git a/paddle/cuda/src/hl_cpu_functions.cc b/paddle/cuda/src/hl_cpu_functions.cc
index 3fd6b278d053714a6b6f0fe33831a32e2c64e3ae..b8352c2d537fba5ec9cd3237fe8f3fa9c25cbffe 100644
--- a/paddle/cuda/src/hl_cpu_functions.cc
+++ b/paddle/cuda/src/hl_cpu_functions.cc
@@ -30,7 +30,9 @@ namespace hppl {
   }
 
   real tanh(const real a) {
-    return (2.0 / (1.0 + exp(-2.0*a))) - 1.0;
+    real tmp = -2.0 * a;
+    tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+    return (2.0 / (1.0 + exp(tmp))) - 1.0;
   }
 
   real linear(const real a) {
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 5397b952bced8f572204e98f01adf3b3ba71d1ba..4e01fa91ed2ba6b40882d9995e52e9dbeb37f57e 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -995,7 +995,7 @@ TEST(Layer, LstmLayer) {
   TestConfig config;
   config.layerConfig.set_type("lstmemory");
   config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_active_type("tanh");
   config.layerConfig.set_active_state_type("sigmoid");
   config.layerConfig.set_active_gate_type("sigmoid");
   config.biasSize = 28;
diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp
index 9b933b153d158bef565c0964232525ba99b8b3d4..1c8497e8c526f84cabf6e0862ea96653f99f64be 100644
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -369,7 +369,7 @@ TEST(Layer, LstmLayer) {
   LayerConfig layerConfig;
   layerConfig.set_type("lstmemory");
   layerConfig.set_active_type("relu");
-  layerConfig.set_active_state_type("sigmoid");
+  layerConfig.set_active_state_type("tanh");
   layerConfig.set_active_gate_type("sigmoid");
 
   layerConfig.add_inputs();
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index 8b888b1ee5e46ec5cac316d9f90095a7e314ae13..d81b99e5441584b21fb023dcae65ccec7dd27996 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -625,7 +625,10 @@ void BaseMatrixT<T>::squareDerivative(BaseMatrixT& b) {
   applyBinary(binary::SquareDerivative<T>(), b);
 }
 
-DEFINE_MATRIX_BINARY_OP(Tanh, b = 2.0 / (1.0 + exp(-2 * a)) - 1.0);
+DEFINE_MATRIX_BINARY_OP(Tanh,
+                        T tmp = -2.0 * a;
+                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
 template<>
 void BaseMatrixT<real>::tanh(BaseMatrixT& b) {
   applyBinary(binary::Tanh<real>(), b);
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index f8132066477db3b9762348e9baf7a5112d302fd6..e0b2a2bb5b2cdbd845d9be08a8926f0514398458 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -200,7 +200,10 @@ void vLog1p(const int n, const T* a, T* r) {
       binary::vLog1p<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }
 
-DEFINE_MATRIX_BINARY_OP(vTanh, b = 2.0 / (1.0 + std::exp(-2 * a)) - 1.0);
+DEFINE_MATRIX_BINARY_OP(vTanh,
+                        T tmp = -2.0 * a;
+                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
 template<class T>
 void vTanh(const int n, const T* a, T* r) {
   hl_cpu_apply_binary_op<T, binary::vTanh<T>, 0, 0>(
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index d901ba93492ac548c803c526b23001e5deb4fe51..4fc9b2d0893665c3d478fbec55f810c5bc99e236 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -3471,9 +3471,7 @@ void CpuMatrix::tanh(Matrix& output) {
   size_t dim = getWidth();
   CHECK_EQ(output.getHeight(), numSamples);
   CHECK_EQ(output.getWidth(), dim);
-  errno = 0;
   vTanh(numSamples * dim, getData(), output.getData());
-  CHECK_EQ(errno, 0) << "vTanh error";
 }
 
 void CpuMatrix::tanhDerivative(Matrix& output) {
@@ -3495,10 +3493,8 @@ void CpuMatrix::softrelu(Matrix& output) {
       out[j] = x;
     }
   }
-  errno = 0;
   vExp(numSamples * dim, output.getData(), output.getData());
   vLog1p(numSamples * dim, output.getData(), output.getData());
-  CHECK_EQ(errno, 0) << "vExp+vLog1p error";
 }
 
 void CpuMatrix::softreluDerivative(Matrix& output) {
@@ -3513,9 +3509,7 @@ void CpuMatrix::softreluDerivative(Matrix& output) {
   MatrixPtr tmpMat = Matrix::create(numSamples, dim);
   real* tmp = tmpMat->getData();
 
-  errno = 0;
   vExp(size, output.getData(), tmpMat->getData());
-  CHECK_EQ(errno, 0) << "vExp error";
 
   for (size_t i = 0; i < size; ++i) {
     grad[i] *= (1.0 - 1.0 / tmp[i]);
@@ -3538,10 +3532,7 @@ void CpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
     out[i] = p2 * in[i];
   }
 
-  // out = tanh(out)
-  errno = 0;
   vTanh(numSamples * dim, out, out);
-  CHECK_EQ(errno, 0) << "vTanh error";
 
   // out = p1 * out
   for (size_t i = 0; i < numSamples * dim; ++i) {
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index eb72f11e1c6538cd2c66bc56dbc8686a942bd308..247be983ba3296383c8e2f30f1036859ecfde492 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -13,3 +13,4 @@ add_simple_unittest(test_sparseMatrixCompare)
 add_simple_unittest(test_perturbation)
 add_simple_unittest(test_CpuGpuVector)
 add_simple_unittest(test_Allocator)
+add_simple_unittest(test_FPException)
diff --git a/paddle/math/tests/test_FPException.cpp b/paddle/math/tests/test_FPException.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..174278c2aaac4575a6ea0b219bf7a389db712703
--- /dev/null
+++ b/paddle/math/tests/test_FPException.cpp
@@ -0,0 +1,94 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+
+/**
+ * This test is about floating point calculation exceptions.
+ * Paddle catches FE_INVALID, FE_DIVBYZERO and FE_OVERFLOW exceptions.
+ *
+ * Some exceptions occur in the middle of a set of formulas
+ * and can be circumvented by some tricks.
+ * For example,
+ * when calculating tanh
+ * b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
+ *
+ * If the result of (-2 * a) is too large,
+ * a FE_OVERFLOW exception occurs when calculating exp.
+ * But the result of tanh has no overflow problem,
+ * so we can add a trick to prevent exp from computing an excessively large value.
+ *
+ */
+#include <fenv.h>
+#include <gtest/gtest.h>
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/Excepts.h"
+
+using namespace paddle;  // NOLINT
+
+void SetTensorValue(Matrix& matrix, real value) {
+  int height = matrix.getHeight();
+  int width = matrix.getWidth();
+  int stride = matrix.getStride();
+  real* data = matrix.getData();
+  for (int i = 0; i < height; i++) {
+    int j = rand() % width;  // NOLINT
+    if (typeid(matrix) == typeid(CpuMatrix)) {
+      data[i * stride + j] = value;
+    } else if (typeid(matrix) == typeid(GpuMatrix)) {
+      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
+    } else {
+      LOG(FATAL) << "should not reach here";
+    }
+  }
+}
+
+template <class Matrix>
+void testTanh(real illegal) {
+  MatrixPtr A = std::make_shared<Matrix>(10, 10);
+  MatrixPtr B = std::make_shared<Matrix>(10, 10);
+  A->randomizeUniform();
+  B->randomizeUniform();
+
+  SetTensorValue(*A, illegal);
+
+  A->tanh(*B);
+}
+
+template <class Matrix>
+void testSigmoid(real illegal) {
+  MatrixPtr A = std::make_shared<Matrix>(10, 10);
+  MatrixPtr B = std::make_shared<Matrix>(10, 10);
+  A->randomizeUniform();
+  B->randomizeUniform();
+
+  SetTensorValue(*A, illegal);
+
+  A->sigmoid(*B);
+}
+
+TEST(fp, overflow) {
+  for (auto illegal : {-90.0, 90.0}) {
+    LOG(INFO) << " illegal=" << illegal;
+    testTanh<CpuMatrix>(illegal);
+    testSigmoid<CpuMatrix>(illegal);
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+
+  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
+  return RUN_ALL_TESTS();
+}
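
Reviewer note, not part of the patch: below is a minimal standalone sketch of the clamping trick the patch applies, assuming a Linux/glibc toolchain (feenableexcept is a glibc extension); the helper name safe_tanh and the sample inputs are illustrative only, not Paddle code. Clamping the exp() argument at EXP_MAX_INPUT = 40 cannot change the tanh result, because 2 / (1 + exp(40)) is already below 1e-17 and vanishes against 1 in both float and double precision, while the clamp keeps FE_OVERFLOW from ever being raised.

// sketch: clamp the exp() input as the patch does, then compare with std::tanh
#include <fenv.h>   // feenableexcept, FE_* macros (glibc)
#include <cmath>
#include <cstdio>

#define EXP_MAX_INPUT 40.0f  // same constant the patch adds to hl_base.h (as float here)

// tanh(a) = 2 / (1 + exp(-2a)) - 1, with the exp() input clamped as in the patch.
// float mirrors Paddle's default single-precision `real`.
float safe_tanh(float a) {
  float tmp = -2.0f * a;
  tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
  return 2.0f / (1.0f + std::exp(tmp)) - 1.0f;
}

int main() {
  // Trap the same exceptions the new unit test enables.
  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
  for (float a : {-90.0f, -1.0f, 0.0f, 1.0f, 90.0f}) {
    // Unclamped, a = -90 would need exp(180), which overflows a float
    // (exp overflows above ~88.7) and would trap FE_OVERFLOW here;
    // clamped to exp(40) the intermediate stays finite and tanh is unchanged.
    std::printf("a = %6.1f  clamped tanh = %+.6f  std::tanh = %+.6f\n",
                a, safe_tanh(a), std::tanh(a));
  }
  return 0;
}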