fix floating-point overflow problem of tanh (#355)

a07da949 · hedaoyuan · Yu Yang · 56b23d18 · a07da949 · a07da949
10 changed file
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -209,6 +209,15 @@ typedef struct {
 #define HL_FLOAT_MIN        2.2250738585072014e-308
 #endif

+
+/**
+ * The maximum input value for exp, used to avoid overflow problem.
+ *
+ * Currently only used for tanh function.
+ */
+#define EXP_MAX_INPUT       40.0
+
+
 /**
 * @brief DIVUP(x, y) is similar to ceil(x / y).
 * @note  For CUDA, DIVUP will be used to specify

--- a/paddle/cuda/src/hl_avx_functions.cc
+++ b/paddle/cuda/src/hl_avx_functions.cc
@@ -38,7 +38,9 @@ namespace hppl {
  }

  __m256 tanh(const __m256 a) {
+    __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
    __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
+    tmp = _mm256_min_ps(tmp, max);
    tmp = exp(tmp);
    return _mm256_sub_ps(
        _mm256_div_ps(_mm256_set1_ps(2.0f),

--- a/paddle/cuda/src/hl_cpu_functions.cc
+++ b/paddle/cuda/src/hl_cpu_functions.cc
@@ -30,7 +30,9 @@ namespace hppl {
  }

  real tanh(const real a) {
-    return (2.0 / (1.0 + exp(-2.0*a))) - 1.0;
+    real tmp = -2.0 * a;
+    tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+    return (2.0 / (1.0 + exp(tmp))) - 1.0;
  }

  real linear(const real a) {

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -995,7 +995,7 @@ TEST(Layer, LstmLayer) {
  TestConfig config;
  config.layerConfig.set_type("lstmemory");
  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_active_type("tanh");
  config.layerConfig.set_active_state_type("sigmoid");
  config.layerConfig.set_active_gate_type("sigmoid");
  config.biasSize = 28;

--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -369,7 +369,7 @@ TEST(Layer, LstmLayer) {
  LayerConfig layerConfig;
  layerConfig.set_type("lstmemory");
  layerConfig.set_active_type("relu");
-  layerConfig.set_active_state_type("sigmoid");
+  layerConfig.set_active_state_type("tanh");
  layerConfig.set_active_gate_type("sigmoid");

  layerConfig.add_inputs();

--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -625,7 +625,10 @@ void BaseMatrixT<T>::squareDerivative(BaseMatrixT& b) {
  applyBinary(binary::SquareDerivative<T>(), b);
 }

-DEFINE_MATRIX_BINARY_OP(Tanh, b = 2.0 / (1.0 + exp(-2 * a)) - 1.0);
+DEFINE_MATRIX_BINARY_OP(Tanh,
+    T tmp = -2.0 * a;
+    tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+    b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
 template<>
 void BaseMatrixT<real>::tanh(BaseMatrixT& b) {
  applyBinary(binary::Tanh<real>(), b);

--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -200,7 +200,10 @@ void vLog1p(const int n, const T* a, T* r) {
    binary::vLog1p<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }

-DEFINE_MATRIX_BINARY_OP(vTanh, b = 2.0 / (1.0 + std::exp(-2 * a)) - 1.0);
+DEFINE_MATRIX_BINARY_OP(vTanh,
+    T tmp = -2.0 * a;
+    tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+    b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
 template<class T>
 void vTanh(const int n, const T* a, T* r) {
  hl_cpu_apply_binary_op<T, binary::vTanh<T>, 0, 0>(

--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -3471,9 +3471,7 @@ void CpuMatrix::tanh(Matrix& output) {
  size_t dim = getWidth();
  CHECK_EQ(output.getHeight(), numSamples);
  CHECK_EQ(output.getWidth(), dim);
-  errno = 0;
  vTanh(numSamples * dim, getData(), output.getData());
-  CHECK_EQ(errno, 0) << "vTanh error";
 }

 void CpuMatrix::tanhDerivative(Matrix& output) {
@@ -3495,10 +3493,8 @@ void CpuMatrix::softrelu(Matrix& output) {
      out[j] = x;
    }
  }
-  errno = 0;
  vExp(numSamples * dim, output.getData(), output.getData());
  vLog1p(numSamples * dim, output.getData(), output.getData());
-  CHECK_EQ(errno, 0) << "vExp+vLog1p error";
 }

 void CpuMatrix::softreluDerivative(Matrix& output) {
@@ -3513,9 +3509,7 @@ void CpuMatrix::softreluDerivative(Matrix& output) {
  MatrixPtr tmpMat = Matrix::create(numSamples, dim);
  real* tmp = tmpMat->getData();

-  errno = 0;
  vExp(size, output.getData(), tmpMat->getData());
-  CHECK_EQ(errno, 0) << "vExp error";

  for (size_t i = 0; i < size; ++i) {
    grad[i] *= (1.0 - 1.0 / tmp[i]);
@@ -3538,10 +3532,7 @@ void CpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
    out[i] = p2 * in[i];
  }

-  // out = tanh(out)
-  errno = 0;
  vTanh(numSamples * dim, out, out);
-  CHECK_EQ(errno, 0) << "vTanh error";

  // out = p1 * out
  for (size_t i = 0; i < numSamples * dim; ++i) {

--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -13,3 +13,4 @@ add_simple_unittest(test_sparseMatrixCompare)
 add_simple_unittest(test_perturbation)
 add_simple_unittest(test_CpuGpuVector)
 add_simple_unittest(test_Allocator)
+add_simple_unittest(test_FPException)
--- a/paddle/math/tests/test_FPException.cpp
+++ b/paddle/math/tests/test_FPException.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+
+/**
+ * This test is about floating point calculation exception.
+ * Paddle catches FE_INVALID, FE DIVBYZERO and FE_OVERFLOW exceptions.
+ *
+ * Some exceptions occur in the middle of a set of formulas, 
+ * that can be circumvented by some tricks.
+ * For example, 
+ * calculate tanh
+ *   b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
+ *
+ * If the result of (-2 * a) is too large,
+ * a FE_OVERFLOW exception occurs when calculating exp.
+ * But the result of tanh is no overflow problem,
+ * so we can add some tricks to prevent exp calculate an excessive value.
+ *
+ */
+#include <fenv.h>
+#include <gtest/gtest.h>
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/Excepts.h"
+
+using namespace paddle;     // NOLINT
+
+void SetTensorValue(Matrix& matrix, real value) {
+  int height = matrix.getHeight();
+  int width = matrix.getWidth();
+  int stride = matrix.getStride();
+  real* data = matrix.getData();
+  for (int i = 0; i < height; i++) {
+    int j = rand() % width;  // NOLINT
+    if (typeid(matrix) == typeid(CpuMatrix)) {
+      data[i * stride + j] = value;
+    } else if (typeid(matrix) == typeid(GpuMatrix)) {
+      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
+    } else {
+      LOG(FATAL) << "should not reach here";
+    }
+  }
+}
+
+template<typename Matrix>
+void testTanh(real illegal) {
+  MatrixPtr A = std::make_shared<Matrix>(10, 10);
+  MatrixPtr B = std::make_shared<Matrix>(10, 10);
+  A->randomizeUniform();
+  B->randomizeUniform();
+
+  SetTensorValue(*A, illegal);
+
+  A->tanh(*B);
+}
+
+template<typename Matrix>
+void testSigmoid(real illegal) {
+  MatrixPtr A = std::make_shared<Matrix>(10, 10);
+  MatrixPtr B = std::make_shared<Matrix>(10, 10);
+  A->randomizeUniform();
+  B->randomizeUniform();
+
+  SetTensorValue(*A, illegal);
+
+  A->sigmoid(*B);
+}
+
+TEST(fp, overflow) {
+  for (auto illegal : {-90.0, 90.0}) {
+    LOG(INFO) << " illegal=" << illegal;
+    testTanh<CpuMatrix>(illegal);
+    testSigmoid<CpuMatrix>(illegal);
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+
+  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
+  return RUN_ALL_TESTS();
+}