diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h
index 1fe2774cc5a291dbafb61b50d63553b086512e4d..02fa6bc3ace32ff5ffe51dcce9c49757a990a9b2 100644
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -209,6 +209,15 @@ typedef struct {
 #define HL_FLOAT_MIN 2.2250738585072014e-308
 #endif
 
+
+/**
+ * The maximum input value for exp, used to avoid overflow problem.
+ *
+ * Currently only used for tanh function.
+ */
+#define EXP_MAX_INPUT 40.0
+
+
 /**
  * @brief DIVUP(x, y) is similar to ceil(x / y).
  * @note For CUDA, DIVUP will be used to specify
diff --git a/paddle/cuda/src/hl_avx_functions.cc b/paddle/cuda/src/hl_avx_functions.cc
index 2d471206f61f281eebf6939443a2b28470ecf808..08976180fff5b099475b1406b16f967655867e5b 100644
--- a/paddle/cuda/src/hl_avx_functions.cc
+++ b/paddle/cuda/src/hl_avx_functions.cc
@@ -38,7 +38,9 @@ namespace hppl {
   }
 
   __m256 tanh(const __m256 a) {
+    __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
     __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
+    tmp = _mm256_min_ps(tmp, max);
     tmp = exp(tmp);
     return _mm256_sub_ps(
         _mm256_div_ps(_mm256_set1_ps(2.0f),
diff --git a/paddle/cuda/src/hl_cpu_functions.cc b/paddle/cuda/src/hl_cpu_functions.cc
index 3fd6b278d053714a6b6f0fe33831a32e2c64e3ae..b8352c2d537fba5ec9cd3237fe8f3fa9c25cbffe 100644
--- a/paddle/cuda/src/hl_cpu_functions.cc
+++ b/paddle/cuda/src/hl_cpu_functions.cc
@@ -30,7 +30,9 @@ namespace hppl {
   }
 
   real tanh(const real a) {
-    return (2.0 / (1.0 + exp(-2.0*a))) - 1.0;
+    real tmp = -2.0 * a;
+    tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+    return (2.0 / (1.0 + exp(tmp))) - 1.0;
   }
 
   real linear(const real a) {
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 5397b952bced8f572204e98f01adf3b3ba71d1ba..4e01fa91ed2ba6b40882d9995e52e9dbeb37f57e 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -995,7 +995,7 @@ TEST(Layer, LstmLayer) {
   TestConfig config;
   config.layerConfig.set_type("lstmemory");
   config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_active_type("tanh");
   config.layerConfig.set_active_state_type("sigmoid");
   config.layerConfig.set_active_gate_type("sigmoid");
   config.biasSize = 28;
diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp
index 9b933b153d158bef565c0964232525ba99b8b3d4..1c8497e8c526f84cabf6e0862ea96653f99f64be 100644
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -369,7 +369,7 @@ TEST(Layer, LstmLayer) {
   LayerConfig layerConfig;
   layerConfig.set_type("lstmemory");
   layerConfig.set_active_type("relu");
-  layerConfig.set_active_state_type("sigmoid");
+  layerConfig.set_active_state_type("tanh");
   layerConfig.set_active_gate_type("sigmoid");
 
   layerConfig.add_inputs();
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index 8b888b1ee5e46ec5cac316d9f90095a7e314ae13..d81b99e5441584b21fb023dcae65ccec7dd27996 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -625,7 +625,10 @@ void BaseMatrixT<T>::squareDerivative(BaseMatrixT& b) {
   applyBinary(binary::SquareDerivative<T>(), b);
 }
 
-DEFINE_MATRIX_BINARY_OP(Tanh, b = 2.0 / (1.0 + exp(-2 * a)) - 1.0);
+DEFINE_MATRIX_BINARY_OP(Tanh,
+                        T tmp = -2.0 * a;
+                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
 template<>
 void BaseMatrixT<real>::tanh(BaseMatrixT& b) {
   applyBinary(binary::Tanh<real>(), b);
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index f8132066477db3b9762348e9baf7a5112d302fd6..e0b2a2bb5b2cdbd845d9be08a8926f0514398458 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -200,7 +200,10 @@ void vLog1p(const int n, const T* a, T* r) {
       binary::vLog1p<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }
 
-DEFINE_MATRIX_BINARY_OP(vTanh, b = 2.0 / (1.0 + std::exp(-2 * a)) - 1.0);
+DEFINE_MATRIX_BINARY_OP(vTanh,
+                        T tmp = -2.0 * a;
+                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
 template<class T>
 void vTanh(const int n, const T* a, T* r) {
   hl_cpu_apply_binary_op<T, binary::vTanh<T>, 0, 0>(
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index d901ba93492ac548c803c526b23001e5deb4fe51..4fc9b2d0893665c3d478fbec55f810c5bc99e236 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -3471,9 +3471,7 @@ void CpuMatrix::tanh(Matrix& output) {
   size_t dim = getWidth();
   CHECK_EQ(output.getHeight(), numSamples);
   CHECK_EQ(output.getWidth(), dim);
-  errno = 0;
   vTanh(numSamples * dim, getData(), output.getData());
-  CHECK_EQ(errno, 0) << "vTanh error";
 }
 
 void CpuMatrix::tanhDerivative(Matrix& output) {
@@ -3495,10 +3493,8 @@ void CpuMatrix::softrelu(Matrix& output) {
       out[j] = x;
     }
   }
-  errno = 0;
   vExp(numSamples * dim, output.getData(), output.getData());
   vLog1p(numSamples * dim, output.getData(), output.getData());
-  CHECK_EQ(errno, 0) << "vExp+vLog1p error";
 }
 
 void CpuMatrix::softreluDerivative(Matrix& output) {
@@ -3513,9 +3509,7 @@ void CpuMatrix::softreluDerivative(Matrix& output) {
   MatrixPtr tmpMat = Matrix::create(numSamples, dim);
   real* tmp = tmpMat->getData();
 
-  errno = 0;
   vExp(size, output.getData(), tmpMat->getData());
-  CHECK_EQ(errno, 0) << "vExp error";
 
   for (size_t i = 0; i < size; ++i) {
     grad[i] *= (1.0 - 1.0 / tmp[i]);
@@ -3538,10 +3532,7 @@ void CpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
     out[i] = p2 * in[i];
   }
 
-  // out = tanh(out)
-  errno = 0;
   vTanh(numSamples * dim, out, out);
-  CHECK_EQ(errno, 0) << "vTanh error";
 
   // out = p1 * out
   for (size_t i = 0; i < numSamples * dim; ++i) {
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index eb72f11e1c6538cd2c66bc56dbc8686a942bd308..247be983ba3296383c8e2f30f1036859ecfde492 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -13,3 +13,4 @@ add_simple_unittest(test_sparseMatrixCompare)
 add_simple_unittest(test_perturbation)
 add_simple_unittest(test_CpuGpuVector)
 add_simple_unittest(test_Allocator)
+add_simple_unittest(test_FPException)
diff --git a/paddle/math/tests/test_FPException.cpp b/paddle/math/tests/test_FPException.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..174278c2aaac4575a6ea0b219bf7a389db712703
--- /dev/null
+++ b/paddle/math/tests/test_FPException.cpp
@@ -0,0 +1,94 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+
+/**
+ * This test is about floating point calculation exceptions.
+ * Paddle catches FE_INVALID, FE_DIVBYZERO and FE_OVERFLOW exceptions.
+ *
+ * Some exceptions occur in the middle of a set of formulas
+ * and can be circumvented by some tricks.
+ * For example,
+ * when calculating tanh
+ * b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
+ *
+ * If the result of (-2 * a) is too large,
+ * a FE_OVERFLOW exception occurs when calculating exp.
+ * But the result of tanh has no overflow problem,
+ * so we can add a trick to prevent exp from computing an excessively large value.
+ *
+ */
+#include <fenv.h>
+#include <gtest/gtest.h>
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/Excepts.h"
+
+using namespace paddle;  // NOLINT
+
+void SetTensorValue(Matrix& matrix, real value) {
+  int height = matrix.getHeight();
+  int width = matrix.getWidth();
+  int stride = matrix.getStride();
+  real* data = matrix.getData();
+  for (int i = 0; i < height; i++) {
+    int j = rand() % width;  // NOLINT
+    if (typeid(matrix) == typeid(CpuMatrix)) {
+      data[i * stride + j] = value;
+    } else if (typeid(matrix) == typeid(GpuMatrix)) {
+      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
+    } else {
+      LOG(FATAL) << "should not reach here";
+    }
+  }
+}
+
+template <class Matrix>
+void testTanh(real illegal) {
+  MatrixPtr A = std::make_shared<Matrix>(10, 10);
+  MatrixPtr B = std::make_shared<Matrix>(10, 10);
+  A->randomizeUniform();
+  B->randomizeUniform();
+
+  SetTensorValue(*A, illegal);
+
+  A->tanh(*B);
+}
+
+template <class Matrix>
+void testSigmoid(real illegal) {
+  MatrixPtr A = std::make_shared<Matrix>(10, 10);
+  MatrixPtr B = std::make_shared<Matrix>(10, 10);
+  A->randomizeUniform();
+  B->randomizeUniform();
+
+  SetTensorValue(*A, illegal);
+
+  A->sigmoid(*B);
+}
+
+TEST(fp, overflow) {
+  for (auto illegal : {-90.0, 90.0}) {
+    LOG(INFO) << " illegal=" << illegal;
+    testTanh<CpuMatrix>(illegal);
+    testSigmoid<CpuMatrix>(illegal);
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+
+  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
+  return RUN_ALL_TESTS();
+}
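
Reviewer note, not part of the patch: below is a minimal standalone sketch of the clamping trick the patch applies, assuming a Linux/glibc toolchain (feenableexcept is a glibc extension); the helper name safe_tanh and the sample inputs are illustrative only, not Paddle code. Clamping the exp() argument at EXP_MAX_INPUT = 40 cannot change the tanh result, because 2 / (1 + exp(40)) is already below 1e-17 and vanishes against 1 in both float and double precision, while the clamp keeps FE_OVERFLOW from ever being raised.

// sketch: clamp the exp() input as the patch does, then compare with std::tanh
#include <fenv.h>   // feenableexcept, FE_* macros (glibc)
#include <cmath>
#include <cstdio>

#define EXP_MAX_INPUT 40.0f  // same constant the patch adds to hl_base.h (as float here)

// tanh(a) = 2 / (1 + exp(-2a)) - 1, with the exp() input clamped as in the patch.
// float mirrors Paddle's default single-precision `real`.
float safe_tanh(float a) {
  float tmp = -2.0f * a;
  tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
  return 2.0f / (1.0f + std::exp(tmp)) - 1.0f;
}

int main() {
  // Trap the same exceptions the new unit test enables.
  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
  for (float a : {-90.0f, -1.0f, 0.0f, 1.0f, 90.0f}) {
    // Unclamped, a = -90 would need exp(180), which overflows a float
    // (exp overflows above ~88.7) and would trap FE_OVERFLOW here;
    // clamped to exp(40) the intermediate stays finite and tanh is unchanged.
    std::printf("a = %6.1f  clamped tanh = %+.6f  std::tanh = %+.6f\n",
                a, safe_tanh(a), std::tanh(a));
  }
  return 0;
}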