diff --git a/.travis.yml b/.travis.yml
index 7812ac02837895a32fcad36158814268e93a4da8..74aa767febeb17e3defb85697153e17b4c3fcb0f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -38,10 +38,21 @@ addons:
       - curl
       - lcov
       - graphviz
+      - swig
 before_install:
+  - |
+    if [ ${JOB} == "BUILD_AND_TEST" ]; then
+      if [ "$TRAVIS_PULL_REQUEST" != "false" ]; then
+        TRAVIS_COMMIT_RANGE="FETCH_HEAD...$TRAVIS_BRANCH"
+      fi
+      git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)' || {
+        echo "Only markdown docs were updated, stopping build process."
+        exit
+      }
+    fi
  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
  - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
-  - pip install wheel protobuf sphinx breathe recommonmark
+  - pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy
 script:
  - paddle/scripts/travis/main.sh
 notifications:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 282e3e199ef440092550deec906019bc44bc73bd..39f876bc9ee4b34ef512cfaaf5aae7752920c33f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,11 +95,26 @@ if(NOT WITH_GPU)
     add_definitions(-DHPPL_STUB_FUNC)
     list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
+    if(${CUDA_VERSION_MAJOR} GREATER 6)
+        if(COMPILER_SUPPORT_CXX11)
+            LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
+        endif()
+    endif()
+
+    # TODO(yuyang18): Change it to remove std=c++11 in cuda compile.
     set(CUDA_PROPAGATE_HOST_FLAGS OFF)
     if(NOT CUDNN_FOUND)
         message(FATAL_ERROR "Paddle need cudnn to compile")
     endif()
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-g -O3 --use_fast_math")
+
+    if(WITH_AVX)
+        if(AVX_FOUND)
+            set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler -mavx")
+        endif(AVX_FOUND)
+    else(WITH_AVX)
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler -msse3")
+    endif(WITH_AVX)
 
     if(WITH_DSO)
         set(CUDA_LIBRARIES "")
diff --git a/README.md b/README.md
index 66767d7ff8e4acf8ef246f7e0129a66e64486727..81ff8c7122ab8f1e39ef14a056532bb85cc57c77 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ developed by Baidu scientists and engineers for the purpose of applying deep
 learning to many products at Baidu.
 
 Our vision is to enable deep learning for everyone via PaddlePaddle.
-Please refer to our [release log](https://github.com/baidu/Paddle/releases) to track the latest feature of PaddlePaddle.
+Please refer to our [release announcement](https://github.com/baidu/Paddle/releases) to track the latest feature of PaddlePaddle.
 
 ## Features
diff --git a/cmake/util.cmake b/cmake/util.cmake
index 3f78cd08c390550790b7145c412de32351873e4e..a8282f07184c34f77d506ed7ef40206fbbd55b41 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -188,14 +188,6 @@ macro(add_simple_unittest TARGET_NAME)
     add_unittest(${TARGET_NAME} ${TARGET_NAME}.cpp)
 endmacro()
 
-macro(add_paddle_culib TARGET_NAME)
-    set(NVCC_FLAG ${CUDA_NVCC_FLAGS})
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--use_fast_math)
-    cuda_add_library(${TARGET_NAME} STATIC ${ARGN})
-    set(CUDA_NVCC_FLAGS ${NVCC_FLAG})
-endmacro()
-
-
 # Creates C resources file from files in given resource file
 function(create_resources res_file output)
     # Create empty output file
diff --git a/demo/quick_start/preprocess.sh b/demo/quick_start/preprocess.sh
index fe2acbbd74898fa3d12ddee3271658043c43e32e..58a72147c5e41351634395e770e9a214ed3cb01d 100755
--- a/demo/quick_start/preprocess.sh
+++ b/demo/quick_start/preprocess.sh
@@ -21,14 +21,21 @@
 set -e
 export LC_ALL=C
 
+UNAME_STR=`uname`
+
+if [[ ${UNAME_STR} == 'Linux' ]]; then
+  SHUF_PROG='shuf'
+else
+  SHUF_PROG='gshuf'
+fi
+
 mkdir -p data/tmp
 python preprocess.py -i data/reviews_Electronics_5.json.gz
 # uniq and shuffle
 cd data/tmp
 echo 'uniq and shuffle...'
-cat pos_*|sort|uniq|shuf> pos.shuffed
-cat neg_*|sort|uniq|shuf> neg.shuffed
+cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed
+cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed
 
 min_len=`sed -n '$=' neg.shuffed`
 test_num=$((min_len/10))
@@ -42,8 +49,8 @@ head -n$train_num neg.shuffed >train.neg
 tail -n$test_num pos.shuffed >test.pos
 tail -n$test_num neg.shuffed >test.neg
 
-cat train.pos train.neg|shuf>../train.txt
-cat test.pos test.neg|shuf>../test.txt
+cat train.pos train.neg | ${SHUF_PROG} >../train.txt
+cat test.pos test.neg | ${SHUF_PROG} >../test.txt
 
 cd -
 echo 'data/train.txt' > data/train.list
diff --git a/doc/build/build_from_source.md b/doc/build/build_from_source.md
index c37234d3ef14dfcfeaa1f34b0565e40e0672edc0..b8f26f431eb7a04147fe791a8c805427c827fe09 100644
--- a/doc/build/build_from_source.md
+++ b/doc/build/build_from_source.md
@@ -4,7 +4,6 @@ Installing from Sources
 * [1. Download and Setup](#download)
 * [2. Requirements](#requirements)
 * [3. Build on Ubuntu](#ubuntu)
-* [4. Build on Mac OS X](#mac)
 
 ## Download and Setup
 You can download PaddlePaddle from the [github source](https://github.com/gangliao/Paddle).
@@ -191,121 +190,3 @@ sudo pip install /opt/paddle/share/wheels/*.whl
 # or just run
 sudo paddle version
 ```
-
-## Building on Mac OS X
-
-### Prerequisites
-This guide is based on Mac OS X 10.11 (El Capitan). Note that if you are running an up to date version of OS X,
-you will already have Python 2.7.10 and Numpy 1.8 installed.
-
-The best option is to use the package manager homebrew to handle installations and upgrades for you.
-To install [homebrew](http://brew.sh/), first open a terminal window (you can find Terminal in the Utilities folder in Applications), and issue the command:
-
-```bash
-# install brew
-/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
-# install pip
-easy_install pip
-```
-
-### Install Dependencies
-
-- **CPU Dependencies**
-
-  ```bash
-  # Install fundamental dependents
-  brew install glog gflags cmake protobuf openblas
-
-  # Install google test on Mac OS X
-  # Download gtest 1.7.0
-  wget https://github.com/google/googletest/archive/release-1.7.0.tar.gz
-  tar -xzf googletest-release-1.7.0.tar.gz && cd googletest-release-1.7.0
-  # Build gtest
-  mkdir build && cd build && cmake .. && make
-  # Install gtest library
-  sudo cp -r ../include/gtest /usr/local/include/
-  sudo cp lib*.a /usr/local/lib
-  ```
-
-- **GPU Dependencies(optional)**
-
-  To build GPU version, you will need the following installed:
-
-      1. a CUDA-capable GPU
-      2. Mac OS X 10.11 or later
-      2. the Clang compiler and toolchain installed using Xcode
-      3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
-      4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn)
-
-  The CUDA development environment relies on tight integration with the host development environment,
-  including the host compiler and C runtime libraries, and is therefore only supported on
-  distribution versions that have been qualified for this CUDA Toolkit release.
-
-  1. After downloading cuDNN library, issue the following commands:
-
-     ```bash
-     sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local
-     sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib/libcudnn*
-     ```
-  2. Then you need to set DYLD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
-
-     ```bash
-     export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH
-     export PATH=/usr/local/cuda/bin:$PATH
-     ```
-
-### Build and Install
-
-As usual, the best option is to create build folder under paddle project directory.
-
-```bash
-mkdir build && cd build
-cmake ..
-```
-
-CMake first check PaddlePaddle's dependencies in system default path. After installing some optional
-libraries, corresponding build option will be set automatically (for instance, glog, gtest and gflags).
-If still not found, you can manually set it based on CMake error information from your screen.
-
-As a simple example, consider the following:
-
-- **Only CPU**
-
-  ```bash
-  cmake .. -DWITH_GPU=OFF
-  ```
-- **GPU**
-
-  ```bash
-  cmake .. -DWITH_GPU=ON
-  ```
-
-- **GPU with doc and swig**
-
-  ```bash
-  cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON
-  ```
-
-Finally, you can build PaddlePaddle:
-
-```bash
-# you can add build option here, such as:
-cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<path to install>
-# please use sudo make install, if you want to install PaddlePaddle into the system
-make -j `sysctl -n hw.ncpu` && make install
-# set PaddlePaddle installation path in ~/.bashrc
-export PATH=<path to install>/bin:$PATH
-```
-**Note:**
-
-If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed.
-Otherwise, PaddlePaddle will automatically install python dependencies
-at first time when user run paddle commands, such as `paddle version`, `paddle train`.
-It may require sudo privileges:
-
-```bash
-# you can run
-sudo pip install /opt/paddle/share/wheels/*.whl
-# or just run
-sudo paddle version
-```
diff --git a/doc_cn/demo/quick_start/index.md b/doc_cn/demo/quick_start/index.md
index aa6b66ca8c02411016420bf9d99c5e1b4e3cefdd..4d9b24ba851a7aaaeb0d79bfbeb0703b8878b77f 100644
--- a/doc_cn/demo/quick_start/index.md
+++ b/doc_cn/demo/quick_start/index.md
@@ -134,9 +134,8 @@ define_py_data_sources2(train_list='data/train.list',
 * obj="process": 指定生成数据的函数
 * args={"dictionary": word_dict}: 额外的参数,这里指定词典
 
-更详细用例请参考文档Python Use Case,
-数据格式和详细文档请参考
-PyDataProviderWrapper。
+更详细数据格式和用例请参考
+PyDataProvider2。
 
 ## 网络结构(Network Architecture)
 本节我们将专注于网络结构的介绍。
diff --git a/doc_cn/faq/index.rst b/doc_cn/faq/index.rst
index db28b4436fe5e76882861a4cf06f358a63d8ebd4..3eb0e10ae2228740cd384270db5070e367f7007b 100644
--- a/doc_cn/faq/index.rst
+++ b/doc_cn/faq/index.rst
@@ -177,3 +177,40 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字
 
     pip install --upgrade pip
 
+8. python相关的单元测试都过不了
+--------------------------------
+
+如果出现以下python相关的单元测试都过不了的情况:
+
+..  code-block:: bash
+
+    24 - test_PyDataProvider (Failed)
+    26 - test_RecurrentGradientMachine (Failed)
+    27 - test_NetworkCompare (Failed)
+    28 - test_PyDataProvider2 (Failed)
+    32 - test_Prediction (Failed)
+    33 - test_Compare (Failed)
+    34 - test_Trainer (Failed)
+    35 - test_TrainerOnePass (Failed)
+    36 - test_CompareTwoNets (Failed)
+    37 - test_CompareTwoOpts (Failed)
+    38 - test_CompareSparse (Failed)
+    39 - test_recurrent_machine_generation (Failed)
+    40 - test_PyDataProviderWrapper (Failed)
+    41 - test_config_parser (Failed)
+    42 - test_swig_api (Failed)
+    43 - layers_test (Failed)
+
+并且查询PaddlePaddle单元测试的日志,提示:
+
+..  code-block:: bash
+
+    paddle package is already in your PYTHONPATH. But unittest need a clean environment.
+    Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
+
+解决办法是:卸载paddle包 :code:`pip uninstall paddle`。
+
+原因是:单元测试使用了一个旧版本的python包,而没有测试到代码中实际修改的python包。即单元测试需要一个干净的环境:
+
+* 如果paddle包已经在python的site-packages里面了,那么单元测试时使用的paddle包,就是site-packages里面的python包,而不是源码目录里 :code:`/python` 目录下的python包。
+* 即便设置了 :code:`PYTHONPATH` 到 :code:`/python` 也没用,因为python的搜索路径是优先已经安装的python包。
\ No newline at end of file
diff --git a/paddle/api/paddle_api_config.py.in b/paddle/api/paddle_api_config.py.in
index 6531e5ccb3dba39315c7e35191ea1bdf0504d220..a2352250c31efa7ee3c4c8338d95dce5a5b9a511 100644
--- a/paddle/api/paddle_api_config.py.in
+++ b/paddle/api/paddle_api_config.py.in
@@ -1,6 +1,7 @@
 PADDLE_BUILD_DIR="@CMAKE_CURRENT_BINARY_DIR@/../"
 WITH_GPU="@WITH_GPU@"
 PROTOBUF_LIB="@PROTOBUF_LIBRARY@"
+ZLIB_LIB="@ZLIB_LIBRARIES@"
 CMAKE_THREAD_LIB="@CMAKE_THREAD_LIBS_INIT@"
 CMAKE_DL_LIBS="@CMAKE_DL_LIBS@"
 
@@ -15,3 +16,4 @@ GFLAGS_LOCATION="@GFLAGS_LOCATION@"
 CBLAS_LIBRARIES="@CBLAS_LIBS@"
 
 CUDA_LIBRARIES="@CUDA_LIBRARIES@"
+WITH_COVERALLS="@ON_COVERALLS@"
diff --git a/paddle/api/paddle_ld_flags.py b/paddle/api/paddle_ld_flags.py
index bc1afc5898e829bc271b62b702b3743bf7eb782b..05d741f8859ba46893bff49681536d9187a3ed6e 100644
--- a/paddle/api/paddle_ld_flags.py
+++ b/paddle/api/paddle_ld_flags.py
@@ -38,6 +38,7 @@ try:
         self.paddle_build_dir = os.path.abspath(self.paddle_build_dir)
         self.with_gpu = PaddleLDFlag.cmake_bool(WITH_GPU)
         self.protolib = PROTOBUF_LIB
+        self.zlib = ZLIB_LIB
         self.thread = CMAKE_THREAD_LIB
         self.dl_libs = CMAKE_DL_LIBS
         self.with_python = PaddleLDFlag.cmake_bool(WITH_PYTHON)
@@ -47,6 +48,7 @@ try:
         self.glog_libs = LIBGLOG_LIBRARY
 
         self.with_gflags = PaddleLDFlag.cmake_bool(WITH_GFLAGS)
+        self.with_coverage = PaddleLDFlag.cmake_bool(WITH_COVERALLS)
         self.gflags_libs = GFLAGS_LIBRARIES
         self.gflags_location = GFLAGS_LOCATION
         self.cblas_libs = CBLAS_LIBRARIES
@@ -64,7 +66,7 @@ try:
 
     def parent_dir_str(self):
         libdirs = PARENT_LIB_DIRS
-        return " ".join(map(lambda x: "-L" + os.path.join(self.paddle_build_dir, '..', x), 
+        return " ".join(map(lambda x: "-L" + os.path.join(self.paddle_build_dir, '..', x),
                         libdirs))
 
     def libs_str(self):
@@ -82,6 +84,7 @@ try:
             "-lpaddle_cuda",
             "-lpaddle_api",
             self.normalize_flag(self.protolib),
+            self.normalize_flag(self.zlib),
             self.normalize_flag(self.thread),
             self.normalize_flag(self.dl_libs),
             self.normalize_flag(self.cblas_libs),
@@ -95,6 +98,8 @@ try:
             libs.append(self.normalize_flag(self.gflags_libs))
         if self.with_gpu:
             libs.append(self.normalize_flag(self.curt))
+        if self.with_coverage:
+            libs.append("-fprofile-arcs")
         return " ".join(filter(lambda l: len(l) != 0, libs))
 
     def normalize_flag(self, cmake_flag):
@@ -131,8 +136,14 @@ try:
                 return False
             else:
                 return True
-
+    def c_flag(self):
+        if self.with_coverage:
+            return ["-fprofile-arcs", "-ftest-coverage", "-O0", "-g"]
+        else:
+            return None
 except ImportError:
     class PaddleLDFlag(object):
         def ldflag_str(self):
             pass
+        def c_flag(self):
+            pass
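A note on the coverage plumbing above: `c_flag()` supplies the compile-time instrumentation flags, while `libs_str()` separately appends `-fprofile-arcs`, because gcc also needs that flag at link time to pull in the gcov runtime. Below is a minimal standalone sketch of the same flags outside Paddle; the file name is hypothetical and the snippet is illustrative, not part of this patch.

```cpp
// cov_demo.cpp (hypothetical name). Build with the flags c_flag() returns:
//   g++ -fprofile-arcs -ftest-coverage -O0 -g cov_demo.cpp -o cov_demo
// Compiling emits cov_demo.gcno (the basic-block graph); running the binary
// writes cov_demo.gcda (execution counts) for gcov/lcov to consume.
#include <iostream>

int main() {
  std::cout << "covered" << std::endl;
  return 0;
}
```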
diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h
index 1fe2774cc5a291dbafb61b50d63553b086512e4d..02fa6bc3ace32ff5ffe51dcce9c49757a990a9b2 100644
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -209,6 +209,15 @@ typedef struct {
 #define HL_FLOAT_MIN 2.2250738585072014e-308
 #endif
 
+
+/**
+ * The maximum input value for exp, used to avoid overflow problem.
+ *
+ * Currently only used for tanh function.
+ */
+#define EXP_MAX_INPUT 40.0
+
+
 /**
  * @brief   DIVUP(x, y) is similar to ceil(x / y).
  * @note    For CUDA, DIVUP will be used to specify
diff --git a/paddle/cuda/src/hl_avx_functions.cc b/paddle/cuda/src/hl_avx_functions.cc
index 2d471206f61f281eebf6939443a2b28470ecf808..08976180fff5b099475b1406b16f967655867e5b 100644
--- a/paddle/cuda/src/hl_avx_functions.cc
+++ b/paddle/cuda/src/hl_avx_functions.cc
@@ -38,7 +38,9 @@ namespace hppl {
     }
 
     __m256 tanh(const __m256 a) {
+        __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
         __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
+        tmp = _mm256_min_ps(tmp, max);
         tmp = exp(tmp);
         return _mm256_sub_ps(
             _mm256_div_ps(_mm256_set1_ps(2.0f),
diff --git a/paddle/cuda/src/hl_cpu_functions.cc b/paddle/cuda/src/hl_cpu_functions.cc
index 3fd6b278d053714a6b6f0fe33831a32e2c64e3ae..b8352c2d537fba5ec9cd3237fe8f3fa9c25cbffe 100644
--- a/paddle/cuda/src/hl_cpu_functions.cc
+++ b/paddle/cuda/src/hl_cpu_functions.cc
@@ -30,7 +30,9 @@ namespace hppl {
     }
 
     real tanh(const real a) {
-        return (2.0 / (1.0 + exp(-2.0*a))) - 1.0;
+        real tmp = -2.0 * a;
+        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+        return (2.0 / (1.0 + exp(tmp))) - 1.0;
     }
 
     real linear(const real a) {
diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc
index 724ea490e8ea9a8b2a1be39f3e0037df6e49882f..f16376ec937d3a397d9e7117de528c304f8403ee 100644
--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -78,6 +78,8 @@ DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched)
 DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched)
 DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched)
 DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched)
+DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched)
+DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetriBatched)
 CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
 
 #undef DYNAMIC_LOAD_CUBLAS_WRAP
diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
index 9ac4d210f6d376639df20800b6782f1f8c03d6aa..a066f80c221ee8ab4383ee6463f7b111984b58ff 100644
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -50,7 +50,7 @@ if(NOT WITH_PYTHON)
 endif()
 
 if(WITH_GPU)
-    add_paddle_culib(paddle_gserver ${GSERVER_SOURCES})
+    cuda_add_library(paddle_gserver ${GSERVER_SOURCES})
 else()
     add_library(paddle_gserver STATIC
         ${GSERVER_SOURCES})
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 4d4e439dc6268998bf3bf7d2f87683714ab94a3f..0c18611f01090ada12a3ac0ff6899209a2827f6e 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1025,7 +1025,7 @@ TEST(Layer, LstmLayer) {
   TestConfig config;
   config.layerConfig.set_type("lstmemory");
   config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_active_type("tanh");
   config.layerConfig.set_active_state_type("sigmoid");
   config.layerConfig.set_active_gate_type("sigmoid");
   config.biasSize = 28;
diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp
index 9b933b153d158bef565c0964232525ba99b8b3d4..1c8497e8c526f84cabf6e0862ea96653f99f64be 100644
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -369,7 +369,7 @@ TEST(Layer, LstmLayer) {
   LayerConfig layerConfig;
   layerConfig.set_type("lstmemory");
   layerConfig.set_active_type("relu");
-  layerConfig.set_active_state_type("sigmoid");
+  layerConfig.set_active_state_type("tanh");
   layerConfig.set_active_gate_type("sigmoid");
 
   layerConfig.add_inputs();
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index 8b888b1ee5e46ec5cac316d9f90095a7e314ae13..d81b99e5441584b21fb023dcae65ccec7dd27996 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -625,7 +625,10 @@ void BaseMatrixT<T>::squareDerivative(BaseMatrixT& b) {
   applyBinary(binary::SquareDerivative<T>(), b);
 }
 
-DEFINE_MATRIX_BINARY_OP(Tanh, b = 2.0 / (1.0 + exp(-2 * a)) - 1.0);
+DEFINE_MATRIX_BINARY_OP(Tanh,
+                        T tmp = -2.0 * a;
+                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
 template<>
 void BaseMatrixT<real>::tanh(BaseMatrixT& b) {
   applyBinary(binary::Tanh<real>(), b);
diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt
index db305812a7c036177022836d877661c8f83e999f..93b1bf46a10078b4ae83efdbf268f64b6da052dc 100644
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -23,7 +23,7 @@ if(NOT WITH_GPU)
     add_library(paddle_math STATIC
         ${MATH_SOURCES})
 else()
-    add_paddle_culib(paddle_math ${MATH_SOURCES})
+    cuda_add_library(paddle_math ${MATH_SOURCES})
 endif()
 
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index f8132066477db3b9762348e9baf7a5112d302fd6..e0b2a2bb5b2cdbd845d9be08a8926f0514398458 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -200,7 +200,10 @@ void vLog1p(const int n, const T* a, T* r) {
       binary::vLog1p<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }
 
-DEFINE_MATRIX_BINARY_OP(vTanh, b = 2.0 / (1.0 + std::exp(-2 * a)) - 1.0);
+DEFINE_MATRIX_BINARY_OP(vTanh,
+                        T tmp = -2.0 * a;
+                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
 template<class T>
 void vTanh(const int n, const T* a, T* r) {
   hl_cpu_apply_binary_op<T, binary::vTanh<T>, 0, 0>(
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 9abcbba67ab7e2ab2f4721bd18e2e134932a8616..4770a7203498dbe251f468f4fbff562d6c86a54b 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -3528,9 +3528,7 @@ void CpuMatrix::tanh(Matrix& output) {
   size_t dim = getWidth();
   CHECK_EQ(output.getHeight(), numSamples);
   CHECK_EQ(output.getWidth(), dim);
-  errno = 0;
   vTanh(numSamples * dim, getData(), output.getData());
-  CHECK_EQ(errno, 0) << "vTanh error";
 }
 
 void CpuMatrix::tanhDerivative(Matrix& output) {
@@ -3552,10 +3550,8 @@ void CpuMatrix::softrelu(Matrix& output) {
       out[j] = x;
     }
   }
-  errno = 0;
   vExp(numSamples * dim, output.getData(), output.getData());
   vLog1p(numSamples * dim, output.getData(), output.getData());
-  CHECK_EQ(errno, 0) << "vExp+vLog1p error";
 }
 
 void CpuMatrix::softreluDerivative(Matrix& output) {
@@ -3570,9 +3566,7 @@ void CpuMatrix::softreluDerivative(Matrix& output) {
   MatrixPtr tmpMat = Matrix::create(numSamples, dim);
   real* tmp = tmpMat->getData();
 
-  errno = 0;
   vExp(size, output.getData(), tmpMat->getData());
-  CHECK_EQ(errno, 0) << "vExp error";
 
   for (size_t i = 0; i < size; ++i) {
     grad[i] *= (1.0 - 1.0 / tmp[i]);
@@ -3595,10 +3589,7 @@ void CpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
     out[i] = p2 * in[i];
   }
 
-  // out = tanh(out)
-  errno = 0;
   vTanh(numSamples * dim, out, out);
-  CHECK_EQ(errno, 0) << "vTanh error";
 
   // out = p1 * out
   for (size_t i = 0; i < numSamples * dim; ++i) {
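The tanh kernels changed above (scalar, AVX, and the matrix operators) all apply the same guard: clamp `-2 * a` to `EXP_MAX_INPUT` (40.0, defined in hl_base.h) before calling exp, since exp overflows for large arguments even though tanh itself only saturates; the diff also drops the errno spot checks in Matrix.cpp in favor of the trap-based test added below. A standalone scalar sketch, illustrative only and not part of the patch:

```cpp
// Illustrative model of the clamped tanh used in this patch.
#include <cmath>
#include <cstdio>

const double EXP_MAX_INPUT = 40.0;  // same constant the patch adds to hl_base.h

// tanh(a) = 2 / (1 + exp(-2a)) - 1. For a << 0 the argument -2a grows without
// bound and exp overflows (raising FE_OVERFLOW), even though tanh itself only
// saturates at -1. Clamping at 40 removes the overflow without changing the
// rounded result.
double clampedTanh(double a) {
  double tmp = -2.0 * a;
  tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
  return 2.0 / (1.0 + std::exp(tmp)) - 1.0;
}

int main() {
  std::printf("%.17g\n", clampedTanh(-90.0));  // -1, no FE_OVERFLOW raised
  std::printf("%.17g\n", clampedTanh(0.5));    // 0.46211715726000974
  return 0;
}
```

In the worst case the clamp computes `2 / (1 + e^40) - 1`, which differs from -1 by roughly 8.5e-18 and therefore rounds to -1.0 in both single and double precision.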
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index eb72f11e1c6538cd2c66bc56dbc8686a942bd308..247be983ba3296383c8e2f30f1036859ecfde492 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -13,3 +13,4 @@ add_simple_unittest(test_sparseMatrixCompare)
 add_simple_unittest(test_perturbation)
 add_simple_unittest(test_CpuGpuVector)
 add_simple_unittest(test_Allocator)
+add_simple_unittest(test_FPException)
diff --git a/paddle/math/tests/test_FPException.cpp b/paddle/math/tests/test_FPException.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..174278c2aaac4575a6ea0b219bf7a389db712703
--- /dev/null
+++ b/paddle/math/tests/test_FPException.cpp
@@ -0,0 +1,94 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+
+/**
+ * This test is about floating point calculation exceptions.
+ * Paddle catches FE_INVALID, FE_DIVBYZERO and FE_OVERFLOW exceptions.
+ *
+ * Some exceptions occur in the middle of a set of formulas
+ * and can be circumvented by some tricks.
+ * For example,
+ * calculate tanh
+ * b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
+ *
+ * If the result of (-2 * a) is too large,
+ * a FE_OVERFLOW exception occurs when calculating exp.
+ * But the result of tanh has no overflow problem,
+ * so we can add some tricks to prevent exp from calculating an
+ * excessive value.
+ */
+#include <fenv.h>
+#include <gtest/gtest.h>
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/Excepts.h"
+
+using namespace paddle;  // NOLINT
+
+void SetTensorValue(Matrix& matrix, real value) {
+  int height = matrix.getHeight();
+  int width = matrix.getWidth();
+  int stride = matrix.getStride();
+  real* data = matrix.getData();
+  for (int i = 0; i < height; i++) {
+    int j = rand() % width;  // NOLINT
+    if (typeid(matrix) == typeid(CpuMatrix)) {
+      data[i * stride + j] = value;
+    } else if (typeid(matrix) == typeid(GpuMatrix)) {
+      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
+    } else {
+      LOG(FATAL) << "should not reach here";
+    }
+  }
+}
+
+template <class T>
+void testTanh(real illegal) {
+  MatrixPtr A = std::make_shared<T>(10, 10);
+  MatrixPtr B = std::make_shared<T>(10, 10);
+  A->randomizeUniform();
+  B->randomizeUniform();
+
+  SetTensorValue(*A, illegal);
+
+  A->tanh(*B);
+}
+
+template <class T>
+void testSigmoid(real illegal) {
+  MatrixPtr A = std::make_shared<T>(10, 10);
+  MatrixPtr B = std::make_shared<T>(10, 10);
+  A->randomizeUniform();
+  B->randomizeUniform();
+
+  SetTensorValue(*A, illegal);
+
+  A->sigmoid(*B);
+}
+
+TEST(fp, overflow) {
+  for (auto illegal : {-90.0, 90.0}) {
+    LOG(INFO) << " illegal=" << illegal;
+    testTanh<CpuMatrix>(illegal);
+    testSigmoid<CpuMatrix>(illegal);
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+
+  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
+  return RUN_ALL_TESTS();
+}
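The new test only catches regressions if floating point exceptions actually trap, which its `main()` enables via `feenableexcept`. Below is a minimal sketch of that mechanism, assuming glibc's `feenableexcept` on Linux (paddle/utils/Excepts.h presumably shims it on other platforms) and single-precision arithmetic, Paddle's default `real`; in double precision exp(180) does not overflow:

```cpp
// Standalone sketch of the trap mechanism, not part of the patch.
#include <cmath>
#include <cstdio>
#include <fenv.h>

int main() {
  // Turn FE_OVERFLOW into a SIGFPE trap, as the test's main() does for
  // FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW. From here on, an overflowing
  // exp aborts the process instead of quietly returning +inf.
  feenableexcept(FE_OVERFLOW);

  // With the EXP_MAX_INPUT clamp, exp never sees an argument above 40:
  std::printf("expf(40) = %g\n", std::exp(40.0f));  // ~2.35e17, no trap

  // The old tanh fed exp the value -2 * a; for the test's illegal input
  // a = -90 that is 180, which overflows single precision:
  // std::exp(180.0f);  // traps with SIGFPE if uncommented
  return 0;
}
```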
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
index 54e3320c8c1584d0f41e8507c846b17f7c85d09c..242fd982aa0015bfe9cb910c52afc3b42ab1028b 100755
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -3,6 +3,8 @@
 source ./common.sh
 
 CMAKE_EXTRA=""
 if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
   CMAKE_EXTRA="-DPYTHON_LIBRARY=/usr/local/Cellar/python/2.7.12_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib"
+else
+  CMAKE_EXTRA="-DWITH_SWIG_PY=ON"
 fi
diff --git a/paddle/setup.py.in b/paddle/setup.py.in
index 3341dd6f95969fcd8df5b6049b0b8d2d5905a43f..1a15eafd5528a68aa9a68ed020de6decb61bd2a7 100644
--- a/paddle/setup.py.in
+++ b/paddle/setup.py.in
@@ -31,8 +31,8 @@ is_lin = (system == 'linux')
 # because generate paddle LDFLAGS is too complicated to do in setup.py
 # it just read COMAKE generated LDFLAGS.
 extra_links = []
-ldflags = api.paddle_ld_flags.PaddleLDFlag()
-ldflags = ldflags.ldflag_str()
+obj = api.paddle_ld_flags.PaddleLDFlag()
+ldflags = obj.ldflag_str()
 if ldflags is not None:
   extra_links.extend(ldflags.split(" "))
 
@@ -51,13 +51,20 @@ elif is_osx == True:
 
 include_dirs = [np.get_include(), "../"]    # include numpy and paddle.
 
+extra_c = obj.c_flag()
+
+attr=dict()
+if extra_c is not None:
+  attr["extra_compile_args"] = extra_c
+
 setup(name="py_paddle",
   version="@PADDLE_VERSION@",
   ext_modules=[
     Extension('py_paddle._swig_paddle',            # Build SWIG Extension.
       ['Paddle_wrap.cxx'],
       include_dirs = include_dirs,
-      extra_link_args = extra_links
+      extra_link_args = extra_links,
+      **attr
     )
   ],
   packages=['py_paddle'],