diff --git a/.gitignore b/.gitignore
index 00368ede67d3d2426f50a278578a33d18b736ca0..7e21ba0b750dfc3846e736da04b68781ea2bf46c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,6 @@
 *.DS_Store
 build/
+*.user
+
+.vscode
+.idea
\ No newline at end of file
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 617bd7ea7162b8869e1c41df136f6814bd6f62a9..529b4b9d15d097fbd25b94bc41f9b3ae1a998c83 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -65,12 +65,14 @@ set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas")
 set(OPENBLAS_INCLUDE_SEARCH_PATHS
         ${OPENBLAS_ROOT}/include
         /usr/include
-        /usr/include/openblas)
+        /usr/include/openblas
+        /usr/local/opt/openblas/include)
 set(OPENBLAS_LIB_SEARCH_PATHS
         ${OPENBLAS_ROOT}/lib
         /usr/lib
         /usr/lib/blas/openblas
-        /usr/lib/openblas)
+        /usr/lib/openblas
+        /usr/local/opt/openblas/lib)
 
 find_path(OPENBLAS_INC_DIR NAMES cblas.h
   PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index e2ff923a22923243e50233b1bad28edb672ab11e..e5b59be19369d3ba3e852624426b95ae365e7357 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -15,7 +15,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
     $ENV{CUDNN_ROOT}/lib64
     $ENV{CUDNN_ROOT}/lib
     /usr/lib)
-find_library(CUDNN_LIBRARY NAMES libcudnn.so # libcudnn_static.a
+find_library(CUDNN_LIBRARY NAMES libcudnn.so libcudnn.dylib # libcudnn_static.a
     PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
           NO_DEFAULT_PATH
     DOC "Path to cuDNN library.")
diff --git a/cmake/util.cmake b/cmake/util.cmake
index 5b56304656e3844411c85f1ff946b80e9c909ae5..d776c3ae499526ef52e24c0aeea18ccab71a242b 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -1,16 +1,55 @@
 # Some common routine for paddle compile.
 
-
 # target_circle_link_libraries
 # Link libraries to target which has circle dependencies.
 #
 # First Argument: target name want to be linked with libraries
 # Rest Arguments: libraries which link together.
 function(target_circle_link_libraries TARGET_NAME)
-    target_link_libraries(${TARGET_NAME}
-        -Wl,--start-group
-        ${ARGN}
-        -Wl,--end-group)
+    if(APPLE)
+        set(LIBS)
+        set(inArchive OFF)
+        set(libsInArgn)
+        # Libraries between ARCHIVE_START and ARCHIVE_END get -Wl,-force_load.
+        foreach(arg ${ARGN})
+            if(${arg} STREQUAL "ARCHIVE_START")
+                set(inArchive ON)
+            elseif(${arg} STREQUAL "ARCHIVE_END")
+                set(inArchive OFF)
+            else()
+                if(inArchive)
+                    list(APPEND LIBS "-Wl,-force_load")
+                endif()
+                list(APPEND LIBS ${arg})
+                list(APPEND libsInArgn ${arg})
+            endif()
+        endforeach()
+        if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")  # Clang and AppleClang
+            list(APPEND LIBS "-undefined dynamic_lookup")
+        endif()
+        list(REVERSE libsInArgn)
+        target_link_libraries(${TARGET_NAME}
+            ${LIBS}
+            ${libsInArgn})
+
+    else()  # LINUX
+        set(LIBS)
+        # ARCHIVE_START/ARCHIVE_END map to --whole-archive/--no-whole-archive.
+        foreach(arg ${ARGN})
+            if(${arg} STREQUAL "ARCHIVE_START")
+                list(APPEND LIBS "-Wl,--whole-archive")
+            elseif(${arg} STREQUAL "ARCHIVE_END")
+                list(APPEND LIBS "-Wl,--no-whole-archive")
+            else()
+                list(APPEND LIBS ${arg})
+            endif()
+        endforeach()
+
+        target_link_libraries(${TARGET_NAME}
+                "-Wl,--start-group"
+                ${LIBS}
+                "-Wl,--end-group")
+    endif()
 endfunction()
 
 # compile_cu_as_cpp
@@ -41,20 +80,20 @@ function(link_paddle_exe TARGET_NAME)
     if(PADDLE_WITH_INTERNAL)
         set(INTERAL_LIBS paddle_internal_gserver paddle_internal_parameter)
         target_circle_link_libraries(${TARGET_NAME}
-            -Wl,--whole-archive
+            ARCHIVE_START
             paddle_internal_gserver
             paddle_internal_owlqn
-            -Wl,--no-whole-archive
+            ARCHIVE_END
             paddle_internal_parameter)
     else()
         set(INTERAL_LIBS "")
     endif()
 
     target_circle_link_libraries(${TARGET_NAME}
-        -Wl,--whole-archive
+        ARCHIVE_START
         paddle_gserver
         ${METRIC_LIBS}
-        -Wl,--no-whole-archive
+        ARCHIVE_END
         paddle_pserver
         paddle_trainer_lib
         paddle_network
diff --git a/doc/build/build_from_source.md b/doc/build/build_from_source.md
index a191d31318aa67e6f4de75d81c031aeb729311c9..a6090d68191625d86d8103c9cf96832535cbcf73 100644
--- a/doc/build/build_from_source.md
+++ b/doc/build/build_from_source.md
@@ -1,141 +1,306 @@
-Build and Install
+Installing from Sources
 =================
 
-## Requirement
+* [1. Download and Setup](#download)
+* [2. Requirements](#requirements)
+* [3. Build on Ubuntu](#ubuntu)
+* [4. Build on Mac OS X](#mac)
 
-### Dependents
+## <span id="download">Download and Setup</span> 
+You can download PaddlePaddle from the [github source](https://github.com/baidu/Paddle).
 
-- **CMake**: required for 2.8+ version
-- **g++**: a recent c++ compiler supporting c++11, >= 4.6, < 5
-- **BLAS library**: such as openBLAS, MKL, ATLAS
-- **protobuf**: required for 2.4+ version, 3.x is not supported
-- **python**: currently only 2.7 version is supported
+```bash
+git clone https://github.com/baidu/Paddle paddle
+```
+
+## <span id="requirements">Requirements</span>
+
+To compile the source code, your computer must be equipped with GCC >=4.6 or Clang Compiler.
+### Dependencies
+
+- **CMake**: version >= 2.8
+- **BLAS**: MKL, OpenBlas or ATLAS
+- **protobuf**: version >= 2.4, **Note: 3.x is not supported**
+- **python**: only python 2.7 is supported currently
+
+### Options
+
+PaddlePaddle supports some build options. To enable it, first you need to install the related libraries. 
+
+    Optional            | Description
+    ------------        | :-----------
+    **WITH_GPU**        | Compile with GPU mode.
+    **WITH_DOUBLE**     | Compile with double precision floating-point, default: single precision. |
+    **WITH_GLOG**       | Compile with glog. If not found, default: an internal log implementation.
+    **WITH_GFLAGS**     | Compile with gflags. If not found, default: an internal flag implementation.
+    **WITH_TESTING**    | Compile with gtest for PaddlePaddle's unit testing. 
+    **WITH_DOC**        | Compile to generate PaddlePaddle's docs, default: disabled (OFF).
+    **WITH_SWIG_PY**    | Compile with python predict API, default: disabled (OFF).
+    **WITH_STYLE_CHECK**| Compile with code style check, default: enabled (ON).
+|
 
-### Optional
+**Note:**
+  - The GPU version works best with Cuda Toolkit 7.5 and cuDNN v5.
+  - Other versions like Cuda Toolkit 6.5, 7.0, 8.0 and cuDNN v2, v3, v4 are also supported.
+  - **To utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa.**
 
-PaddlePaddle also support some build options, you have to install related libraries. 
+As a simple example, consider the following:  
 
-- **WITH_GPU**: Compile with gpu mode
-  - The GPU version works best with Cuda Toolkit 7.5 and cuDNN v5
-  - Other versions Cuda Toolkit 6.5, 7.0 and cuDNN v2, v3, v4 are also supported
-  - Note: to utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa
-- **WITH_DOUBLE**: Compile with double precision, otherwise use single precision 
-- **WITH_GLOG**: Compile with glog, otherwise use a log implement internally
-- **WITH_GFLAGS**: Compile with gflags, otherwise use a flag implement internally
-- **WITH_TESTING**: Compile with gtest and run unittest for PaddlePaddle 
-- **WITH_DOC**: Compile with documentation
-- **WITH_SWIG_PY**: Compile with python predict api
-- **WITH_STYLE_CHECK**: Style check for source code
+1. **Python Dependencies(optional)**
+  
+    To compile PaddlePaddle with the python predict API, make sure swig is installed and set `-DWITH_SWIG_PY=ON` as follows:
+
+    ```bash
+    # install swig on ubuntu
+    sudo apt-get install swig
+    # install swig on Mac OS X
+    brew install swig
+
+    # activate swig in cmake
+    cmake .. -DWITH_SWIG_PY=ON
+    ```
+
+2. **Doc Dependencies(optional)**
+
+    To generate PaddlePaddle's documentation, install dependencies and set `-DWITH_DOC=ON` as follows:
 
+    ```bash
+    pip install 'sphinx>=1.4.0'
+    pip install sphinx_rtd_theme breathe recommonmark
 
-## Building on Ubuntu14.04
+    # install doxygen on Ubuntu
+    sudo apt-get install doxygen 
+    # install doxygen on Mac OS X
+    brew install doxygen
+
+    # activate docs in cmake
+    cmake .. -DWITH_DOC=ON
+    ```
+
+## <span id="ubuntu">Build on Ubuntu 14.04</span>
 
 ### Install Dependencies
 
 - **CPU Dependencies**
 
-```bash
-# necessary
-sudo apt-get update
-sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
-# optional
-sudo apt-get install libgoogle-glog-dev
-sudo apt-get install libgflags-dev
-sudo apt-get install libgtest-dev
-sudo pip install wheel
-pushd /usr/src/gtest
-cmake .
-make
-sudo cp *.a /usr/lib
-popd
-```
-    
+    ```bash
+    # necessary
+    sudo apt-get update
+    sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
+    # optional
+    sudo apt-get install libgoogle-glog-dev
+    sudo apt-get install libgflags-dev
+    sudo apt-get install libgtest-dev
+    sudo pip install wheel
+    pushd /usr/src/gtest
+    cmake .
+    make
+    sudo cp *.a /usr/lib
+    popd
+    ```
   
-- **GPU Dependencies(optional)**
+- **GPU Dependencies (optional)**
 
-If you need to build GPU version, the first thing you need is a machine that has GPU and CUDA installed.
-And you also need to install cuDNN.
+    To build GPU version, you will need the following installed:
 
-You can download CUDA toolkit and cuDNN from nvidia website:
-    
-```bash
-https://developer.nvidia.com/cuda-downloads
-https://developer.nvidia.com/cudnn
-```
-You can copy cuDNN files into the CUDA toolkit directory, such as:
+        1. a CUDA-capable GPU
+        2. A supported version of Linux with a gcc compiler and toolchain
+        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
+        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
+
+    The CUDA development environment relies on tight integration with the host development environment,
+    including the host compiler and C runtime libraries, and is therefore only supported on
+    distribution versions that have been qualified for this CUDA Toolkit release.
+        
+    After downloading cuDNN library, issue the following commands:
+
+    ```bash
+    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
+    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
+    ```
+    Then you need to set LD\_LIBRARY\_PATH, CUDA\_HOME and PATH environment variables in ~/.bashrc.
+
+    ```bash
+    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+    export CUDA_HOME=/usr/local/cuda
+    export PATH=/usr/local/cuda/bin:$PATH
+    ```
+
+### Build and Install
+
+As usual, the best option is to create build folder under paddle project directory.
 
 ```bash
-sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
-sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
+mkdir build && cd build
+cmake ..
 ```
-Then you need to set LD\_LIBRARY\_PATH, CUDA\_HOME and PATH environment variables in ~/.bashrc.
+
+CMake first checks for PaddlePaddle's dependencies in the system default paths. After you install optional
+libraries (for instance, glog, gtest and gflags), the corresponding build options are set automatically.
+If a library is still not found, you can set it manually based on the CMake error information on your screen.
+
+As a simple example, consider the following:
+
+- **Only CPU**
+
+  ```bash
+  cmake  .. -DWITH_GPU=OFF -DWITH_DOC=OFF
+  ```
+- **GPU**
+
+  ```bash
+  cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF
+  ```
+
+- **GPU with doc and swig**
+
+  ```bash
+  cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON
+  ``` 
+
+Finally, you can build PaddlePaddle:
 
 ```bash
-export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-export CUDA_HOME=/usr/local/cuda
-export PATH=/usr/local/cuda/bin:$PATH
+# you can add build option here, such as:    
+cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX=<path to install>
+# please use sudo make install, if you want
+# to install PaddlePaddle into the system
+make -j `nproc` && make install
+# set PaddlePaddle installation path in ~/.bashrc
+export PATH=<path to install>/bin:$PATH
 ```
-- **Python Dependencies(optional)**
 
-If you want to compile PaddlePaddle with python predict api, you need to add -DWITH_SWIG_PY=ON in cmake command and install these first:
+**Note:**
+
+If you set `WITH_SWIG_PY=ON`, the related python dependencies also need to be installed.
+Otherwise, PaddlePaddle will automatically install the python dependencies
+the first time a user runs a paddle command, such as `paddle version` or `paddle train`.
+This may require sudo privileges:
 
 ```bash
-sudo apt-get install swig
+# you can run
+sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
+# or just run 
+sudo paddle version
 ```
 
-- **Doc Dependencies(optional)**
+## <span id="mac">Building on Mac OS X</span>
 
-If you want to compile PaddlePaddle with doc, you need to add -DWITH_DOC=ON in cmake command and install these first:
+### Prerequisites
+This guide is based on Mac OS X 10.11 (El Capitan). Note that if you are running an up to date version of OS X, 
+you will already have Python 2.7.10 and Numpy 1.8 installed.
+
+The best option is to use the package manager homebrew to handle installations and upgrades for you.
+To install [homebrew](http://brew.sh/), first open a terminal window (you can find Terminal in the Utilities folder in Applications), and issue the command:
 
 ```bash
-pip install 'sphinx>=1.4.0'
-pip install sphinx_rtd_theme breathe recommonmark
-sudo apt-get install doxygen 
+# install brew
+/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
+# install pip
+easy_install pip
 ```
 
-### Build and Install
+### Install Dependencies
 
-CMake will find dependent libraries in system default paths first. After installing some optional libraries, corresponding build option will automatically be on(such as glog, gtest and gflags). And if libraries are not found, you have to set following variables manually in cmake command(CUDNN_ROOT, ATLAS_ROOT, MKL_ROOT, OPENBLAS_ROOT).
+- **CPU Dependencies**
 
-Here are some examples of cmake command with different options:
+  ```bash
+  # Install fundamental dependencies
+  brew install glog gflags cmake protobuf openblas
+
+  # Install google test on Mac OS X
+  # Download gtest 1.7.0
+  wget https://github.com/google/googletest/archive/release-1.7.0.tar.gz
+  tar -xvf release-1.7.0.tar.gz && cd googletest-release-1.7.0
+  # Build gtest
+  mkdir build && cd build && cmake ..
+  make
+  # Install gtest library
+  sudo cp -r ../include/gtest /usr/local/include/
+  sudo cp lib*.a /usr/local/lib
+  ```
 
-**only cpu**
+- **GPU Dependencies(optional)**
 
-```bash
-cmake -DWITH_GPU=OFF -DWITH_DOC=OFF
-```
+    To build GPU version, you will need the following installed:
+
+        1. a CUDA-capable GPU
+        2. Mac OS X 10.11 or later
+        3. the Clang compiler and toolchain installed using Xcode
+        4. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
+        5. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
+
+    The CUDA development environment relies on tight integration with the host development environment,
+    including the host compiler and C runtime libraries, and is therefore only supported on
+    distribution versions that have been qualified for this CUDA Toolkit release.
+        
+    1. After downloading cuDNN library, issue the following commands:
+
+        ```bash
+        sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local
+        sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
+        ```
+    2. Then you need to set DYLD\_LIBRARY\_PATH, CUDA\_HOME and PATH environment variables in ~/.bashrc.
 
-**gpu**
+        ```bash
+        export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH
+        export PATH=/usr/local/cuda/bin:$PATH
+        ```
+
+### Build and Install
+
+As usual, the best option is to create build folder under paddle project directory.
 
 ```bash
-cmake -DWITH_GPU=ON -DWITH_DOC=OFF
+mkdir build && cd build
+cmake ..
 ```
 
-**gpu with doc and swig**
+CMake first checks for PaddlePaddle's dependencies in the system default paths. After you install optional
+libraries (for instance, glog, gtest and gflags), the corresponding build options are set automatically.
+If a library is still not found, you can set it manually based on the CMake error information on your screen.
 
-```bash
-cmake -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON
-``` 
+As a simple example, consider the following:
 
-Finally, you can download source code and build:
+- **Only CPU**
+
+  ```bash
+  cmake  .. -DWITH_GPU=OFF -DWITH_DOC=OFF
+  ```
+- **GPU**
+
+  ```bash
+  cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF
+  ```
+
+- **GPU with doc and swig**
+
+  ```bash
+  cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON
+  ``` 
+
+Finally, you can build PaddlePaddle:
 
 ```bash
-git clone https://github.com/baidu/Paddle paddle
-cd paddle
-mkdir build
-cd build
 # you can add build option here, such as:    
-cmake -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX=<path to install> ..
-# please use sudo make install, if you want
-# to install PaddlePaddle into the system
+cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX=<installation path>
+# please use sudo make install, if you want to install PaddlePaddle into the system
 make -j `nproc` && make install
-# PaddlePaddle installation path
-export PATH=<path to install>/bin:$PATH
+# set PaddlePaddle installation path in ~/.bashrc
+export PATH=<installation path>/bin:$PATH
 ```
-**Note**
 
-And if you set WITH_SWIG_PY=ON, you have to install related python predict api at the same time:
+
+**Note:**
+
+If you set `WITH_SWIG_PY=ON`, the related python dependencies also need to be installed.
+Otherwise, PaddlePaddle will automatically install the python dependencies
+the first time a user runs a paddle command, such as `paddle version` or `paddle train`.
+This may require sudo privileges:
 
 ```bash
-pip install <path to install>/opt/paddle/share/wheels/*.whl
-```
+# you can run
+sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
+# or just run 
+sudo paddle version
+```
\ No newline at end of file
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index 79487c4cf4d41f416e9269580bd7629947ca6cd6..b3140617af188b6a80067d9dbd312bd9e9155adf 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/utils/GlobalConstants.h"
+#include "paddle/utils/TypeDefs.h"
 
 /// Import PaddlePaddle's enumeration into global namespace.
 using namespace paddle::enumeration_wrapper;  // NOLINT
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index 4e655c324a1ede19dacab9237d62fcfcaf70d64d..8a6741078f2f19d8c3cb081f129447d6fc5801c9 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/utils/Util.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Flags.h"
+#include "paddle/utils/Excepts.h"
 #include "paddle/parameter/Parameter.h"
 
 #include <fenv.h>
diff --git a/paddle/api/paddle_ld_flags.py b/paddle/api/paddle_ld_flags.py
index 21b4ca1dd61713d59e262766e57c70933a363477..bc1afc5898e829bc271b62b702b3743bf7eb782b 100644
--- a/paddle/api/paddle_ld_flags.py
+++ b/paddle/api/paddle_ld_flags.py
@@ -15,6 +15,19 @@
 try:
     from paddle_api_config import *
     import os.path
+    import platform
+
+    system = platform.system().lower()
+    is_osx = (system == 'darwin')
+    is_win = (system == 'windows')
+    is_lin = (system == 'linux')
+    # Linux gets the whole-archive pair; every other platform gets empty markers.
+    if is_lin:
+        whole_start = "-Wl,--whole-archive"
+        whole_end = "-Wl,--no-whole-archive"
+    else:
+        whole_start = ""
+        whole_end = ""
 
     LIB_DIRS = ["math", 'utils', 'parameter', "gserver", "api", "cuda", "pserver", "trainer"]
     PARENT_LIB_DIRS = ['proto']
@@ -56,9 +69,9 @@ try:
 
         def libs_str(self):
             libs = [
-                "-Wl,--whole-archive",
+                whole_start,
                 "-lpaddle_gserver",
-                "-Wl,--no-whole-archive",
+                whole_end,
                 "-lpaddle_pserver",
                 "-lpaddle_trainer_lib",
                 "-lpaddle_network",
diff --git a/paddle/cuda/include/hl_device_functions.cuh b/paddle/cuda/include/hl_device_functions.cuh
index 27e3f450c5c1c987f4b3fbc2eb1e52a1696c7320..88d950d6c17132d1d9969d0f3766395377e2de96 100755
--- a/paddle/cuda/include/hl_device_functions.cuh
+++ b/paddle/cuda/include/hl_device_functions.cuh
@@ -16,28 +16,37 @@ limitations under the License. */
 #ifndef HL_DEVICE_FUNCTIONS_CUH_
 #define HL_DEVICE_FUNCTIONS_CUH_
 
-namespace hppl {
-
-static __inline__ __device__ double atomicAdd(double* address, double val) {
-    // NOLINTNEXTLINE
-    unsigned long long int* address_as_ull = (unsigned long long int*)address;
-    unsigned long long int old = *address_as_ull, assumed; // NOLINT
-
-    do {
-        assumed = old;
-        old = atomicCAS(address_as_ull,
-                        assumed,
-                        __double_as_longlong(val +
-                        __longlong_as_double(assumed)));
-    } while (assumed != old);
-
-    return __longlong_as_double(old);
-}
+namespace paddle {
+// Atomic add for device code; declared here, specialized for float and double.
+template <class T>
+inline __device__ T paddleAtomicAdd(T* address, T val);
 
-}  // namespace hppl
+template <>
+inline __device__ float paddleAtomicAdd(float* address, float val) {
+  return atomicAdd(address, val);
+}
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
-using hppl::atomicAdd;
+template <>
+inline __device__ double paddleAtomicAdd(double* address, double val) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+  return atomicAdd(address, val);
+#else
+  // NOLINTNEXTLINE
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old = *address_as_ull, assumed; // NOLINT
+  // Compare-and-swap on the 64-bit pattern; retry while the swap did not match.
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull,
+                    assumed,
+                    __double_as_longlong(val +
+                    __longlong_as_double(assumed)));
+  } while (assumed != old);
+
+  return __longlong_as_double(old);
 #endif
+}
+}  // namespace paddle
+
 
 #endif /* HL_DEVICE_FUNCTIONS_CUH_ */
diff --git a/paddle/cuda/include/hl_gpu_lstm.cuh b/paddle/cuda/include/hl_gpu_lstm.cuh
index 2ca33f2b13a1f7c0863275be2993583996f70e50..07806e11c18a2b47d79237587a0e882d7bf2a1d2 100644
--- a/paddle/cuda/include/hl_gpu_lstm.cuh
+++ b/paddle/cuda/include/hl_gpu_lstm.cuh
@@ -192,10 +192,10 @@ __global__ void KeLstmBackward(Op op,
 
   if (isBatch) {
     if (value.prevStateValue) {
-      if (grad.checkIgGrad) atomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
-      if (grad.checkFgGrad) atomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
+      if (grad.checkIgGrad) paddle::paddleAtomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
+      if (grad.checkFgGrad) paddle::paddleAtomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
     }
-    if (grad.checkOgGrad) atomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
+    if (grad.checkOgGrad) paddle::paddleAtomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
   } else {
     if (value.prevStateValue) {
       if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;
diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/cuda/include/hl_matrix_type.cuh
index 85b60cc313fa78b1efcfbb786c1a63f14d1102aa..6917f3629014115b264966d207b9111a66efa1d1 100644
--- a/paddle/cuda/include/hl_matrix_type.cuh
+++ b/paddle/cuda/include/hl_matrix_type.cuh
@@ -27,6 +27,8 @@ typedef float4 vecType;
 typedef double2 vecType;
 #endif
 #else
+#include <mmintrin.h>
+#include <xmmintrin.h>
 #include <emmintrin.h>
 #ifndef HPPL_TYPE_DOUBLE
 typedef __m128  vecType;
diff --git a/paddle/cuda/include/hl_sse_matrix_kernel.cuh b/paddle/cuda/include/hl_sse_matrix_kernel.cuh
index d774150c21e61716e7cee7eea7ad31c7f802c5a6..c90d49e4adeb5ee7c03a36d8963c0bd7eef56e1a 100644
--- a/paddle/cuda/include/hl_sse_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_sse_matrix_kernel.cuh
@@ -25,6 +25,9 @@ limitations under the License. */
 #define     VECTOR_LEN      4
 #define     VECTOR_SET      _mm_set_ps1
 #else
+#if   defined(__APPLE__) || defined(__OSX__)
+#define     _mm_set_pd1     _mm_set1_pd
+#endif
 /* number of double in vector */
 #define     VECTOR_LEN      2
 #define     VECTOR_SET      _mm_set_pd1
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index f07538d6ba71334109ef2e2dc572613fbd3cca4e..acd8e2fe6afb4c5642979a6d31f6ec8bbc9b6daa 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -209,7 +209,18 @@ __thread cudaStream_t default_stream = 0;
 __thread bool g_sync_flag = true;
 bool hl_start_flag = false;
 
-#define gettid() syscall(SYS_gettid)
+// Returns the OS-level id of the calling thread; CHECK-fails if it is -1.
+inline pid_t gettid() {
+#if defined(__APPLE__) || defined(__OSX__)
+  pid_t tid = syscall(SYS_thread_selfid);
+#else
+  // Use the libc-provided SYS_gettid; a hard-coded fallback number is wrong
+  // on some architectures (224 is the i386 value, x86_64 uses 186).
+  pid_t tid = syscall(SYS_gettid);
+#endif
+  CHECK_NE(tid, -1);
+  return tid;
+}
 
 void hl_init(int device) {
   CHECK(hl_start_flag)
diff --git a/paddle/cuda/src/hl_cuda_lstm.cu b/paddle/cuda/src/hl_cuda_lstm.cu
index 64699c9f6d45040a05c9d8a1371f1afa4cfb43e3..cf009620bf69d05397c5e03de3f7f2856bf4ff6b 100644
--- a/paddle/cuda/src/hl_cuda_lstm.cu
+++ b/paddle/cuda/src/hl_cuda_lstm.cu
@@ -564,11 +564,11 @@ __global__ void KeLstmBackward(real *gateValue,
 
   /* TODO: Temporary save & merger in another kernel */
   if (frameIdy == 1) {
-    if (checkIgGrad) atomicAdd(checkIgGrad+frameIdx, rCheckGrad);
+    if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad+frameIdx, rCheckGrad);
   } else if (frameIdy == 2) {
-    if (checkFgGrad) atomicAdd(checkFgGrad+frameIdx, rCheckGrad);
+    if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad+frameIdx, rCheckGrad);
   } else if (frameIdy == 3) {
-    if (checkOgGrad) atomicAdd(checkOgGrad+frameIdx, rCheckGrad);
+    if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad+frameIdx, rCheckGrad);
   }
 }
 
diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu
index ecc44944e4fa19b064fb0aa09d81e2143e5bc85d..38e4f16217c2a4779ca9b1fd76df6276819645ff 100644
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -623,7 +623,7 @@ __global__ void KeCosSimDerivative(real* grad,
         prevGradY[index] +=
           scale * grad[ty] * prevOutX[index] * reciprocal;
       } else {
-        atomicAdd(prevGradY + index,
+        paddle::paddleAtomicAdd(prevGradY + index,
           scale * grad[ty] * prevOutX[index] * reciprocal);
       }
     }
@@ -640,7 +640,7 @@ __global__ void KeCosSimDerivative(real* grad,
           (prevOutX[index] * reciprocalXY -
            prevOutY[index] * reciprocalSquareSumY);
       } else {
-        atomicAdd(prevGradY + index, output[ty] * grad[ty] *
+        paddle::paddleAtomicAdd(prevGradY + index, output[ty] * grad[ty] *
           (prevOutX[index] * reciprocalXY -
            prevOutY[index] * reciprocalSquareSumY));
       }
diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
index f88a2682fd0604c4c33cb6dd7d49d63e058bebc1..e028880156e5b191d032253bc62739c9e5ab34fc 100644
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -362,7 +362,7 @@ __global__ void KeMatrixAddRows(real* output,
         if (AddRow == 0) {
           outputData[i] += tableData[i];
         } else {
-          atomicAdd(&tableData[i], outputData[i]);
+          paddle::paddleAtomicAdd(&tableData[i], outputData[i]);
         }
       }
     }
diff --git a/paddle/cuda/src/hl_cuda_sparse.cuh b/paddle/cuda/src/hl_cuda_sparse.cuh
index becb6c66492c1b208e303af959352f1a246349c2..db5c9ce979885a173c8caadc8f6b47836f1771b5 100644
--- a/paddle/cuda/src/hl_cuda_sparse.cuh
+++ b/paddle/cuda/src/hl_cuda_sparse.cuh
@@ -280,7 +280,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
         if (index_n_t < dimN) {
           real tmp;
           tmp = alpha*a_r*b_r[n];
-          atomicAdd(C_d_r, tmp);
+          paddle::paddleAtomicAdd(C_d_r, tmp);
           C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
           index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
         }
@@ -328,7 +328,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
         if (index_n_t < dimN) {
           real tmp;
           tmp = alpha*a_r*b_r[n];
-          atomicAdd(C_d_r, tmp);
+          paddle::paddleAtomicAdd(C_d_r, tmp);
           C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
           index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
         }
@@ -629,7 +629,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d,
       for (int n=0; n < CU_DM_CSR_N; n++) {
         if (index_m_t++ < dimM) {
           tmp = alpha * b_r * a_r[n];
-          atomicAdd(C_d_r, tmp);
+          paddle::paddleAtomicAdd(C_d_r, tmp);
           C_d_r += dimN;
         }
       }
@@ -660,7 +660,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d,
       for (int n=0; n < CU_DM_CSR_N; n++) {
         if (index_m_t++ < dimM) {
           tmp = alpha * b_r * a_r[n];
-          atomicAdd(C_d_r, tmp);
+          paddle::paddleAtomicAdd(C_d_r, tmp);
           C_d_r += dimN;
         }
       }
@@ -912,7 +912,7 @@ __global__ void KeSMatrixCsrColumnSum(real* a_val, real* csr_val,
   for (int idx = gid; idx < dimNNZ; idx += gridDim.x * blockDim.x) {
     int colIdx = csr_col[idx];
     real val = csr_val[idx];
-    atomicAdd(a_val + colIdx, val);
+    paddle::paddleAtomicAdd(a_val + colIdx, val);
   }
 }
 
diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc
index 3558b163b5ae0d94e4f94a6c6165212d0992bf72..eee9984e07326668a49fd2627e361804a6aacd7b 100644
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -69,23 +69,40 @@ static inline void GetDsoHandleWithSearchPath(
 
     CHECK(nullptr != *dso_handle)
       << "For Gpu version of PaddlePaddle, it couldn't find CUDA library: "
-      << dlPath.c_str() << " Please make sure you already specify its path."
-      << "Note: for training data on Cpu using Gpu version of PaddlePaddle,"
-      << "you must specify libcudart.so via LD_LIBRARY_PATH.";
+      << dlPath.c_str() << ". Please make sure you already specify its path. "
+      << "Note: for training data on Cpu using Gpu version of PaddlePaddle, "
+      << "you must specify libcudart via export LD_LIBRARY_PATH for Linux or "
+      << "export DYLD_LIBRARY_PATH for MAC OS.";
 }
 
 void GetCublasDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)  // macOS: load the .dylib build
+    GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
+#else  // other platforms: load the .so build
     GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
+#endif
 }
 
 void GetCudnnDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)  // macOS: load the .dylib build
+    GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
+#else  // other platforms: load the .so build
     GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
+#endif
 }
 
 void GetCudartDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)  // macOS: load the .dylib build
+    GetDsoHandleWithSearchPath("", "libcudart.dylib", dso_handle);
+#else  // other platforms: load the .so build
     GetDsoHandleWithSearchPath("", "libcudart.so", dso_handle);
+#endif
 }
 
 void GetCurandDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)  // macOS: load the .dylib build
+    GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
+#else  // other platforms: load the .so build
     GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
+#endif
 }
diff --git a/paddle/cuda/src/hl_table_apply.cu b/paddle/cuda/src/hl_table_apply.cu
index 05335c5f835fc52d48753d34cca1037e371cddf2..52ee4610edf670dc339e0ece66d58153c0164499 100644
--- a/paddle/cuda/src/hl_table_apply.cu
+++ b/paddle/cuda/src/hl_table_apply.cu
@@ -35,7 +35,7 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
       real *tab = table + tableId * ldt;
       for (int i = idx; i < dim; i += blockDimX) {
         if (AddRow) {
-          atomicAdd(&tab[i], out[i]);
+          paddle::paddleAtomicAdd(&tab[i], out[i]);
         } else {
           out[i] += tab[i];
         }
diff --git a/paddle/gserver/dataproviders/DataProviderGroup.h b/paddle/gserver/dataproviders/DataProviderGroup.h
index decbde6c91758c78513d2d4e644126b37929eb97..0689f90f3e7dd3d3e1df19f3958c821d53e69700 100644
--- a/paddle/gserver/dataproviders/DataProviderGroup.h
+++ b/paddle/gserver/dataproviders/DataProviderGroup.h
@@ -65,7 +65,8 @@ void DataProviderGroup<T>::reset() {
   provider_ = nullptr;
 
   // shuffle file list
-  std::random_shuffle(fileList_.begin(), fileList_.end());
+  std::shuffle(fileList_.begin(), fileList_.end(),
+      ThreadLocalRandomEngine::get());
 
   startLoader();
   DataProvider::reset();
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
index b0c14c85b2d81eac7e8148a223bd53ff6f4ebdf6..344644755f24045443b8cb3ebd08004a4b1cdcb5 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
@@ -374,7 +374,8 @@ void ProtoDataProvider::reset() {
 }
 
 void ProtoDataProvider::shuffle() {
-  std::random_shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end());
+  std::shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end(),
+      ThreadLocalRandomEngine::get());
 }
 
 /*
diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp
index aeefd16063df82ce3e190bdaacdb2b788f5f9b8e..1332c0ab635b6ebec05f25fd77b9703b39227bc1 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider.cpp
@@ -17,6 +17,8 @@ limitations under the License. */
 #include "paddle/utils/PythonUtil.h"
 #include <fenv.h>
 #include "paddle/utils/Util.h"
+#include "paddle/utils/Excepts.h"
+
 
 namespace paddle {
 
@@ -44,7 +46,6 @@ PyDataProvider::PyDataProvider(const DataConfig& config, bool useGpu,
 }
 
 void PyDataProvider::loadData(const std::vector<std::string>& fileList) {
-  int feFlag = fegetexcept();
   VLOG(1) << "module:" << pyModuleName_ << " class:" << pyClassName_;
   classInstance_ =
       createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_);
@@ -55,7 +56,7 @@ void PyDataProvider::loadData(const std::vector<std::string>& fileList) {
   std::string headerInfo =
       std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
   parseHeaderData(headerInfo);
-  feenableexcept(feFlag);
+  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
 }
 
 void PyDataProvider::parseHeaderData(const std::string& headerData) {
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index eb1522a178d48c1d71b5b4a63ce73f65e1167288..3127b4dd9a2fd3a3da26b90100763c4ec2470cae 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -385,17 +385,17 @@ void NeuralNetwork::setOutputGrad(const std::vector<Argument>& args) {
   }
 }
 
-extern NeuralNetwork* newCustomNeuralNetwork(
-    const std::string& name, NeuralNetwork* network) __attribute__((weak));
+extern NeuralNetwork* newCustomNerualNetwork(
+  const std::string& name, NeuralNetwork* network) __attribute__((weak));
 
 NeuralNetwork* NeuralNetwork::newNeuralNetwork(
     const std::string& name,
     NeuralNetwork* rootNetwork) {
-  if (newCustomNeuralNetwork) {
-    return newCustomNeuralNetwork(name, rootNetwork);
-  } else {
-    return new NeuralNetwork(name, rootNetwork);
-  }
+    if (newCustomNerualNetwork) {
+      return newCustomNerualNetwork(name, rootNetwork);
+    } else {
+      return new NeuralNetwork(name, rootNetwork);
+    }
 }
 
 }  // namespace paddle
diff --git a/paddle/gserver/tests/concat_table_a.conf b/paddle/gserver/tests/concat_table_a.conf
index 2e3c518883e20caf2a69e5899598104317625561..a8ff70f883318676b5bd295c217105ca4b98edff 100644
--- a/paddle/gserver/tests/concat_table_a.conf
+++ b/paddle/gserver/tests/concat_table_a.conf
@@ -16,9 +16,9 @@
 
 from paddle.trainer_config_helpers import *
 
-settings(batch_size=1000)
+settings(batch_size=300)
 
-data = data_layer(name ="input", size=100000)
+data = data_layer(name ="input", size=10000)
 
 # emb1 is equal to emb2, note that bias_attr=false 
 # and act=LinearActivation() in default.
diff --git a/paddle/gserver/tests/concat_table_b.conf b/paddle/gserver/tests/concat_table_b.conf
index 6da24a5fbc55c1c3beec719d38f3d4d05b8eb46a..95d7c10f7b0cd66e38f60c282e4f67ebf3b7cafb 100644
--- a/paddle/gserver/tests/concat_table_b.conf
+++ b/paddle/gserver/tests/concat_table_b.conf
@@ -16,9 +16,9 @@
 
 from paddle.trainer_config_helpers import *
 
-settings(batch_size=1000)
+settings(batch_size=300)
 
-data = data_layer(name ="input", size=100000)
+data = data_layer(name ="input", size=10000)
 
 proj1 = table_projection(input=data, size=128)
 
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 5c80eb546cfafb223454487ebf198394cc3a96c5..3150c31e4900c3b09f9f49a19e65b1b8c25d19e6 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -50,7 +50,7 @@ TEST(Operator, dot_mul) {
 TEST(Projection, context) {
   for (auto contextStart : {-5, -3, -1, 0, 3}) {
     for (auto contextLength : {1, 2, 5, 7}) {
-      for (auto batchSize : {1, 2, 5, 20, 100}) {
+      for (auto batchSize : {1, 2, 5, 20, 50}) {
         for (auto trainablePadding : {false, true}) {
           LOG(INFO) << " contextStart=" << contextStart
                     << " contextLength=" << contextLength
diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp
index c5fe31b29187f4a5b429a928d1870a06848691fa..e75e53ab7f431a34798e8a79985f30441005098c 100644
--- a/paddle/gserver/tests/test_PyDataProvider2.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider2.cpp
@@ -321,7 +321,7 @@ TEST(PyDataProvider2, input_order) {
     if (!realBatchSize) {
       break;
     }
-    ASSERT_EQ(batch.getStreams().size(), 2);
+    ASSERT_EQ(batch.getStreams().size(), (size_t)2);
     for (size_t i = 0; i < realBatchSize; ++i) {
       ASSERT_EQ(batch.getStream(0).ids->getData()[i], 0);
       ASSERT_EQ(batch.getStream(1).ids->getData()[i], 1);
diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h
index 36166236e9effdb8171f56289bba7ca99515d1cd..f7aa60380f23eeea91ee852480862f6b19caedec 100644
--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
@@ -16,7 +16,7 @@ limitations under the License. */
 #pragma once
 
 #include <mutex>
-#include <malloc.h>
+#include <stdlib.h>
 #include "hl_gpu.h"
 #include "paddle/utils/Logging.h"
 
@@ -48,9 +48,10 @@ public:
    * @return Pointer to the allocated memory
    */
   virtual void* alloc(size_t size) {
-    void* ptr = memalign(32ul, size);
-    CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
-    return ptr;
+    void* ptr = nullptr;  // POSIX: may be left unmodified when size == 0
+    CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);
+    CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
+    return ptr;
   }
 
   /**
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index fe486c741d6f5dd79962c40af2396c00568c551f..43075977dc9cef1573cf6dd75d9ef577b07d337e 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -23,6 +23,8 @@ extern "C" {
 }
 #endif
 
+#include <cmath>
+
 namespace paddle {
 
 template<class T>
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 1b7f9ac5dac16c167dcc22930c28bc3521162b9b..e351bede724ac3cef941f50b9418af5d438d6f77 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -2514,7 +2514,8 @@ void SharedCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB,
     for (int k = 0; k < blockNum_; ++k) {
       blockSeq.push_back(k);
     }
-    std::random_shuffle(blockSeq.begin(), blockSeq.end());
+    std::shuffle(blockSeq.begin(), blockSeq.end(),
+        ThreadLocalRandomEngine::get());
   }
   std::vector<int>& localBufRows = *localBufRows_;
   int* cols = a->getCols();
diff --git a/paddle/math/PoolAllocator.h b/paddle/math/PoolAllocator.h
index 22af0eb893753490767297329194185c23c69be6..aca8ffb0ab42e10d76dc9fbaad657a8afab316e9 100644
--- a/paddle/math/PoolAllocator.h
+++ b/paddle/math/PoolAllocator.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <mutex>
 #include <vector>
 #include <unordered_map>
+#include <map>
 #include "Allocator.h"
 
 namespace paddle {
diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp
index 9a879a964ec6d04ad1c6cbf1a009b6414a8e23fd..0403c3521cf54d833b32ff0810ba6d29dfc8f3c6 100644
--- a/paddle/math/Storage.cpp
+++ b/paddle/math/Storage.cpp
@@ -25,8 +25,8 @@ namespace paddle {
 // Initialization StorageEngine singleton.
 // Other modules may rely on storage management,
 // so StorageEngine need to be initialized before other modules.
-static InitFunction __init_storage_engine(
-  StorageEngine::singleton, std::numeric_limits<int>::max());
+static InitFunction __init_storage_engine([](){StorageEngine::singleton();},
+                                          std::numeric_limits<int>::max());
 
 StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {
 }
diff --git a/paddle/math/tests/test_SIMDFunctions.cpp b/paddle/math/tests/test_SIMDFunctions.cpp
index 631d0516cf409fda7c20f1d9329328684854aedf..491b0cda7b9e1a13882aee6621e0de984709ae80 100644
--- a/paddle/math/tests/test_SIMDFunctions.cpp
+++ b/paddle/math/tests/test_SIMDFunctions.cpp
@@ -24,7 +24,7 @@ limitations under the License. */
 #include <algorithm>
 #include <memory>
 
-#include <malloc.h>
+#include <stdlib.h>
 #include <time.h>
 
 static constexpr size_t VECTOR_LEN = 3072;
@@ -37,7 +37,9 @@ static std::mt19937 RandomEngine(time(0));
 
 inline static std::unique_ptr<float[]> NewVector(size_t len = VECTOR_LEN,
                                                  size_t align = ALIGN) {
-  return std::unique_ptr<float[]>((float*)memalign(align, len * sizeof(float)));
+  float* ptr;
+  CHECK_EQ(posix_memalign((void**)&ptr, align, len * sizeof(float)), 0);
+  return std::unique_ptr<float[]>(ptr);
 }
 
 inline static std::unique_ptr<float[]> NewRandomVector(size_t len = VECTOR_LEN,
diff --git a/paddle/math/tests/test_matrixUtil.h b/paddle/math/tests/test_matrixUtil.h
index 1310e509877a02301062f58b9f050203dddbe258..fa682164aa8643dd088bd0ece757728e03488b76 100644
--- a/paddle/math/tests/test_matrixUtil.h
+++ b/paddle/math/tests/test_matrixUtil.h
@@ -124,8 +124,8 @@ void checkSMatrixErr(const CpuSparseMatrixPtr& a,
       if (a->getValueType() == FLOAT_VALUE) {
         real aVal = a->getValue()[r];
         real bVal = b->getValue()[r];
-        if (fabs(aVal - bVal) > err) {
-          if ((fabsf(aVal - bVal) / fabsf(aVal)) > (err / 10.0f)) {
+        if (std::abs(aVal - bVal) > err) {
+          if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) {
             LOG(INFO) << "a=" << aVal << "\t" << "b=" << bVal;
             count++;
           }
@@ -141,8 +141,8 @@ void checkSMatrixErr(const CpuSparseMatrixPtr& a,
       if (a->getValueType() == FLOAT_VALUE) {
         real aVal = a->getValue()[r];
         real bVal = b->getValue()[r];
-        if (fabs(aVal - bVal) > err) {
-          if ((fabsf(aVal - bVal) / fabsf(aVal)) > (err / 10.0f)) {
+        if (std::abs(aVal - bVal) > err) {
+          if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) {
             count++;
           }
         }
@@ -173,8 +173,8 @@ void checkMatrixErr(const Matrix& matrix1, const Matrix& matrix2) {
     for (int j = 0; j < width; j++) {
       real a = data1[i * width + j];
       real b = data2[i * width + j];
-      if (fabs(a - b) > err) {
-        if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) {
+      if (std::abs(a - b) > err) {
+        if ((std::abs(a - b) / std::abs(a)) > (err / 10.0f)) {
           count++;
         }
       }
diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp
index 3db96ccf941e3b0687b6e010b21b7f68faca364b..1a22abf7cf80157039f6147293e7648d654e45f7 100644
--- a/paddle/parameter/tests/test_common.cpp
+++ b/paddle/parameter/tests/test_common.cpp
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 
-#include <malloc.h>
+#include <stdlib.h>
 #include <paddle/utils/Util.h>
 
 #include <gtest/gtest.h>
@@ -124,9 +124,13 @@ void CommonTest::test_sgdUpadate(real* gradientBuffer, real* valueBuffer,
 TEST_F(CommonTest, sgdUpdate) {
   const size_t alignHeader[] = {0, 2, 3, 5, 7, 8};
   for (auto& size : sizeVec_) {
-    real* gradientBuffer = (real*)memalign(32, sizeof(real) * size);
-    real* valueBuffer = (real*)memalign(32, sizeof(real) * size);
-    real* momentumBuffer = (real*)memalign(32, sizeof(real) * size);
+    real *gradientBuffer, *valueBuffer, *momentumBuffer;
+    CHECK_EQ(posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size),
+        0);
+    CHECK_EQ(posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size), 0);
+    CHECK_EQ(posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size),
+        0);
+
     for (size_t i = 0; i < size; i++) {
       gradientBuffer[i] = 1.0;
       valueBuffer[i] = 2.0;
diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp
index fb427832fad646b4181cbaa98a29d5ae22970556..ff2875fc702ffbb0675f21433138961c19ff0b86 100644
--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <sys/socket.h>
 #include <netdb.h>
 #include <netinet/in.h>
+#include <netinet/tcp.h>
 #include <fcntl.h>
 
 #include <arpa/inet.h>
@@ -24,7 +25,6 @@ limitations under the License. */
 #include <net/if.h>
 #include <net/if_arp.h>
 #include <sstream>
-#include <linux/tcp.h>
 
 #include "LightNetwork.h"
 #include "paddle/utils/Util.h"
@@ -79,6 +79,7 @@ std::string getIpAddr(std::string &device) {
  * @note adjust some default sock option for better performance
  */
 void setOption(int sockfd) {
+#if !defined(__APPLE__) && !defined(__OSX__)
   int sendSize = FLAGS_sock_send_buf_size;
   int recvSize = FLAGS_sock_recv_buf_size;
   CHECK_GE(
@@ -87,15 +88,19 @@ void setOption(int sockfd) {
   CHECK_GE(
       setsockopt(sockfd, SOL_SOCKET, SO_SNDBUF, &sendSize, sizeof(sendSize)),
       0);
+#endif
+
   if (FLAGS_small_messages) {
     int optval = 1;
     CHECK_GE(
         setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, &optval, sizeof(optval)),
         0);
+#ifdef TCP_QUICKACK
     optval = 1;
     CHECK_GE(
         setsockopt(sockfd, IPPROTO_TCP, TCP_QUICKACK, &optval, sizeof(optval)),
         0);
+#endif
   }
   int reuse = 1;
   CHECK_GE(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)),
@@ -340,17 +345,27 @@ void SocketWorker::run() {
  */
 void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
   struct sockaddr_in serv_addr;
-  struct hostent hostinfo, *server;
-  char buf[1024];  // temp for gethostbyname_r
+  struct hostent *server;
+
   int errRet;      // temp for gethostbyname_r
 
   /// Create a socket point
   int sockfd = socket(AF_INET, SOCK_STREAM, 0);
   PCHECK(sockfd >= 0) << "ERROR opening socket";
-  CHECK_EQ(0, gethostbyname_r(serverAddr.c_str(), &hostinfo, buf, sizeof(buf),
-                              &server, &errRet))
-      << "ERROR, no such host: " << serverAddr << " ret = " << errRet;
-  CHECK(server) << "gethostbyname_r err";
+
+#if defined(__OSX__) || defined(__APPLE__)
+   server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet);
+   // errRet is only set on failure, so check the result pointer, not errRet.
+   CHECK(server) << "ERROR, no such host: " << serverAddr
+                 << " ret = " << errRet;
+#else
+   struct hostent hostinfo;
+   char buf[1024];  // temp for gethostbyname_r
+   CHECK_EQ(0, gethostbyname_r(serverAddr.c_str(), &hostinfo, buf, sizeof(buf),
+                               &server, &errRet))
+       << "ERROR, no such host: " << serverAddr << " ret = " << errRet;
+   CHECK(server) << "gethostbyname_r error!";
+#endif
 
   bzero((char *)&serv_addr, sizeof(serv_addr));
   serv_addr.sin_family = AF_INET;
diff --git a/paddle/pserver/SocketChannel.cpp b/paddle/pserver/SocketChannel.cpp
index 698473060a4c1660b93c10b40053e074d2ac81e9..b9d542a296ddddcd5e00d345b71fa976f0d72388 100644
--- a/paddle/pserver/SocketChannel.cpp
+++ b/paddle/pserver/SocketChannel.cpp
@@ -27,6 +27,15 @@ limitations under the License. */
 
 namespace paddle {
 
+/**
+ * UIO_MAXIOV is documented in writev(2), but <sys/uio.h> only
+ * declares it on osx/ios if defined(KERNEL)
+ */
+#ifndef UIO_MAXIOV
+#define UIO_MAXIOV 512
+#endif
+
+
 SocketChannel::~SocketChannel() {
   if (tcpRdma_ == F_TCP)
     close(tcpSocket_);
@@ -148,8 +157,7 @@ void SocketChannel::writeMessage(const std::vector<struct iovec>& userIovs) {
   std::vector<iovec> iovs;
   iovs.reserve(userIovs.size() + 2);
   iovs.push_back({&header, sizeof(header)});
-  iovs.push_back({&iovLengths[0],
-       sizeof(iovLengths[0]) * (size_t) header.numIovs});
+  iovs.push_back({&iovLengths[0], sizeof(iovLengths[0]) * header.numIovs});
   iovs.insert(iovs.end(), userIovs.begin(), userIovs.end());
 
   header.totalLength = 0;
diff --git a/paddle/setup.py.in b/paddle/setup.py.in
index da86eb795dc5890a5598f07e9c2bd62e503091f6..02ea9067431c62263d9a3a21d112c9ea64bacbe4 100644
--- a/paddle/setup.py.in
+++ b/paddle/setup.py.in
@@ -17,6 +17,14 @@
 from setuptools import setup, Extension
 import numpy as np
 import api.paddle_ld_flags
+import platform
+
+system = platform.system().lower()
+
+is_osx = (system == 'darwin')
+is_win = (system == 'windows')
+is_lin = (system == 'linux')
+
 
 # The extra links will passed from COMAKE
 #   because generate paddle LDFLAGS is too complicated to do in setup.py
@@ -34,17 +42,24 @@ try:
 except:
   pass
 
+if is_lin == True:
+    extra_links = ["-Xlinker", '-start-group'] + extra_links + ["-Xlinker", "-end-group"]
+elif is_osx == True:
+    extra_links = ["-Wl,-all_load"] + extra_links
+
+include_dirs = [np.get_include(), "../"]    # include numpy and paddle.
+
 setup(name="py_paddle",
   version="@PADDLE_VERSION@",
   ext_modules=[
     Extension('py_paddle._swig_paddle',      # Build SWIG Extension.
-      ['Paddle_wrap.cxx'],
-      extra_link_args=["-Xlinker", '-start-group'] +
-                        extra_links + ["-Xlinker", "-end-group"]
+       ['Paddle_wrap.cxx'],
+       include_dirs = include_dirs,
+       extra_link_args = extra_links
     )
   ],
   packages=['py_paddle'],
-  include_dirs = [np.get_include(), "../"],   # include numpy and paddle.
+  include_dirs = include_dirs,
   install_requires = [
     'numpy>=1.8.0',      # The numpy is required.
     'protobuf>=2.4.1' # The paddle protobuf version
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
index c0e5ec3bd6bd81791e3f59c11630e090230bedf2..275150e12d12b57550ce45355cb3c533b57b4b86 100644
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -28,6 +28,7 @@ limitations under the License. */
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
+#include "paddle/utils/Excepts.h"
 #include "paddle/utils/GlobalConstants.h"
 
 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"
diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp
index dd30b2c8a5b4539466378173cbe5e3e089adc18d..94266639f94ade6b490eb26243dd964ddedf40b9 100644
--- a/paddle/trainer/TrainerMain.cpp
+++ b/paddle/trainer/TrainerMain.cpp
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <fenv.h>
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/StringUtil.h"
+#include "paddle/utils/Excepts.h"
 #include "paddle/pserver/ParameterServer2.h"
 
 #include "ParamUtil.h"
diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/trainer/tests/test_CompareSparse.cpp
index 3070682c0a2ef9cb629b9874dd85af33a1aef3ae..ff37d7b36484031b420c7cdb16ce67cd27440c64 100644
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/trainer/tests/test_CompareSparse.cpp
@@ -146,12 +146,12 @@ TEST(compareSparse, remote_cpu) {
 TEST(compareSparse, cpu10_local_vs_remote) {
   FLAGS_local = 1;  // disable remote sparse update in parameter config
   std::vector<ParameterPtr> localParameters =
-      trainerOnePassTest(configFile1, true, 10);
+      trainerOnePassTest(configFile1, true, 2);
 
   FLAGS_local = 0;  // will enable remote sparse update
   FLAGS_ports_num_for_sparse = 5;
   std::vector<ParameterPtr> remoteParameters =
-      trainerOnePassTest(configFile1, true, 10);
+      trainerOnePassTest(configFile1, true, 2);
 
   compareValue(localParameters, remoteParameters);
 }
@@ -174,7 +174,7 @@ TEST(compareSparse, multiGradientMachine) {
       FLAGS_parallel_nn = useGpu;
       LOG(INFO) << " local=" << local
                 << " useGpu=" << useGpu;
-      int trainerCount = useGpu ? numGpu : 10;
+      int trainerCount = useGpu ? numGpu : 2;
       std::vector<ParameterPtr> parameters =
           trainerOnePassTest(configFile1, true, trainerCount, useGpu);
       compareValue(getDenseParameters(), parameters, eps);
diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp
index 8ca9be71de9ac816f20885c08da06fbf02567648..ad2a715ef89c6f4c4b509e1a8b816699b709c59d 100644
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
@@ -62,7 +62,11 @@ TEST(checkGradient, multiGpu) {
   }
 }
 
-TEST(checkGradient, parallel) { checkGradientTest(configFile4, true, true); }
+TEST(checkGradient, parallel) {
+  if (hl_get_device_count() >= 2) {
+    checkGradientTest(configFile4, true, true);
+  }
+}
 
 TEST(checkGradient, multiParallel) {
   FLAGS_allow_only_one_model_on_one_gpu = false;
@@ -90,7 +94,11 @@ TEST(checkGradient, multi) {
 TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }
 
 TEST(checkGradient, chunk) {
+#if defined(__APPLE__) || defined (__OSX__)
+  EXPECT_EQ(0, system("python trainer/tests/gen_proto_data.py"));
+#else
   EXPECT_EQ(0, system("python2 trainer/tests/gen_proto_data.py"));
+#endif
   checkGradientTest(configFile3, false, false);
 #ifndef PADDLE_ONLY_CPU
   checkGradientTest(configFile3, true, true);
diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp
index 6d8b8e0ca5c98b0526f36d52e8084a87ac09d87c..4554b94485f99f1fea1ebef8f5ae8a59b630d106 100644
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ b/paddle/trainer/tests/test_TrainerOnePass.cpp
@@ -82,7 +82,11 @@ TEST(trainerOnePass, gpu2) { trainerOnePassTest(configFile1, true, false, 2); }
 
 TEST(trainerOnePass, gpu4) { trainerOnePassTest(configFile1, true, false, 4); }
 
-TEST(trainerOnePass, parallel) { trainerOnePassTest(configFile2, true, true); }
+TEST(trainerOnePass, parallel) {
+  if (hl_get_device_count() >= 2) {
+    trainerOnePassTest(configFile2, true, true);
+  }
+}
 #endif
 
 // 2. test average_window.
diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt
index 3c08f1e3055f86aadca8844094381909e86df0c1..0557b01e36f078bebebbcb65af95357c96369514 100644
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
@@ -2,12 +2,18 @@
 
 file(GLOB UTIL_HEADERS . *.h)
 file(GLOB UTIL_SOURCES . *.cpp)
-
+if(APPLE)
+    file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp)
+else()
+    file(GLOB UTIL_ARCH_SOURCES . arch/linux/*.cpp)
+endif()
 add_library(paddle_utils STATIC
-        ${UTIL_SOURCES})
+        ${UTIL_SOURCES}
+        ${UTIL_ARCH_SOURCES})
 add_style_check_target(paddle_utils ${UTIL_HEADERS})
-add_style_check_target(paddle_utils ${UTIL_SOURCES})
+add_style_check_target(paddle_utils ${UTIL_SOURCES}
+    ${UTIL_ARCH_SOURCES})
 add_dependencies(paddle_utils gen_proto_cpp)
 if(WITH_TESTING)
     add_subdirectory(tests)
-endif()
\ No newline at end of file
+endif()
diff --git a/paddle/utils/Excepts.cpp b/paddle/utils/Excepts.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9123508fc78d002a9fc5fd0e7e9da8ddec975d6f
--- /dev/null
+++ b/paddle/utils/Excepts.cpp
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Excepts.h"
+
+#if defined(__APPLE__) || defined(__OSX__)
+
+#include <fenv.h>
+
+int fegetexcept(void) {  // returns the enabled exception set, or -1 on error
+  fenv_t fenv;  // automatic, not static, so concurrent callers cannot race
+  return fegetenv(&fenv) ? -1 : (~fenv.__control & FE_ALL_EXCEPT);
+}
+
+// Enables trapping for `excepts`; returns the previously enabled set or -1.
+int feenableexcept(unsigned int excepts) {
+  fenv_t fenv;  // automatic, not static, so concurrent callers cannot race
+  const unsigned int new_excepts = excepts & FE_ALL_EXCEPT;
+  if (fegetenv(&fenv)) return -1;
+  // A set mask bit means "disabled", so invert to report what was enabled.
+  const unsigned int old_excepts = ~fenv.__control & FE_ALL_EXCEPT;
+
+  // unmask in both the x87 control word and the SSE MXCSR register
+  fenv.__control &= ~new_excepts;
+  fenv.__mxcsr   &= ~(new_excepts << 7);
+  return fesetenv(&fenv) ? -1 : old_excepts;
+}
+
+// Disables trapping for `excepts`; returns the previously enabled set or -1.
+int fedisableexcept(unsigned int excepts) {
+  fenv_t fenv;  // automatic, not static, so concurrent callers cannot race
+  const unsigned int new_excepts = excepts & FE_ALL_EXCEPT;
+  if (fegetenv(&fenv)) return -1;
+  // A set mask bit means "disabled", so invert to report what was enabled.
+  const unsigned int old_excepts = ~fenv.__control & FE_ALL_EXCEPT;
+
+  // mask in both the x87 control word and the SSE MXCSR register
+  fenv.__control |= new_excepts;
+  fenv.__mxcsr   |= new_excepts << 7;
+  return fesetenv(&fenv) ? -1 : old_excepts;
+}
+
+#endif
diff --git a/paddle/utils/Excepts.h b/paddle/utils/Excepts.h
new file mode 100644
index 0000000000000000000000000000000000000000..a84a2d33a6a3d0664218151befd6b2af44f72a97
--- /dev/null
+++ b/paddle/utils/Excepts.h
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef EXCEPTS_H_
+#define EXCEPTS_H_
+
+#if defined(__APPLE__) || defined(__OSX__)
+
+int fegetexcept(void);
+int feenableexcept(unsigned int excepts);
+int fedisableexcept(unsigned int excepts);
+
+#endif
+
+#endif  // EXCEPTS_H_
diff --git a/paddle/utils/Locks.h b/paddle/utils/Locks.h
index 085aca508dbbec4b290b99722d8b907528006b78..1fc0363d34597c9447996479aaf771e46d0ba600 100644
--- a/paddle/utils/Locks.h
+++ b/paddle/utils/Locks.h
@@ -16,13 +16,12 @@ limitations under the License. */
 #pragma once
 
 #include <pthread.h>
-#include <semaphore.h>
 #include <sys/time.h>
-#include <unistd.h>
-
 #include <condition_variable>
 #include <mutex>
 
+#include "DisableCopy.h"
+
 namespace paddle {
 
 /**
@@ -98,35 +97,44 @@ protected:
  * which means it will keep trying to lock until lock on successfully.
  * The SpinLock disable copy.
  */
+class SpinLockPrivate;
 class SpinLock {
 public:
-  SpinLock() { pthread_spin_init(&lock_, 0); }
-  ~SpinLock() { pthread_spin_destroy(&lock_); }
-  SpinLock(const SpinLock&) = delete;
-  SpinLock& operator=(const SpinLock&) = delete;
+  DISABLE_COPY(SpinLock);
+  SpinLock();
+  ~SpinLock();
 
   // std::mutext interface
-  void lock() { pthread_spin_lock(&lock_); }
-  void unlock() { pthread_spin_unlock(&lock_); }
+  void lock();
+  void unlock();
 
-protected:
-  pthread_spinlock_t lock_;
-  char padding_[64 - sizeof(pthread_spinlock_t)];
+private:
+  SpinLockPrivate* m;
 };
 
 /**
  * A simple wapper of semaphore which can only be shared in the same process.
  */
+class SemaphorePrivate;
 class Semaphore {
+public:
+  //! Disable copy & assign
+  Semaphore(const Semaphore& other) = delete;
+  Semaphore& operator=(const Semaphore& other) = delete;
+
+  //! Enable move. NOTE(review): other.m still aliases m after the move;
+  //! confirm ~Semaphore() cannot release the same handle twice.
+  Semaphore(Semaphore&& other): m(std::move(other.m)) {}
+
 public:
   /**
    * @brief Construct Function. 
    * @param[in] initValue the initial value of the 
    * semaphore, default 0.
    */
-  explicit Semaphore(int initValue = 0) { sem_init(&sem_, 0, initValue); }
+  explicit Semaphore(int initValue = 0);
 
-  ~Semaphore() { sem_destroy(&sem_); }
+  ~Semaphore();
 
   /**
    * @brief The same as wait(), except if the decrement can not 
@@ -136,41 +144,38 @@ public:
    * @return ture if the decrement proceeds before ts, 
    * else return false.
    */
-  bool timeWait(struct timespec* ts) { return (0 == sem_timedwait(&sem_, ts)); }
+  bool timeWait(struct timespec* ts);
 
   /**
    * @brief decrement the semaphore. If the semaphore's value is 0, then call blocks.
    */
-  void wait() { sem_wait(&sem_); }
+  void wait();
 
   /**
    * @brief increment the semaphore. If the semaphore's value 
    * greater than 0, wake up a thread blocked in wait().
    */
-  void post() { sem_post(&sem_); }
+  void post();
 
-protected:
-  sem_t sem_;
+private:
+  SemaphorePrivate* m;
 };
 
-static_assert(sizeof(SpinLock) == 64, "Wrong padding");
-
 /**
  * A simple wrapper of thread barrier.
  * The ThreadBarrier disable copy.
  */
+class ThreadBarrierPrivate;
 class ThreadBarrier {
 public:
+  DISABLE_COPY(ThreadBarrier);
+
   /**
    * @brief Construct Function. Initialize the barrier should
    * wait for count threads in wait().
    */
-  explicit ThreadBarrier(int count) {
-    pthread_barrier_init(&barrier_, NULL, count);
-  }
-  ~ThreadBarrier() { pthread_barrier_destroy(&barrier_); }
-  ThreadBarrier(const ThreadBarrier&) = delete;
-  ThreadBarrier& operator=(const ThreadBarrier&) = delete;
+  explicit ThreadBarrier(int count);
+  ~ThreadBarrier();
 
   /**
    * @brief . 
@@ -178,10 +183,10 @@ public:
    * then wake up all the count - 1 threads and continue run together. 
    * Else block the thread until waked by other thread .
    */
-  void wait() { pthread_barrier_wait(&barrier_); }
+  void wait();
 
-protected:
-  pthread_barrier_t barrier_;
+private:
+  ThreadBarrierPrivate* m;
 };
 
 /**
diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h
index 928db486fa9e926ce4fb7aec5aa30075ccd3d7ee..db02d1252b4057dbfdcc7c894b4a23bc5561732b 100644
--- a/paddle/utils/PythonUtil.h
+++ b/paddle/utils/PythonUtil.h
@@ -18,6 +18,12 @@ limitations under the License. */
 #ifndef PADDLE_NO_PYTHON
 // must include the following two blocks, otherwise,
 // gcc compiler may produce warning
+#ifdef __APPLE__
+#define _POSIX_SOURCE
+#define _POSIX_C_SOURCE 200809L
+#define _XOPEN_SOURCE 700
+#endif
+
 #ifdef _POSIX_C_SOURCE
 #define __TEMP_POSIX_C_SOURCE _POSIX_C_SOURCE
 #undef _POSIX_C_SOURCE
@@ -28,12 +34,7 @@ limitations under the License. */
 #endif
 #include <Python.h>
 #include <frameobject.h>
-#ifndef _POSIX_C_SOURCE
-#warning "no _POSIX_C_SOURCE defined in Python.h"
-#endif
-#ifndef _XOPEN_SOURCE
-#warning "no _XOPEN_SOURCE defined in Python.h"
-#endif
+
 #endif
 
 #include "paddle/utils/Util.h"
diff --git a/paddle/utils/Stat.cpp b/paddle/utils/Stat.cpp
index 14aae6909d40446a1fd2c190a7f842df4e3ab3af..d7b20ca5eb2f4eadaa6b4acad056d669a9b59c14 100644
--- a/paddle/utils/Stat.cpp
+++ b/paddle/utils/Stat.cpp
@@ -13,24 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "Stat.h"
-
-#include <sys/syscall.h>  // for syscall()
-#include <sys/types.h>
+#include "Util.h"
 #include <iomanip>
 #include <algorithm>
 
 namespace paddle {
 
-// return the thread id used by glog
-pid_t getTID() {
-#ifndef __NR_gettid
-#define __NR_gettid 224
-#endif
-  pid_t tid = syscall(__NR_gettid);
-  CHECK_NE(tid, -1);
-  return tid;
-}
-
 StatSet globalStat("GlobalStatInfo");
 
 void Stat::addSample(uint64_t value) {
diff --git a/paddle/utils/Thread.h b/paddle/utils/Thread.h
index 3e1d95ab1fcde9d9f1450ec52bf816c517473527..f6c826a1eeb656ff852c70f70b85c3b00a6a5e8b 100644
--- a/paddle/utils/Thread.h
+++ b/paddle/utils/Thread.h
@@ -13,13 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include "Util.h"
 #include "Logging.h"
 #include <thread>
 
-#include <sys/syscall.h>
-#include <unistd.h>
-inline pid_t gettid() { return syscall(SYS_gettid); }
-
 #include "Queue.h"
 #include "ThreadLocal.h"
 
@@ -175,7 +172,7 @@ public:
         jobFinishBarrier_(numWorkers + 1),
         jobFunc_(nullptr),
         checkOwner_(checkOwner) {
-    ownerThreadId_ = ::gettid();
+    ownerThreadId_ = getTID();
     workers_.resize(numWorkers);
     start();
   }
@@ -199,7 +196,7 @@ public:
    */
   void exec(JobFunc jobFunc, JobFunc ownerFunc = nullptr) {
     if (checkOwner_) {
-      CHECK_EQ(ownerThreadId_, ::gettid())
+      CHECK_EQ(ownerThreadId_, getTID())
           << "this sync thread pool should be used in one thread";
     }
 
diff --git a/paddle/utils/ThreadLocal.cpp b/paddle/utils/ThreadLocal.cpp
index a4b399d144ee39def73e446ae6be910e8e5a422c..0f948f1029af85c97d2564a089b7bf878244643c 100644
--- a/paddle/utils/ThreadLocal.cpp
+++ b/paddle/utils/ThreadLocal.cpp
@@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "Util.h"
 #include "ThreadLocal.h"
-
-#include "Thread.h"
-
 #include "CommandLineParser.h"
 
 P_DEFINE_bool(thread_local_rand_use_global_seed, false,
@@ -31,11 +29,11 @@ unsigned int* ThreadLocalRand::getSeed() {
   if (!p) {  // init seed
     if (FLAGS_thread_local_rand_use_global_seed) {
       p = new unsigned int(defaultSeed_);
-    } else if (getpid() == gettid()) {  // main thread
+    } else if (getpid() == getTID()) {  // main thread
       // deterministic, but differs from global srand()
       p = new unsigned int(defaultSeed_ - 1);
     } else {
-      p = new unsigned int(defaultSeed_ + gettid());
+      p = new unsigned int(defaultSeed_ + getTID());
       LOG(INFO) << "thread use undeterministic rand seed:" << *p;
     }
     seed_.set(p);
@@ -51,7 +49,7 @@ std::default_random_engine& ThreadLocalRandomEngine::get() {
     int defaultSeed = ThreadLocalRand::getDefaultSeed();
     engine->seed(FLAGS_thread_local_rand_use_global_seed
                      ? defaultSeed
-                     : defaultSeed + gettid());
+                     : defaultSeed + getTID());
     engine_.set(engine);
   }
   return *engine;
diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h
index e782868f69a5d0f1a1aeaca1ed2a76001eeb4721..686a1a99a4aa0645cd1a9c636afb9c2c5f50fa58 100644
--- a/paddle/utils/ThreadLocal.h
+++ b/paddle/utils/ThreadLocal.h
@@ -156,7 +156,15 @@ private:
   static void dataDestructor(void* p) { delete (T*)p; }
 
   void updateMap(T* p) {
-    pid_t tid = syscall(SYS_gettid);
+#if defined(__APPLE__) || defined(__OSX__)
+    pid_t tid = syscall(SYS_thread_selfid);
+#else
+    #ifndef __NR_gettid
+    #define __NR_gettid 224
+    #endif
+    pid_t tid = syscall(__NR_gettid);
+#endif
+    CHECK_NE(tid, -1);
     std::lock_guard<std::mutex> guard(mutex_);
     auto ret = threadMap_.insert(std::make_pair(tid, p));
     if (!ret.second) {
diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp
index d8c3376fb18c48185abdcb7a6d65fa56f0eaa290..c3c76f907d40e0ccebddbe73f183af326dac8b8c 100644
--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
@@ -93,6 +93,19 @@ static void installProfilerSwitch() {}
 
 namespace paddle {
 
+pid_t getTID() {  // id of the calling thread as reported by the kernel
+#if defined(__APPLE__) || defined(__OSX__)
+  const pid_t tid = syscall(SYS_thread_selfid);
+#else
+#ifndef __NR_gettid
+#define __NR_gettid 224  // fallback when the headers lack the constant
+#endif
+  const pid_t tid = syscall(__NR_gettid);
+#endif
+  CHECK_NE(tid, -1);  // -1 is how syscall() reports failure
+  return tid;
+}
+
 static bool g_initialized = false;
 typedef std::pair<int, std::function<void()>> PriorityFuncPair;
 typedef std::vector<PriorityFuncPair> InitFuncList;
diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h
index 57839f2e215738bbc3855484512480af8f8a0e0d..2adb626c83f94c7c5d7a8d53653a46090e19e7b7 100644
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
@@ -24,6 +24,8 @@ limitations under the License. */
 #include <unordered_map>
 #include <mutex>
 #include <functional>
+#include <sys/syscall.h>  // for syscall()
+#include <sys/types.h>
 
 #include "CommandLineParser.h"
 #include "Logging.h"
@@ -63,6 +65,9 @@ limitations under the License. */
 
 namespace paddle {
 
+// return the thread id used by glog
+pid_t getTID();
+
 /**
  * return the 1-based index of the highest bit set
  *
diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..347ae64c26dfdfcdaff62886481c20e9c4c7cfec
--- /dev/null
+++ b/paddle/utils/arch/linux/Locks.cpp
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/utils/Locks.h"
+#include <pthread.h>
+#include <semaphore.h>
+#include <unistd.h>
+
+namespace paddle {
+
+// Linux state behind Semaphore: one unnamed POSIX semaphore.
+class SemaphorePrivate {
+public:
+  sem_t sem;
+};
+
+// Allocates the private state and sets the semaphore's count to initValue.
+Semaphore::Semaphore(int initValue): m(new SemaphorePrivate()) {
+  // pshared == 0: the semaphore is not shared with other processes.
+  sem_init(&m->sem, 0, initValue);
+}
+
+// Destroys the semaphore and frees the private state.
+Semaphore::~Semaphore() {
+  sem_destroy(&m->sem);
+  // m is a raw owning pointer; without this delete every Semaphore leaked
+  // its SemaphorePrivate.
+  delete m;
+}
+
+// Waits until the semaphore is acquired or the absolute time *ts passes.
+// Returns true only if the semaphore was acquired.
+bool Semaphore::timeWait(struct timespec* ts) {
+  return (0 == sem_timedwait(&m->sem, ts));
+}
+
+// Blocks until the semaphore's count can be decremented.
+void Semaphore::wait() {
+  sem_wait(&m->sem);
+}
+
+// Increments the semaphore's count, letting one waiter proceed.
+void Semaphore::post() {
+  sem_post(&m->sem);
+}
+
+// Linux state behind SpinLock: a pthread spin lock whose init/destroy follow
+// this object's lifetime.
+class SpinLockPrivate {
+public:
+  inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); }
+  inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); }
+  pthread_spinlock_t lock_;
+  // Pads lock_ out to 64 bytes.
+  char padding_[64 - sizeof(pthread_spinlock_t)];
+};
+
+SpinLock::SpinLock(): m(new SpinLockPrivate()) {}
+
+SpinLock::~SpinLock() { delete m; }
+
+void SpinLock::lock() {
+  pthread_spin_lock(&m->lock_);
+}
+
+void SpinLock::unlock() {
+  pthread_spin_unlock(&m->lock_);
+}
+
+// Linux state behind ThreadBarrier: the native pthread barrier.
+class ThreadBarrierPrivate {
+public:
+  pthread_barrier_t barrier_;
+};
+
+// Creates a barrier that releases once count threads are waiting on it.
+ThreadBarrier::ThreadBarrier(int count): m(new ThreadBarrierPrivate()) {
+  pthread_barrier_init(&m->barrier_, nullptr, count);
+}
+
+ThreadBarrier::~ThreadBarrier() {
+  pthread_barrier_destroy(&m->barrier_);
+  delete m;
+}
+
+// Blocks until the number of threads given at construction have arrived.
+void ThreadBarrier::wait() {
+  pthread_barrier_wait(&m->barrier_);
+}
+
+}  // namespace paddle
diff --git a/paddle/utils/arch/osx/Locks.cpp b/paddle/utils/arch/osx/Locks.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..47e44e9d7c114cc2a8b038c816b73511c048d82e
--- /dev/null
+++ b/paddle/utils/arch/osx/Locks.cpp
@@ -0,0 +1,140 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/utils/Locks.h"
+#include "paddle/utils/Logging.h"
+#include <dispatch/dispatch.h>
+#include <libkern/OSAtomic.h>
+#include <pthread.h>
+
+namespace paddle {
+
+// OSX state behind Semaphore: a libdispatch semaphore that is released when
+// this object is destroyed.
+class SemaphorePrivate {
+public:
+  ~SemaphorePrivate() {
+    dispatch_release(sem);
+  }
+
+  dispatch_semaphore_t sem;
+};
+
+Semaphore::Semaphore(int initValue): m(new SemaphorePrivate()) {
+  // libdispatch raises EXC_BAD_INSTRUCTION when a semaphore is disposed while
+  // its count is below the value it was created with (e.g. Semaphore(1),
+  // wait(), destroy). Creating it at 0 and signalling initValue times yields
+  // the same starting count without arming that check.
+  m->sem = dispatch_semaphore_create(0);
+  for (int i = 0; i < initValue; ++i) {
+    dispatch_semaphore_signal(m->sem);
+  }
+}
+
+Semaphore::~Semaphore() {
+  delete m;
+}
+
+// Waits until the semaphore is acquired or the wall-clock time *ts passes.
+// Returns true if the semaphore was acquired, false if the wait timed out.
+bool Semaphore::timeWait(timespec *ts) {
+  dispatch_time_t tm = dispatch_walltime(ts, 0);
+  return (0 == dispatch_semaphore_wait(m->sem, tm));
+}
+
+// Blocks without a timeout until the semaphore is acquired.
+void Semaphore::wait() {
+  dispatch_semaphore_wait(m->sem, DISPATCH_TIME_FOREVER);
+}
+
+// Increments the semaphore, waking a waiter if there is one.
+void Semaphore::post() {
+  dispatch_semaphore_signal(m->sem);
+}
+
+// OSX state behind SpinLock: an OSSpinLock padded out to 64 bytes.
+class SpinLockPrivate {
+public:
+  SpinLockPrivate(): lock_(OS_SPINLOCK_INIT) {}
+
+  OSSpinLock lock_;
+  char padding_[64 - sizeof(OSSpinLock)];  // Padding to cache line size
+};
+
+SpinLock::SpinLock(): m(new SpinLockPrivate()) {}
+SpinLock::~SpinLock() { delete m; }
+
+void SpinLock::lock() {
+  OSSpinLockLock(&m->lock_);
+}
+
+void SpinLock::unlock() {
+  OSSpinLockUnlock(&m->lock_);
+}
+
+// OSX state behind ThreadBarrier: a reusable barrier built from a mutex and a
+// condition variable.
+class ThreadBarrierPrivate {
+public:
+  pthread_mutex_t mutex_;
+  pthread_cond_t cond_;
+  int count_;                // threads that have arrived in the current round
+  int tripCount_;            // arrivals needed to release a round
+  unsigned int generation_;  // bumped every time a round is released
+
+  inline explicit ThreadBarrierPrivate(int cnt)
+      : count_(0), tripCount_(cnt), generation_(0) {
+    CHECK_NE(cnt, 0);
+    // pthread_*_init return 0 on success and a positive error number on
+    // failure, so ">= 0" could never fail; compare against 0 exactly.
+    CHECK_EQ(pthread_mutex_init(&mutex_, 0), 0);
+    CHECK_EQ(pthread_cond_init(&cond_, 0), 0);
+  }
+
+  inline ~ThreadBarrierPrivate() {
+    pthread_cond_destroy(&cond_);
+    pthread_mutex_destroy(&mutex_);
+  }
+
+  /**
+   * @brief Block until tripCount_ threads have called wait().
+   * @return true for the thread whose arrival released the round.
+   */
+  inline bool wait() {
+    pthread_mutex_lock(&mutex_);
+    ++count_;
+    if (count_ >= tripCount_) {
+      // Last arrival: reset for the next round and release everybody.
+      count_ = 0;
+      ++generation_;
+      pthread_cond_broadcast(&cond_);
+      pthread_mutex_unlock(&mutex_);
+      return true;
+    }
+    // pthread_cond_wait may wake spuriously; only leave once the round this
+    // thread arrived in has actually been released.
+    const unsigned int arrival = generation_;
+    while (arrival == generation_) {
+      pthread_cond_wait(&cond_, &mutex_);
+    }
+    pthread_mutex_unlock(&mutex_);
+    return false;
+  }
+};
+
+ThreadBarrier::ThreadBarrier(int count): m(new ThreadBarrierPrivate(count)) {}
+ThreadBarrier::~ThreadBarrier() { delete m; }
+void ThreadBarrier::wait() { m->wait(); }
+
+}  // namespace paddle
diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt
index be59a785ecf366dc38a01ac53642eb137abec798..51f18893928455308a2331fa5061f9849019432c 100644
--- a/paddle/utils/tests/CMakeLists.txt
+++ b/paddle/utils/tests/CMakeLists.txt
@@ -3,12 +3,15 @@ add_simple_unittest(test_Logging)
 add_simple_unittest(test_Thread)
 add_simple_unittest(test_StringUtils)
 add_simple_unittest(test_CustomStackTrace)
+add_simple_unittest(test_ThreadBarrier)
 
 add_executable(
     test_CustomStackTracePrint
     test_CustomStackTracePrint.cpp
 )
 link_paddle_exe(test_CustomStackTracePrint)
-add_test(NAME test_CustomStackTracePrint
-    COMMAND ${PROJ_ROOT}/paddle/utils/tests/test_CustomStackTracePrint.sh
-    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+if(NOT APPLE)
+    add_test(NAME test_CustomStackTracePrint
+        COMMAND ${PROJ_ROOT}/paddle/utils/tests/test_CustomStackTracePrint.sh
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif()
diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp
index 26ca4c678a650df50d372b0fbb4c3e03d52f91df..3e665021471cb3c179b13960dcc9f2284a0d664c 100644
--- a/paddle/utils/tests/test_CustomStackTrace.cpp
+++ b/paddle/utils/tests/test_CustomStackTrace.cpp
@@ -45,6 +45,7 @@ void testNormalImpl(const std::function<void(
   size_t cntDown = countDown;
   while (cntDown-- > 0) {
     startBarrier.wait();
+    sleep(1);
     doneBarrier.wait();
     ASSERT_TRUE(tracer.empty());
   }
diff --git a/paddle/utils/tests/test_ThreadBarrier.cpp b/paddle/utils/tests/test_ThreadBarrier.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..90bd6c21bc8e5ac05b248a0517f9e4fb43d04054
--- /dev/null
+++ b/paddle/utils/tests/test_ThreadBarrier.cpp
@@ -0,0 +1,77 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <functional>
+#include <mutex>
+#include <set>
+#include <thread>
+#include <vector>
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/CommandLineParser.h"
+#include "paddle/utils/Util.h"
+#include "paddle/utils/Locks.h"
+
+P_DEFINE_int32(test_thread_num, 100, "testing thread number");
+
+// Runs callback concurrently on thread_num threads and joins them all.
+// Every thread receives the same mutex, thread-id set and ThreadBarrier;
+// the barrier is constructed for exactly thread_num participants.
+void testNormalImpl(size_t thread_num,
+                    const std::function<void(size_t,
+                    std::mutex&, std::set<std::thread::id>&,
+                    paddle::ThreadBarrier&)>& callback) {
+  std::mutex mutex;
+  std::set<std::thread::id> tids;
+  paddle::ThreadBarrier barrier(thread_num);
+
+  std::vector<std::thread> threads;
+  threads.reserve(thread_num);
+  for (size_t i = 0; i < thread_num; ++i) {
+    threads.emplace_back([&thread_num, &mutex,
+                          &tids, &barrier, &callback]{
+      callback(thread_num, mutex, tids, barrier);
+    });
+  }
+
+  for (auto& thread : threads) {
+    thread.join();
+  }
+}
+
+// Each thread records its id and then waits on the barrier. If the barrier
+// only releases once all participants have arrived, every thread must see
+// the full set of ids afterwards.
+TEST(ThreadBarrier, normalTest) {
+  for (auto &thread_num : {10, 30, 50, 100, 300, 1000}) {
+    testNormalImpl(thread_num,
+                  [](size_t thread_num, std::mutex& mutex,
+                  std::set<std::thread::id>& tids,
+                  paddle::ThreadBarrier& barrier){
+      {
+        std::lock_guard<std::mutex> guard(mutex);
+        tids.insert(std::this_thread::get_id());
+      }
+      barrier.wait();
+      // Check whether all threads reach this point or not
+      CHECK_EQ(tids.size(), thread_num);
+    });
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  paddle::initMain(argc, argv);
+  return RUN_ALL_TESTS();
+}