diff --git a/.gitignore b/.gitignore index 00368ede67d3d2426f50a278578a33d18b736ca0..801c76325c92e8c1b381f7398416ab703e829ae6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ *.DS_Store build/ +*.user diff --git a/CMakeLists.txt b/CMakeLists.txt index 007f1f18bb655e10c45bc7e21a299861eb204dd2..99c6c0d373052fa1be528ebb82c3d2f248e64bb0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,7 @@ find_package(CUDA QUIET) find_package(Protobuf REQUIRED) find_package(PythonLibs 2.7 REQUIRED) find_package(PythonInterp 2.7 REQUIRED) +find_package(ZLIB REQUIRED) find_package(NumPy) find_package(Threads REQUIRED) find_package(Glog) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index c95d4063105e74df7acd491558f399dff6a8e0a8..4b99e7f7fb6af3ca85d6dc642a4aea07e64c7049 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -58,8 +58,8 @@ set(COMMON_FLAGS -fPIC -fno-omit-frame-pointer -Wall -# -Wextra -# -Werror + -Wextra + -Werror -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter diff --git a/cmake/util.cmake b/cmake/util.cmake index f3227d27c53c24e0e2e6a68eccecb188b1f1c14d..5f2f4a075cc579fac827fefbfc30f6743d2e4cc9 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -1,19 +1,3 @@ -# MAC OS does not contain start-up and whole-archive args -if(APPLE) - set(GROUP_START "") - set(GROUP_END "") - - set(ARCHIVE_START "") - set(ARCHIVE_END "") -else() - set(GROUP_START "-Wl,--start-group") - set(GROUP_END "-Wl,--end-group") - - set(ARCHIVE_START "-Wl,--whole-archive") - set(ARCHIVE_END "-Wl,--no-whole-archive") -endif() - - # Some common routine for paddle compile. # target_circle_link_libraries @@ -23,17 +7,46 @@ endif() # Rest Arguments: libraries which link together. function(target_circle_link_libraries TARGET_NAME) if(APPLE) + set(LIBS) + set(inArchive OFF) + set(libsInArgn) + foreach(arg ${ARGN}) - list(APPEND OSX_LIBRARIES "-Wl,-force_load" "${arg}") + if(${arg} STREQUAL "ARCHIVE_START") + set(inArchive ON) + elseif(${arg} STREQUAL "ARCHIVE_END") + set(inArchive OFF) + else() + if(inArchive) + list(APPEND LIBS "-Wl,-force_load") + endif() + list(APPEND LIBS ${arg}) + list(APPEND libsInArgn ${arg}) + endif() endforeach() + + list(REVERSE libsInArgn) target_link_libraries(${TARGET_NAME} - ${OSX_LIBRARIES} -lz) - else() + ${LIBS} + ${libsInArgn}) + + else() # LINUX + set(LIBS) + + foreach(arg ${ARGN}) + if(${arg} STREQUAL "ARCHIVE_START") + list(APPEND LIBS "-Wl,--whole-archive") + elseif(${arg} STREQUAL "ARCHIVE_END") + list(APPEND LIBS "-Wl,--no-whole-archive") + else() + list(APPEND LIBS ${arg}) + endif() + endforeach() + target_link_libraries(${TARGET_NAME} - ${GROUP_START} - ${ARGN} - -lz - ${GROUP_END}) + "-Wl,--start-group" + ${LIBS} + "-Wl,--end-group") endif() endfunction() @@ -65,20 +78,20 @@ function(link_paddle_exe TARGET_NAME) if(PADDLE_WITH_INTERNAL) set(INTERAL_LIBS paddle_internal_gserver paddle_internal_parameter) target_circle_link_libraries(${TARGET_NAME} - ${ARCHIVE_START} + ARCHIVE_START paddle_internal_gserver paddle_internal_owlqn - ${ARCHIVE_END} + ARCHIVE_END paddle_internal_parameter) else() set(INTERAL_LIBS "") endif() target_circle_link_libraries(${TARGET_NAME} - ${ARCHIVE_START} + ARCHIVE_START paddle_gserver ${METRIC_LIBS} - ${ARCHIVE_END} + ARCHIVE_END paddle_pserver paddle_trainer_lib paddle_network @@ -91,8 +104,10 @@ function(link_paddle_exe TARGET_NAME) ${PROTOBUF_LIBRARY} ${CMAKE_THREAD_LIBS_INIT} ${CBLAS_LIBS} + ${INTERAL_LIBS} + ${ZLIB_LIBRARIES} ${CMAKE_DL_LIBS} - ${INTERAL_LIBS}) + ) if(WITH_PYTHON) 
target_link_libraries(${TARGET_NAME} diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index 903922204343ec49a1696eb35b06c127310b432d..0f497e44d4c25c2f8b79a1ee3f0630456d587820 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -277,6 +277,7 @@ void NeuralNetwork::getState(MachineState& machineState) { } void NeuralNetwork::backward(const UpdateCallback& callback) { + gLayerStackTrace.pop(""); // tell layer trace is during backward. FOR_EACH_R(layer, layers_) { REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str()); if ((*layer)->needGradient()) { diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h index ca8eadbc1aa42265c1e505d41e1a134b0e7f19b9..f7aa60380f23eeea91ee852480862f6b19caedec 100644 --- a/paddle/math/Allocator.h +++ b/paddle/math/Allocator.h @@ -49,7 +49,7 @@ public: */ virtual void* alloc(size_t size) { void* ptr; - posix_memalign(&ptr, 32ul, size); + CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0); CHECK(ptr) << "Fail to allocate CPU memory: size=" << size; return ptr; } diff --git a/paddle/math/tests/test_SIMDFunctions.cpp b/paddle/math/tests/test_SIMDFunctions.cpp index bae5d8c684d8920b738fcd1cda1bd831f2333605..491b0cda7b9e1a13882aee6621e0de984709ae80 100644 --- a/paddle/math/tests/test_SIMDFunctions.cpp +++ b/paddle/math/tests/test_SIMDFunctions.cpp @@ -38,7 +38,7 @@ static std::mt19937 RandomEngine(time(0)); inline static std::unique_ptr NewVector(size_t len = VECTOR_LEN, size_t align = ALIGN) { float* ptr; - posix_memalign((void**)&ptr, align, len * sizeof(float)); + CHECK_EQ(posix_memalign((void**)&ptr, align, len * sizeof(float)), 0); return std::unique_ptr(ptr); } diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp index 050f2ca9ced80d18bef2a83f087ee7aa88fa736d..51e346fef91bf8dae7a5084ba8b2d86d9021650b 100644 --- a/paddle/math/tests/test_perturbation.cpp +++ b/paddle/math/tests/test_perturbation.cpp @@ -254,10 +254,4 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else - -int main(int argc, char const* argv[]) { - return 0; -} - #endif diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp index 4f92aec1d967169e2c27579b8a95941c029a3e49..1a22abf7cf80157039f6147293e7648d654e45f7 100644 --- a/paddle/parameter/tests/test_common.cpp +++ b/paddle/parameter/tests/test_common.cpp @@ -125,9 +125,11 @@ TEST_F(CommonTest, sgdUpdate) { const size_t alignHeader[] = {0, 2, 3, 5, 7, 8}; for (auto& size : sizeVec_) { real *gradientBuffer, *valueBuffer, *momentumBuffer; - posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size); - posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size); - posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size); + CHECK_EQ(posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size), + 0); + CHECK_EQ(posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size), 0); + CHECK_EQ(posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size), + 0); for (size_t i = 0; i < size; i++) { gradientBuffer[i] = 1.0; diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt index 3c08f1e3055f86aadca8844094381909e86df0c1..0557b01e36f078bebebbcb65af95357c96369514 100644 --- a/paddle/utils/CMakeLists.txt +++ b/paddle/utils/CMakeLists.txt @@ -2,12 +2,18 @@ file(GLOB UTIL_HEADERS . *.h) file(GLOB UTIL_SOURCES . *.cpp) - +if(APPLE) + file(GLOB UTIL_ARCH_SOURCES . 
arch/osx/*.cpp)
+else()
+  file(GLOB UTIL_ARCH_SOURCES . arch/linux/*.cpp)
+endif()
 add_library(paddle_utils STATIC
-        ${UTIL_SOURCES})
+        ${UTIL_SOURCES}
+        ${UTIL_ARCH_SOURCES})
 add_style_check_target(paddle_utils ${UTIL_HEADERS})
-add_style_check_target(paddle_utils ${UTIL_SOURCES})
+add_style_check_target(paddle_utils ${UTIL_SOURCES}
+        ${UTIL_ARCH_SOURCES})
 add_dependencies(paddle_utils gen_proto_cpp)
 if(WITH_TESTING)
   add_subdirectory(tests)
-endif()
\ No newline at end of file
+endif()
diff --git a/paddle/utils/CustomStackTrace.cpp b/paddle/utils/CustomStackTrace.cpp
index 50d7f5402f586771194fa5b1578293b7614ea1f2..232a478ecd93a7dcb7da7b02a5a1af37a1d1bc43 100644
--- a/paddle/utils/CustomStackTrace.cpp
+++ b/paddle/utils/CustomStackTrace.cpp
@@ -14,9 +14,44 @@ limitations under the License. */
 
 #include "CustomStackTrace.h"
+#include "CommandLineParser.h"
+#include <iostream>
+
+P_DEFINE_bool(layer_stack_error_only_current_thread,
+              true,
+              "Dump current thread or whole process layer stack when signal error "
+              "occurred. true means only dump current thread layer stack");
 
 namespace paddle {
 
 CustomStackTrace<std::string> gLayerStackTrace;
+static std::mutex gLayerStackTraceMtx;
+void installLayerStackTracer() {
+  logging::installFailureWriter([](const char* data, int sz) {
+    std::lock_guard<std::mutex> guard(gLayerStackTraceMtx);
+    if (!gLayerStackTrace.empty()) {
+      size_t curTid = -1UL;
+      std::hash<std::thread::id> hasher;
+      gLayerStackTrace.dump([&curTid, &hasher](std::thread::id tid,
+                                               bool* isForwarding,
+                                               const std::string& layerName) {
+        if (curTid != hasher(tid)) {
+          if (curTid != -1UL) {
+            std::cerr << std::endl;
+          }
+          curTid = hasher(tid);
+          std::cerr << "Thread [" << tid << "] ";
+          if (isForwarding) {
+            std::cerr << (*isForwarding ? "Forwarding " : "Backwarding ");
+          }
+        }
+        std::cerr << layerName << ", ";
+      }, FLAGS_layer_stack_error_only_current_thread);
+      std::cerr << std::endl;
+    }
+    std::cerr.write(data, sz);
+  });
+}
+
 }  // namespace paddle
diff --git a/paddle/utils/CustomStackTrace.h b/paddle/utils/CustomStackTrace.h
index e1b2d2d8e5ee6ce572b10b94a42fb285078dddc1..774c4db2b9be40c38286ef1248bf77746949fd6b 100644
--- a/paddle/utils/CustomStackTrace.h
+++ b/paddle/utils/CustomStackTrace.h
@@ -15,6 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <stack>
+#include <thread>
+#include <unordered_map>
+#include <functional>
 
 #include "ThreadLocal.h"
 
@@ -29,25 +32,18 @@ namespace paddle {
 * @code{.cpp}
 *
 * paddle::CustomStackTrace<std::string> stack;
- * PASS_TEST=0;
 * for (auto& layer : layers){
 *   stack.push(layer->getName());
- *   layer->forward(passType);
+ *   layer->forward();
 * }
- * for (auto& layer : layers){
+ *
+ * stack.pop("");  // mark under pop stage.
+ *
+ * for (auto it = layers.rbegin(); it != layers.rend(); ++it){
+ *   auto& layer = *it;
 *   layer->backward(passType);
 *   stack.pop(layer->getName());
 * }
- *
- * if(passType == PASS_TEST) {
- *   stack.clear();
- * }
- * else {
- *   stack.dump([](const std::string& layername){
- *     LOG(INFO) << "LayerName: " << layername;
- *   })
- * }
- *
 *
 * @endcode
 */
@@ -55,45 +51,141 @@ template <typename T> class CustomStackTrace{
 public:
   /**
-   * @brief Pop out an item from the top of the stack. For safety the item
-   *        will be poped should equal to ip.
+   * @brief Pop out an item from the top of the stack if item == top.
+   *        Else, just set status to popping.
    */
-  void pop(const T& ip) {
-    auto& p = *logstack_;
-    CHECK_EQ(ip, p.top());
-    p.pop();
+  void pop(const T& item) {
+    pushing() = false;
+    auto& s = this->stack();
+    if (item == s.top()) {
+      s.pop();
+    }
   }
+
   /**
-   * @brief Empty the stack by sequence from top to button.
-   * @param[in] callback A function deal with each item while dumping.
-   *            It must have and only have a in parameter which is the stack item.
+   * @brief clear current thread stack.
    */
-  template <typename Callback>
-  void dump(Callback callback) {
-    auto& p = *logstack_;
-    while (!p.empty()) {
-      callback(p.top());
-      p.pop();
+  void clear() {
+    auto& s = stack();
+    while (!s.empty()) {
+      s.pop();
     }
   }
+
   /**
-   * @brief Only empty the stack.
+   * @brief return true if all thread's stack is empty.
+   * @return true if empty
    */
-  void clear() {
-    dump([](const T& ip){});
+  bool empty() const {
+    std::lock_guard<std::mutex> g(this->mtx_);
+    for (auto p : this->stackBuffers_) {
+      std::stack<T>& s = *p.second;
+      if (!s.empty()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+
+  /**
+   * @brief DumpCallback Type. It will be invoked many times by dump method.
+   *
+   * The first parameter is stack thread id.
+   * The second parameter is the last action of stack is push or not.
+   * The third parameter is the item in stack.
+   */
+  typedef std::function<void(std::thread::id, bool*, const T&)> DumpCallback;
+
+  /**
+   * Dump all thread stack, and all stack will be cleared.
+   */
+  void dump(const DumpCallback& callback, bool onlyCurrentThread = false) {
+    std::lock_guard<std::mutex> g(this->mtx_);
+    for (auto p : this->stackBuffers_) {
+      std::thread::id tid = p.first;
+      if (onlyCurrentThread && tid != std::this_thread::get_id()) {
+        continue;
+      }
+      std::stack<T>& s = *p.second;
+      bool* isPush = nullptr;
+      auto it = this->pushingBuffers_.find(tid);
+      if (it != this->pushingBuffers_.end()) {
+        isPush = it->second;
+      }
+
+      while (!s.empty()) {
+        callback(tid, isPush, s.top());
+        s.pop();
+      }
+    }
   }
+
   /**
-   * @brief Push item ip to the top of the stack.
+   * @brief Push item to current thread stack.
    */
-  void push(const T& ip) {
-    auto& p = *logstack_;
-    p.push(ip);
+  void push(const T& item) {
+    pushing() = true;
+    auto& p = this->stack();
+    p.push(item);
   }
 
 private:
-  ThreadLocalD<std::stack<T> > logstack_;
+  /**
+   * Get thread local attribute, and save them into a map (threadId => TYPE*)
+   *
+   * @tparam TYPE thread local attribute type.
+   * @param threadLocal Thread Local object.
+   * @param buffers a map from threadId to TYPE*
+   */
+  template <typename TYPE>
+  inline TYPE& getThreadLocal(
+      ThreadLocal<TYPE>& threadLocal,
+      std::unordered_map<std::thread::id, TYPE*>& buffers) {
+    TYPE* retv = threadLocal.get(false);
+    if (retv) {
+      return *retv;
+    } else {
+      std::lock_guard<std::mutex> guard(this->mtx_);
+      retv = threadLocal.get();
+      auto id = std::this_thread::get_id();
+      buffers.insert({id, retv});
+      return *retv;
+    }
+  }
+
+  /**
+   * @brief Get thread local stack reference.
+   */
+  std::stack<T>& stack() {
+    return this->getThreadLocal(this->logStack_,
+                                this->stackBuffers_);
+  }
+
+  /**
+   * @brief Get thread local pushing flag.
+   */
+  bool& pushing() {
+    return this->getThreadLocal(this->isPushing_,
+                                this->pushingBuffers_);
+  }
+
+private:
+  mutable std::mutex mtx_;
+
+  std::unordered_map<std::thread::id, std::stack<T>* > stackBuffers_;
+  std::unordered_map<std::thread::id, bool*> pushingBuffers_;
+  ThreadLocal<bool> isPushing_;
+  ThreadLocal<std::stack<T> > logStack_;
 };
 
 extern CustomStackTrace<std::string> gLayerStackTrace;
+/**
+ * @brief Install a failure handler to print layer stack when error.
+ */
+extern void installLayerStackTracer();
+
 }  // namespace paddle
diff --git a/paddle/utils/Locks.cpp b/paddle/utils/Locks.cpp
deleted file mode 100644
index c2f58cf5764ef9d36a40f8fffa11075ed5ced75b..0000000000000000000000000000000000000000
--- a/paddle/utils/Locks.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef __APPLE__ -#include -#endif - -#ifdef __APPLE__ -#ifndef PTHREAD_BARRIER_H_ -#define PTHREAD_BARRIER_H_ - -#include -#include - -typedef int pthread_barrierattr_t; -typedef struct { - pthread_mutex_t mutex; - pthread_cond_t cond; - int count; - int tripCount; -} pthread_barrier_t; - -int pthread_barrier_init(pthread_barrier_t *barrier, - const pthread_barrierattr_t *attr, unsigned int count) { - if (count == 0) { - errno = EINVAL; - return -1; - } - if (pthread_mutex_init(&barrier->mutex, 0) < 0) { - return -1; - } - if (pthread_cond_init(&barrier->cond, 0) < 0) { - pthread_mutex_destroy(&barrier->mutex); - return -1; - } - barrier->tripCount = count; - barrier->count = 0; - - return 0; -} - -int pthread_barrier_destroy(pthread_barrier_t *barrier) { - pthread_cond_destroy(&barrier->cond); - pthread_mutex_destroy(&barrier->mutex); - return 0; -} - -int pthread_barrier_wait(pthread_barrier_t *barrier) { - pthread_mutex_lock(&barrier->mutex); - ++(barrier->count); - if (barrier->count >= barrier->tripCount) { - barrier->count = 0; - pthread_cond_broadcast(&barrier->cond); - pthread_mutex_unlock(&barrier->mutex); - return 1; - } else { - pthread_cond_wait(&barrier->cond, &(barrier->mutex)); - pthread_mutex_unlock(&barrier->mutex); - return 0; - } -} - -#endif // PTHREAD_BARRIER_H_ - -typedef int pthread_spinlock_t; - -int pthread_spin_init(pthread_spinlock_t *lock, int pshared) { - __asm__ __volatile__("" ::: "memory"); - *lock = 0; - return 0; -} - -int pthread_spin_destroy(pthread_spinlock_t *lock) { - return 0; -} - -int pthread_spin_lock(pthread_spinlock_t *lock) { - while (1) { - int i; - for (i=0; i < 10000; i++) { - if (__sync_bool_compare_and_swap(lock, 0, 1)) { - return 0; - } - } - sched_yield(); - } -} - -int pthread_spin_unlock(pthread_spinlock_t *lock) { - __asm__ __volatile__("" ::: "memory"); - *lock = 0; - return 0; -} - -#endif // __APPLE__ diff --git a/paddle/utils/Locks.h b/paddle/utils/Locks.h index e7b0b77081f3695e4dad33068c2e3fdf65f631eb..1fc0363d34597c9447996479aaf771e46d0ba600 100644 --- a/paddle/utils/Locks.h +++ b/paddle/utils/Locks.h @@ -16,56 +16,11 @@ limitations under the License. 
*/ #pragma once #include -#include #include -#include - #include #include -#ifdef __APPLE__ -#include -#endif - -#ifdef __APPLE__ -#ifndef PTHREAD_BARRIER_H_ -#define PTHREAD_BARRIER_H_ - -#include -#include - -typedef int pthread_barrierattr_t; -typedef struct { - pthread_mutex_t mutex; - pthread_cond_t cond; - int count; - int tripCount; -} pthread_barrier_t; - - -extern int pthread_barrier_init(pthread_barrier_t *barrier, - const pthread_barrierattr_t *attr, - unsigned int count); - -extern int pthread_barrier_destroy(pthread_barrier_t *barrier); - -extern int pthread_barrier_wait(pthread_barrier_t *barrier); - -#endif // PTHREAD_BARRIER_H_ - -typedef int pthread_spinlock_t; - -extern int pthread_spin_init(pthread_spinlock_t *lock, int pshared); - -extern int pthread_spin_destroy(pthread_spinlock_t *lock); - -extern int pthread_spin_lock(pthread_spinlock_t *lock); - -extern int pthread_spin_unlock(pthread_spinlock_t *lock); - -#endif - - +#include "DisableCopy.h" namespace paddle { @@ -142,58 +97,44 @@ protected: * which means it will keep trying to lock until lock on successfully. * The SpinLock disable copy. */ +class SpinLockPrivate; class SpinLock { public: - SpinLock() { pthread_spin_init(&lock_, 0); } - ~SpinLock() { pthread_spin_destroy(&lock_); } - SpinLock(const SpinLock&) = delete; - SpinLock& operator=(const SpinLock&) = delete; + DISABLE_COPY(SpinLock); + SpinLock(); + ~SpinLock(); // std::mutext interface - void lock() { pthread_spin_lock(&lock_); } - void unlock() { pthread_spin_unlock(&lock_); } + void lock(); + void unlock(); -protected: - pthread_spinlock_t lock_; - char padding_[64 - sizeof(pthread_spinlock_t)]; +private: + SpinLockPrivate* m; }; /** * A simple wapper of semaphore which can only be shared in the same process. */ - -#ifdef __APPLE__ - +class SemaphorePrivate; class Semaphore { public: - explicit Semaphore(int initValue = 0) { - sem_ = dispatch_semaphore_create(initValue); - } - - ~Semaphore() { dispatch_release(sem_); } - bool timeWait(struct timespec* ts) { - dispatch_time_t m = dispatch_walltime(ts, 0); - return (0 == dispatch_semaphore_wait(sem_, m)); - } - void wait() { dispatch_semaphore_wait(sem_, DISPATCH_TIME_FOREVER); } - void post() { dispatch_semaphore_signal(sem_);} - -protected: - dispatch_semaphore_t sem_; -}; + //! Disable copy & assign + Semaphore(const Semaphore& other) = delete; + Semaphore& operator= (const Semaphore&& other) = delete; -#else + //! Enable move. + Semaphore(Semaphore&& other): m(std::move(other.m)) { + } -class Semaphore { public: /** * @brief Construct Function. * @param[in] initValue the initial value of the * semaphore, default 0. */ - explicit Semaphore(int initValue = 0) { sem_init(&sem_, 0, initValue); } + explicit Semaphore(int initValue = 0); - ~Semaphore() { sem_destroy(&sem_); } + ~Semaphore(); /** * @brief The same as wait(), except if the decrement can not @@ -203,43 +144,38 @@ public: * @return ture if the decrement proceeds before ts, * else return false. */ - bool timeWait(struct timespec* ts) { return (0 == sem_timedwait(&sem_, ts)); } + bool timeWait(struct timespec* ts); /** * @brief decrement the semaphore. If the semaphore's value is 0, then call blocks. */ - void wait() { sem_wait(&sem_); } + void wait(); /** * @brief increment the semaphore. If the semaphore's value * greater than 0, wake up a thread blocked in wait(). 
*/ - void post() { sem_post(&sem_); } + void post(); -protected: - sem_t sem_; +private: + SemaphorePrivate* m; }; -#endif - -static_assert(sizeof(SpinLock) == 64, "Wrong padding"); - /** * A simple wrapper of thread barrier. * The ThreadBarrier disable copy. */ +class ThreadBarrierPrivate; class ThreadBarrier { public: + DISABLE_COPY(ThreadBarrier); + /** * @brief Construct Function. Initialize the barrier should * wait for count threads in wait(). */ - explicit ThreadBarrier(int count) { - pthread_barrier_init(&barrier_, NULL, count); - } - ~ThreadBarrier() { pthread_barrier_destroy(&barrier_); } - ThreadBarrier(const ThreadBarrier&) = delete; - ThreadBarrier& operator=(const ThreadBarrier&) = delete; + explicit ThreadBarrier(int count); + ~ThreadBarrier(); /** * @brief . @@ -247,10 +183,10 @@ public: * then wake up all the count - 1 threads and continue run together. * Else block the thread until waked by other thread . */ - void wait() { pthread_barrier_wait(&barrier_); } + void wait(); -protected: - pthread_barrier_t barrier_; +private: + ThreadBarrierPrivate* m; }; /** diff --git a/paddle/utils/Stat.cpp b/paddle/utils/Stat.cpp index aab5446a98c0bae3f03b4b319836721bdd1112da..ff6e8ade2cd48f85c3f1a0c2acd5542097aa4117 100644 --- a/paddle/utils/Stat.cpp +++ b/paddle/utils/Stat.cpp @@ -25,7 +25,7 @@ namespace paddle { pid_t getTID() { #if defined(__APPLE__) || defined(__OSX__) pid_t tid = syscall(SYS_thread_selfid); - #elif defined(__LINUX__) + #else #ifndef __NR_gettid #define __NR_gettid 224 #endif diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp index 1c1d75dc5bed98848fcb03366b383201ee6f5024..d8c3376fb18c48185abdcb7a6d65fa56f0eaa290 100644 --- a/paddle/utils/Util.cpp +++ b/paddle/utils/Util.cpp @@ -129,13 +129,7 @@ void runInitFunctions() { void initMain(int argc, char** argv) { initializeLogging(argc, argv); - logging::installFailureWriter([](const char* data, int sz) { - std::cerr << "Current Layer forward/backward stack is " << std::endl; - gLayerStackTrace.dump([](const std::string& layername){ - std::cerr << "LayerName: " << layername << std::endl; - }); - std::cerr.write(data, sz); - }); + installLayerStackTracer(); std::string line; for (int i = 0; i < argc; ++i) { line += argv[i]; diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp new file mode 100644 index 0000000000000000000000000000000000000000..347ae64c26dfdfcdaff62886481c20e9c4c7cfec --- /dev/null +++ b/paddle/utils/arch/linux/Locks.cpp @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/utils/Locks.h" +#include +#include + +namespace paddle { +class SemaphorePrivate { +public: + sem_t sem; +}; + +Semaphore::Semaphore(int initValue): m(new SemaphorePrivate()) { + sem_init(&m->sem, 0, initValue); +} + +Semaphore::~Semaphore() { + sem_destroy(&m->sem); +} + +bool Semaphore::timeWait(struct timespec* ts) { + return (0 == sem_timedwait(&m->sem, ts)); +} + +void Semaphore::wait() { + sem_wait(&m->sem); +} + +void Semaphore::post() { + sem_post(&m->sem); +} + + +class SpinLockPrivate { +public: + inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); } + inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); } + pthread_spinlock_t lock_; + char padding_[64 - sizeof(pthread_spinlock_t)]; +}; + +SpinLock::SpinLock():m(new SpinLockPrivate()) {} + + +SpinLock::~SpinLock() { delete m; } + +void SpinLock::lock() { + pthread_spin_lock(&m->lock_); +} + +void SpinLock::unlock() { + pthread_spin_unlock(&m->lock_); +} + +class ThreadBarrierPrivate { +public: + pthread_barrier_t barrier_; +}; + +ThreadBarrier::ThreadBarrier(int count): m(new ThreadBarrierPrivate()) { + pthread_barrier_init(&m->barrier_, nullptr, count); +} + +ThreadBarrier::~ThreadBarrier() { + pthread_barrier_destroy(&m->barrier_); + delete m; +} + +void ThreadBarrier::wait() { + pthread_barrier_wait(&m->barrier_); +} + +} // namespace paddle diff --git a/paddle/utils/arch/osx/Locks.cpp b/paddle/utils/arch/osx/Locks.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5e0411624fd60cd441d251c14084cd0a1ca42bb5 --- /dev/null +++ b/paddle/utils/arch/osx/Locks.cpp @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/utils/Locks.h"
+#include "paddle/utils/Logging.h"
+#include <dispatch/dispatch.h>
+#include <libkern/OSAtomic.h>
+namespace paddle {
+class SemaphorePrivate {
+public:
+  ~SemaphorePrivate() {
+    dispatch_release(sem);
+  }
+
+  dispatch_semaphore_t sem;
+};
+
+Semaphore::Semaphore(int initValue): m(new SemaphorePrivate()) {
+  m->sem = dispatch_semaphore_create(initValue);
+}
+
+Semaphore::~Semaphore() {
+  delete m;
+}
+
+bool Semaphore::timeWait(timespec *ts) {
+  dispatch_time_t tm = dispatch_walltime(ts, 0);
+  return (0 == dispatch_semaphore_wait(m->sem, tm));
+}
+
+void Semaphore::wait() {
+  dispatch_semaphore_wait(m->sem, DISPATCH_TIME_FOREVER);
+}
+
+void Semaphore::post() {
+  dispatch_semaphore_signal(m->sem);
+}
+
+class SpinLockPrivate {
+public:
+  SpinLockPrivate(): lock_(0) {}
+
+  OSSpinLock lock_;
+  char padding_[64 - sizeof(OSSpinLock)];  // Padding to cache line size
+};
+
+SpinLock::SpinLock(): m(new SpinLockPrivate()) {}
+SpinLock::~SpinLock() { delete m; }
+
+void SpinLock::lock() {
+  OSSpinLockLock(&m->lock_);
+}
+
+void SpinLock::unlock() {
+  OSSpinLockUnlock(&m->lock_);
+}
+
+
+class ThreadBarrierPrivate {
+public:
+  pthread_mutex_t mutex;
+  pthread_cond_t cond;
+  int count;
+  int tripCount;
+
+  inline explicit ThreadBarrierPrivate(int cnt):count(0), tripCount(cnt) {
+    CHECK_NE(cnt, 0);
+    CHECK_GE(pthread_mutex_init(&mutex, 0), 0);
+    CHECK_GE(pthread_cond_init(&cond, 0), 0);
+  }
+
+  inline ~ThreadBarrierPrivate() {
+    pthread_cond_destroy(&cond);
+    pthread_mutex_destroy(&mutex);
+  }
+
+  /**
+   * @brief wait
+   * @return true if the last wait
+   */
+  inline bool wait() {
+    pthread_mutex_lock(&mutex);
+    ++count;
+    if (count >= tripCount) {
+      count = 0;
+      pthread_cond_broadcast(&cond);
+      pthread_mutex_unlock(&mutex);
+      return true;
+    } else {
+      pthread_cond_wait(&cond, &mutex);
+      pthread_mutex_unlock(&mutex);
+      return false;
+    }
+  }
+};
+
+ThreadBarrier::ThreadBarrier(int count): m(new ThreadBarrierPrivate(count)) {}
+ThreadBarrier::~ThreadBarrier() { delete m; }
+void ThreadBarrier::wait() { m->wait(); }
+
+}  // namespace paddle
diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt
index 147ee3f6d6d86775f2f8c7839c79180f1daffa76..5b31cd393dd1fc319be0ae9a5811f5637617e08d 100644
--- a/paddle/utils/tests/CMakeLists.txt
+++ b/paddle/utils/tests/CMakeLists.txt
@@ -2,3 +2,15 @@ add_simple_unittest(test_CommandLineParser)
 add_simple_unittest(test_Logging)
 add_simple_unittest(test_Thread)
 add_simple_unittest(test_StringUtils)
+add_simple_unittest(test_CustomStackTrace)
+
+add_executable(
+    test_CustomStackTracePrint
+    test_CustomStackTracePrint.cpp
+)
+link_paddle_exe(test_CustomStackTracePrint)
+if(NOT APPLE)
+    add_test(NAME test_CustomStackTracePrint
+        COMMAND ${PROJ_ROOT}/paddle/utils/tests/test_CustomStackTracePrint.sh
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif()
diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..26ca4c678a650df50d372b0fbb4c3e03d52f91df
--- /dev/null
+++ b/paddle/utils/tests/test_CustomStackTrace.cpp
@@ -0,0 +1,95 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <thread>
+
+#include "paddle/utils/CustomStackTrace.h"
+#include "paddle/utils/CommandLineParser.h"
+#include "paddle/utils/Util.h"
+#include "paddle/utils/Locks.h"
+
+P_DEFINE_int32(test_thread_num, 10, "testing thread number");
+
+void testNormalImpl(const std::function<void(paddle::CustomStackTrace<std::string>&,
+                        size_t, size_t,
+                        paddle::ThreadBarrier&,
+                        paddle::ThreadBarrier&)>& callback) {
+  paddle::CustomStackTrace<std::string> tracer;
+  paddle::ThreadBarrier doneBarrier(FLAGS_test_thread_num + 1);
+  paddle::ThreadBarrier startBarrier(FLAGS_test_thread_num + 1);
+  constexpr size_t countDown = 10;
+  constexpr size_t layerSize = 1000;
+  std::vector<std::unique_ptr<std::thread>> threads;
+  threads.reserve(FLAGS_test_thread_num);
+
+  for (int32_t i=0; i < FLAGS_test_thread_num; ++i) {
+    threads.emplace_back(new std::thread([&tracer, &countDown, &layerSize,
+                                          &startBarrier, &doneBarrier,
+                                          &callback]{
+      callback(tracer, countDown, layerSize, startBarrier, doneBarrier);
+    }));
+  }
+  size_t cntDown = countDown;
+  while (cntDown-- > 0) {
+    startBarrier.wait();
+    doneBarrier.wait();
+    ASSERT_TRUE(tracer.empty());
+  }
+
+  for (auto& thread : threads) {
+    thread->join();
+  }
+}
+
+
+TEST(CustomStackTrace, normalTrain) {
+  testNormalImpl([](paddle::CustomStackTrace<std::string>& tracer,
+      size_t countDown, size_t layerSize,
+      paddle::ThreadBarrier& start, paddle::ThreadBarrier& finish){
+    while (countDown-- > 0) {
+      start.wait();
+      for (size_t i=0; i < layerSize; ++i) {
+        tracer.push("layer_" + std::to_string(i));
+      }
+      tracer.pop("");
+      for (size_t i=0; i < layerSize; ++i) {
+        tracer.pop("layer_" + std::to_string(layerSize - 1 - i));
+      }
+      finish.wait();
+    }
+  });
+}
+
+TEST(CustomStackTrace, normalTest) {
+  testNormalImpl([] (paddle::CustomStackTrace<std::string>& tracer,
+      size_t countDown, size_t layerSize,
+      paddle::ThreadBarrier& start, paddle::ThreadBarrier& finish){
+    while (countDown-- > 0) {
+      start.wait();
+      for (size_t i=0; i < layerSize; ++i) {
+        tracer.push("layer_" + std::to_string(i));
+      }
+      tracer.clear();  // in forward test, tracer will clear after forward.
+      finish.wait();
+    }
+  });
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  paddle::initMain(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/utils/tests/test_CustomStackTracePrint.cpp b/paddle/utils/tests/test_CustomStackTracePrint.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c19c98614e6a7d6285990aa19849131579f7307b
--- /dev/null
+++ b/paddle/utils/tests/test_CustomStackTracePrint.cpp
@@ -0,0 +1,29 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/utils/Util.h"
+#include "paddle/utils/CustomStackTrace.h"
+
+int main(int argc, char** argv) {
+  paddle::initMain(argc, argv);
+
+  for (size_t i=0; i < 1000; ++i) {
+    paddle::gLayerStackTrace.push("layer_" + std::to_string(i));
+    if (i == 998) {
+      throw "Unhandled exception";
+    }
+  }
+
+  return 0;
+}
diff --git a/paddle/utils/tests/test_CustomStackTracePrint.sh b/paddle/utils/tests/test_CustomStackTracePrint.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b5543485f365adee49629578d470a14e0c742547
--- /dev/null
+++ b/paddle/utils/tests/test_CustomStackTracePrint.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+echo "Test Custom Stack Trace print correct result when fail"
+./test_CustomStackTracePrint >customStackTraceLog 2>&1
+if [ $? -eq 0 ]; then
+  exit 1
+else
+  set -e
+  TEXT=""
+  for ((i=0; i<=998; i++))
+  do
+    TEXT="layer_$i, "$TEXT
+  done
+  TEXT="Forwarding "$TEXT
+  grep -q "$TEXT" customStackTraceLog
+fi
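
Note on the CustomStackTrace rework in this patch: the class now keeps one std::stack per thread through ThreadLocal, registers every thread's stack in a mutex-guarded map, and dump() walks either all registered stacks or only the calling thread's. The standalone sketch below illustrates that per-thread bookkeeping with nothing but the standard library; the class and member names are hypothetical, and it takes a single mutex around every operation instead of the patch's ThreadLocal scheme, which only locks when a thread registers its stack for the first time or when dump() runs.

// Simplified illustration (not the Paddle class): each thread gets its own
// stack inside a shared registry, and a failure handler can dump them all.
#include <iostream>
#include <mutex>
#include <stack>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>

class ThreadStackRegistry {  // hypothetical name
public:
  void push(const std::string& item) {
    std::lock_guard<std::mutex> g(mtx_);
    stacks_[std::this_thread::get_id()].push(item);
  }
  void pop(const std::string& item) {
    std::lock_guard<std::mutex> g(mtx_);
    auto& s = stacks_[std::this_thread::get_id()];
    if (!s.empty() && s.top() == item) {
      s.pop();
    }
  }
  // Dump and clear every thread's stack, newest entry first.
  void dump(std::ostream& os) {
    std::lock_guard<std::mutex> g(mtx_);
    for (auto& kv : stacks_) {
      os << "Thread [" << kv.first << "] ";
      while (!kv.second.empty()) {
        os << kv.second.top() << ", ";
        kv.second.pop();
      }
      os << "\n";
    }
  }

private:
  std::mutex mtx_;
  std::unordered_map<std::thread::id, std::stack<std::string>> stacks_;
};

int main() {
  ThreadStackRegistry trace;
  std::vector<std::thread> workers;
  for (int t = 0; t < 2; ++t) {
    workers.emplace_back([&trace, t] {
      for (int i = 0; i < 3; ++i) {
        trace.push("layer_" + std::to_string(t) + "_" + std::to_string(i));
      }
    });
  }
  for (auto& w : workers) {
    w.join();
  }
  trace.dump(std::cerr);  // e.g. "Thread [...] layer_0_2, layer_0_1, layer_0_0,"
  return 0;
}

The real class additionally records whether the last action on each thread's stack was a push, which is what lets installLayerStackTracer() label a dumped stack as "Forwarding" or "Backwarding".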
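
The Locks.h rewrite hides every platform primitive behind an opaque pointer: SpinLock, Semaphore and ThreadBarrier now hold only a SpinLockPrivate* / SemaphorePrivate* / ThreadBarrierPrivate*, with the pthread-based definitions in arch/linux/Locks.cpp and the dispatch-semaphore/OSSpinLock definitions in arch/osx/Locks.cpp. Below is a minimal sketch of that pimpl split with hypothetical names and std::mutex standing in for the platform backend; it illustrates the pattern, not the project's code.

// Pimpl sketch: the public type exposes no platform headers; each platform
// provides its own Private definition in a separate translation unit.
#include <mutex>

// --- what would normally sit in a header (names are hypothetical) ---
class MyLockPrivate;  // only declared here
class MyLock {
public:
  MyLock();
  ~MyLock();
  MyLock(const MyLock&) = delete;
  MyLock& operator=(const MyLock&) = delete;
  void lock();
  void unlock();

private:
  MyLockPrivate* m;  // opaque pointer, like the `m` member in Locks.h
};

// --- one possible backend (would normally sit in a per-platform .cpp) ---
class MyLockPrivate {
public:
  std::mutex mtx;
};

MyLock::MyLock() : m(new MyLockPrivate()) {}
MyLock::~MyLock() { delete m; }
void MyLock::lock() { m->mtx.lock(); }
void MyLock::unlock() { m->mtx.unlock(); }

int main() {
  MyLock lk;
  lk.lock();    // MyLock satisfies BasicLockable, so std::lock_guard<MyLock>
  lk.unlock();  // would also work here.
  return 0;
}

The cost of the pattern is one heap allocation and an extra indirection per object; the benefit, as in this patch, is that Locks.h no longer needs #ifdef __APPLE__ blocks or a hand-rolled pthread_barrier emulation in the header.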
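
On the posix_memalign changes in Allocator.h and the tests: unlike malloc, posix_memalign reports failure through its return value (0 on success, an error number such as EINVAL or ENOMEM otherwise) and leaves the output pointer unspecified on failure, so wrapping the call in CHECK_EQ(..., 0) is the reliable way to catch a failed aligned allocation. A small standalone sketch of the same check, using a plain conditional in place of glog's CHECK_EQ:

#include <stdio.h>
#include <stdlib.h>

int main() {
  void* ptr = nullptr;
  // 32-byte alignment (a power of two and a multiple of sizeof(void*)),
  // 1 KiB block; the return code is the only trustworthy failure signal.
  int rc = posix_memalign(&ptr, 32, 1024);
  if (rc != 0) {
    fprintf(stderr, "posix_memalign failed with error %d\n", rc);
    return 1;
  }
  printf("aligned block at %p\n", ptr);
  free(ptr);
  return 0;
}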