diff --git a/doc/ui/api/trainer_config_helpers/layers.rst b/doc/ui/api/trainer_config_helpers/layers.rst
index 1583fce981fed64141acdccc0d89b46b63d13cc0..f902d1c995bc5045d62d0b2e279ee612f9dc7c93 100644
--- a/doc/ui/api/trainer_config_helpers/layers.rst
+++ b/doc/ui/api/trainer_config_helpers/layers.rst
@@ -245,10 +245,10 @@ addto_layer
     :members: addto_layer
     :noindex:
 
-convex_comb_layer
+linear_comb_layer
 -----------------
 ..  automodule:: paddle.trainer_config_helpers.layers
-    :members: convex_comb_layer
+    :members: linear_comb_layer
     :noindex:
 
 interpolation_layer
@@ -280,7 +280,13 @@ tensor_layer
 ..  automodule:: paddle.trainer_config_helpers.layers
     :members: tensor_layer
     :noindex:
-    
+
+cos_sim
+-------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: cos_sim
+    :noindex:
+
 trans_layer
 ------------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -341,12 +347,6 @@ rank_cost
     :members: rank_cost
     :noindex:
 
-cos_sim
--------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: cos_sim
-    :noindex:
-
 crf_layer
 -----------------
 ..  automodule:: paddle.trainer_config_helpers.layers
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index 19c94b2453981301bcb632ecbe5d322369009973..c2dce1977bdf5daefb6c5b8032bb6b12563e9425 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -150,7 +150,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP)
 
 
 // APIs available after R4:
-#if CUDNN_VERSION >= 4000
+#if CUDNN_VERSION >= 4007
 #define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro)             \
   __macro(cudnnBatchNormalizationForwardTraining)            \
   __macro(cudnnBatchNormalizationForwardInference)           \
@@ -999,7 +999,7 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
                                     double epsilon,
                                     real *savedMean,
                                     real *savedVar) {
-#if CUDNN_VERSION >= 4000
+#if CUDNN_VERSION >= 4007
   if ((NULL != runningMean && NULL == runningInvVar) ||
       (NULL == runningMean && NULL != runningInvVar)) {
     LOG(FATAL) << "runningMean and runningInvVar can be NULL "
@@ -1024,7 +1024,7 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
 
   CHECK_SYNC("hl_batch_norm_forward_training failed");
 #else
-  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4000. "
+  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
              << "But cudnn lib version is " << g_cudnn_lib_version;
 #endif
 }
@@ -1039,7 +1039,7 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
                                     real *estimatedMean,
                                     real *estimatedInvVar,
                                     double epsilon) {
-#if CUDNN_VERSION >= 4000
+#if CUDNN_VERSION >= 4007
   cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
   cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc);
   cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(bnParamDesc);
@@ -1053,7 +1053,7 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
 
   CHECK_SYNC("hl_batch_norm_forward_inference failed");
 #else
-  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4000. "
+  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
              << "But cudnn lib version is " << g_cudnn_lib_version;
 #endif
 }
@@ -1071,7 +1071,7 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
                             double epsilon,
                             real *savedMean,
                             real *savedInvVar) {
-#if CUDNN_VERSION >= 4000
+#if CUDNN_VERSION >= 4007
   if ((NULL != savedMean && NULL == savedInvVar) ||
       (NULL == savedMean && NULL != savedInvVar)) {
     LOG(FATAL) << "savedMean and savedVar can be NULL "
@@ -1087,16 +1087,14 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
   cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
   CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(
               t_resource.cudnn_handle, mode, &alpha, &beta,
-#if CUDNN_VERSION >= 5000
               &alpha, &beta,
-#endif
               xDesc, input, dyDesc, outGrad, dxDesc, inGrad,
               bnDesc, scale, scaleGrad, biasGrad, epsilon,
               savedMean, savedInvVar));
 
   CHECK_SYNC("hl_batch_norm_backward failed");
 #else
-  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4000. "
+  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
              << "But cudnn lib version is " << g_cudnn_lib_version;
 #endif
 }
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index fca52828957a2da42238c9f945f5126beea95008..eb1522a178d48c1d71b5b4a63ce73f65e1167288 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -277,6 +277,7 @@ void NeuralNetwork::getState(MachineState& machineState) {
 }
 
 void NeuralNetwork::backward(const UpdateCallback& callback) {
+  gLayerStackTrace.pop("");  // tell layer trace is during backward.
   FOR_EACH_R(layer, layers_) {
     REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str());
     if ((*layer)->needGradient()) {
diff --git a/paddle/gserver/layers/ConvexCombinationLayer.cpp b/paddle/gserver/layers/ConvexCombinationLayer.cpp
index e092b2e390f37cd322db8bed8273f561fa979791..a81cf939af671f3fb34fb52ae33035a7bb524aed 100644
--- a/paddle/gserver/layers/ConvexCombinationLayer.cpp
+++ b/paddle/gserver/layers/ConvexCombinationLayer.cpp
@@ -21,18 +21,20 @@ limitations under the License. */
 namespace paddle {
 
 /**
- * @brief A layer for convex weighted average of vectors,
+ * @brief A layer for weighted sum of vectors,
  * which is used in NEURAL MACHINE TRANSLATION BY JOINTLY LEARNING TO ALIGN AND
  * TRANSLATE
- * - Input: the first input contains the convex weights (batchSize x weightDim),
- *          and the shape of second input is (batchSize x (weightdim*dataDim)).
- * - Output: the shape of output is (batchSize x dataDim).
+ * - Input: the the size of the first input is weightDim,
+ *          and the size of the second input is weightdim * dataDim.
+ * - Output: the sizeof the output is dataDim
  * \f[
- *   out[i][j] = \sum_{j}(in0(i, j) * in1(i,j + i * dataDim)),
- *               i = 0,1,...,(batchSize-1); j = 0, 1,...,(dataDim-1)
+ *   out(j) = \sum_{i}(in0(i) * in1(i,j + i * dataDim)),
+ *               i = 0,1,...,(weightDim-1); j = 0, 1,...,(dataDim-1)
  * \f]
+ * Note that the above computation is for one sample. Multiple samples are
+ * processed in one batch.
  *
- * The config file api is convex_comb_layer.
+ * The config file api is linear_comb_layer.
  */
 class ConvexCombinationLayer : public Layer {
 protected:
diff --git a/paddle/gserver/layers/CosSimLayer.cpp b/paddle/gserver/layers/CosSimLayer.cpp
index b10bd1d886ecf42170914c619b7b4040d984501d..05a70aeff5e8ff3789bca966d351bffc8efb1cb3 100644
--- a/paddle/gserver/layers/CosSimLayer.cpp
+++ b/paddle/gserver/layers/CosSimLayer.cpp
@@ -48,7 +48,7 @@ void CosSimLayer::forward(PassType passType) {
     REGISTER_TIMER_INFO("CosFwAtvTimer", getName().c_str());
     MatrixPtr prevOut1 = getInputValue(0);
     MatrixPtr prevOut2 = getInputValue(1);
-    outV->cosSim(*prevOut1, *prevOut2, kCosSimScale_);
+    outV->cosSim(*prevOut1, *prevOut2, config_.cos_scale());
   }
 }
 
@@ -59,7 +59,7 @@ void CosSimLayer::backward(const UpdateCallback& callback) {
 
     outG->cosSimDerivative(*this->getOutputValue(), *getInputValue(0),
                            *getInputValue(1), *getInputGrad(0),
-                           *getInputGrad(1), kCosSimScale_);
+                           *getInputGrad(1), config_.cos_scale());
   }
 }
 
diff --git a/paddle/gserver/layers/CosSimLayer.h b/paddle/gserver/layers/CosSimLayer.h
index 9b0e53335b2503513ce11a4ab19f2199acfee499..65eb807ab2e6f16aab5ef2a9b08d697868c743a3 100644
--- a/paddle/gserver/layers/CosSimLayer.h
+++ b/paddle/gserver/layers/CosSimLayer.h
@@ -36,7 +36,7 @@ namespace paddle {
 class CosSimLayer : public Layer {
 public:
   explicit CosSimLayer(const LayerConfig& config)
-      : Layer(config), kCosSimScale_(5.0f) {}
+      : Layer(config) {}
 
   ~CosSimLayer() {}
 
@@ -44,8 +44,6 @@ public:
 
   void forward(PassType passType);
   void backward(const UpdateCallback& callback = nullptr);
-
-  const real kCosSimScale_;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
index cef8772fc254f98d676e6fb89042487315280c61..3c6d13b0bf92ea98eb5c3331a1fdff6b177529b6 100644
--- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
@@ -115,29 +115,11 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
     create(tmpBiasGrad_, 1, channels_, &betaGrad);
   }
 
-  // because of the different api of cudnn v4 and v5.
-  if (hl_get_cudnn_lib_version() < 5000) {
-    if (weight_->getWGrad()) {
-      create(tmpWGrad_, 1, channels_, &gammaGrad);
-    }
-    if (biases_ && biases_->getWGrad()) {
-      create(tmpBiasGrad_, 1, channels_, &betaGrad);
-    }
-  }
-
   hl_batch_norm_backward(ioDesc_, input, ioDesc_, outGrad,
                          ioDesc_, inGrad, bnParamDesc_,
                          gamma, gammaGrad, betaGrad,
                          EPS, savedMean, savedInvVar);
 
-  // because of the different api of cudnn v4 and v5.
-  if (hl_get_cudnn_lib_version() < 5000) {
-    if (weight_->getWGrad() && biases_->getWGrad()) {
-      weight_->getWGrad()->add(*tmpWGrad_);
-      biases_->getWGrad()->add(*tmpBiasGrad_);
-    }
-  }
-
   {
     REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
     biases_->getParameterPtr()->incUpdate(callback);
diff --git a/paddle/utils/CustomStackTrace.cpp b/paddle/utils/CustomStackTrace.cpp
index 50d7f5402f586771194fa5b1578293b7614ea1f2..232a478ecd93a7dcb7da7b02a5a1af37a1d1bc43 100644
--- a/paddle/utils/CustomStackTrace.cpp
+++ b/paddle/utils/CustomStackTrace.cpp
@@ -14,9 +14,44 @@ limitations under the License. */
 
 
 #include "CustomStackTrace.h"
+#include "CommandLineParser.h"
+#include <iostream>
+
+P_DEFINE_bool(layer_stack_error_only_current_thread,
+    true,
+    "Dump current thread or whole process layer stack when signal error "
+    "occurred. true means only dump current thread layer stack");
 
 namespace paddle {
 
 CustomStackTrace<std::string> gLayerStackTrace;
 
+static std::mutex gLayerStackTraceMtx;
+void installLayerStackTracer() {
+  logging::installFailureWriter([](const char* data, int sz) {
+    std::lock_guard<std::mutex> guard(gLayerStackTraceMtx);
+    if (!gLayerStackTrace.empty()) {
+      size_t curTid = -1UL;
+      std::hash<std::thread::id> hasher;
+      gLayerStackTrace.dump([&curTid, &hasher](std::thread::id tid,
+                            bool* isForwarding,
+                            const std::string& layerName) {
+        if (curTid != hasher(tid)) {
+          if (curTid != -1UL) {
+            std::cerr << std::endl;
+          }
+          curTid = hasher(tid);
+          std::cerr << "Thread [" << tid << "] ";
+          if (isForwarding) {
+            std::cerr << (*isForwarding ? "Forwarding ": "Backwarding ");
+          }
+        }
+        std::cerr << layerName << ", ";
+      }, FLAGS_layer_stack_error_only_current_thread);
+      std::cerr << std::endl;
+    }
+    std::cerr.write(data, sz);
+  });
+}
+
 }  // namespace paddle
diff --git a/paddle/utils/CustomStackTrace.h b/paddle/utils/CustomStackTrace.h
index e1b2d2d8e5ee6ce572b10b94a42fb285078dddc1..774c4db2b9be40c38286ef1248bf77746949fd6b 100644
--- a/paddle/utils/CustomStackTrace.h
+++ b/paddle/utils/CustomStackTrace.h
@@ -15,6 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <stack>
+#include <thread>
+#include <unordered_map>
+#include <functional>
 
 #include "ThreadLocal.h"
 
@@ -29,25 +32,18 @@ namespace paddle {
  * @code{.cpp}
  * 
  * paddle::CustomStackTrace<std::string> stack;
- * PASS_TEST=0;
  * for (auto& layer : layers){
  *   stack.push(layer->getName());
- *   layer->forward(passType);
+ *   layer->forward();
  * }
- * for (auto& layer : layers){
+ *
+ * stack.pop("");  // mark under pop stage.
+ *
+ * for (auto it = layers.rbegin(); it != layers.rend(); ++it){
+ *   auto& layer = *it;
  *   layer->backward(passType);
  *   stack.pop(layer->getName());
  * }
- * 
- * if(passType == PASS_TEST) {
- *   stack.clear();
- * }
- * else {
- *   stack.dump([](const std::string& layername){
- *     LOG(INFO) << "LayerName: " << layername;
- *   })
- * }
- * 
  *
  * @endcode
  */
@@ -55,45 +51,141 @@ template <typename T>
 class CustomStackTrace{
 public:
   /**
-   * @brief Pop out an item from the top of the stack. For safety the item 
-   * will be poped should equal to ip.
+   * @brief Pop out an item from the top of the stack if item == top.
+   *        Else, just set status to popping.
    */
-  void pop(const T& ip) {
-    auto& p = *logstack_;
-    CHECK_EQ(ip, p.top());
-    p.pop();
+  void pop(const T& item) {
+    pushing() = false;
+    auto& s = this->stack();
+    if (item == s.top()) {
+      s.pop();
+    }
   }
+
   /**
-   * @brief Empty the stack by sequence from top to button.
-   * @param[in] callback A function deal with each item while dumping.
-   * It must have and only have a in parameter which is the stack item.
+   * @brief clear current thread stack.
    */
-  template <typename Callback>
-  void dump(Callback callback) {
-    auto& p = *logstack_;
-    while (!p.empty()) {
-      callback(p.top());
-      p.pop();
+  void clear() {
+    auto& s = stack();
+    while (!s.empty()) {
+      s.pop();
     }
   }
+
   /**
-   * @brief Only empty the stack.
+   * @brief return true if all thread's stack is empty.
+   * @return true if empty
    */
-  void clear() {
-    dump([](const T& ip){});
+  bool empty() const {
+    std::lock_guard<std::mutex> g(this->mtx_);
+    for (auto p : this->stackBuffers_) {
+      std::stack<T>& s = *p.second;
+      if (!s.empty()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+
+  /**
+   * @brief DumpCallback Type. It will be invoked many times by dump method.
+   *
+   * The first parameter is stack thread id.
+   * The second parameter is the last action of stack is push or not.
+   * The third parameter is the item in stack.
+   */
+  typedef std::function<void(const std::thread::id& /*threadId*/,
+                              bool* /*isPushing*/,
+                              const T& /*item*/)> DumpCallback;
+
+  /**
+   * Dump all thread stack, and all stack will be cleared.
+   */
+  void dump(const DumpCallback& callback, bool onlyCurrentThread = false) {
+    std::lock_guard<std::mutex> g(this->mtx_);
+    for (auto p : this->stackBuffers_) {
+      std::thread::id tid = p.first;
+      if (onlyCurrentThread && tid != std::this_thread::get_id()) {
+        continue;
+      }
+      std::stack<T>& s = *p.second;
+      bool* isPush = nullptr;
+      auto it = this->pushingBuffers_.find(tid);
+      if (it != this->pushingBuffers_.end()) {
+        isPush = it->second;
+      }
+
+      while (!s.empty()) {
+        callback(tid, isPush, s.top());
+        s.pop();
+      }
+    }
   }
+
   /**
-   * @brief Push item ip to the top of the stack.
+   * @brief Push item to current thread stack.
    */
-  void push(const T& ip) {
-    auto& p = *logstack_;
-    p.push(ip);
+  void push(const T& item) {
+    pushing() = true;
+    auto& p = this->stack();
+    p.push(item);
   }
 
 private:
-  ThreadLocalD<std::stack<T> > logstack_;
+  /**
+   * Get thread local attribute, and save them into a map (threadId => TYPE*)
+   *
+   * @tparam TYPE thread local attribute type.
+   * @param threadLocal Thread Local object.
+   * @param buffers a map from threadId to TYPE*
+   */
+  template <typename TYPE>
+  inline TYPE& getThreadLocal(
+      ThreadLocal<TYPE>& threadLocal,
+      std::unordered_map<std::thread::id, TYPE*>& buffers) {
+    TYPE* retv = threadLocal.get(false);
+    if (retv) {
+      return *retv;
+    } else {
+      std::lock_guard<std::mutex> guard(this->mtx_);
+      retv = threadLocal.get();
+      auto id = std::this_thread::get_id();
+      buffers.insert({id, retv});
+      return *retv;
+    }
+  }
+
+  /**
+   * @brief Get thread local stack reference.
+   */
+  std::stack<T>& stack() {
+    return this->getThreadLocal(this->logStack_,
+                                this->stackBuffers_);
+  }
+
+  /**
+   * @brief Get thread local pushing flag.
+   */
+  bool& pushing() {
+    return this->getThreadLocal(this->isPushing_,
+                                this->pushingBuffers_);
+  }
+
+private:
+  mutable std::mutex mtx_;
+
+  std::unordered_map<std::thread::id, std::stack<T>* > stackBuffers_;
+  std::unordered_map<std::thread::id, bool* > pushingBuffers_;
+  ThreadLocal<bool> isPushing_;
+  ThreadLocal<std::stack<T> > logStack_;
 };
 
 extern CustomStackTrace<std::string> gLayerStackTrace;
 
+/**
+ * @brief Install a failure handler to print layer stack when error.
+ */
+extern void installLayerStackTracer();
+
 }  // namespace paddle
diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp
index 1c1d75dc5bed98848fcb03366b383201ee6f5024..d8c3376fb18c48185abdcb7a6d65fa56f0eaa290 100644
--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
@@ -129,13 +129,7 @@ void runInitFunctions() {
 
 void initMain(int argc, char** argv) {
   initializeLogging(argc, argv);
-  logging::installFailureWriter([](const char* data, int sz) {
-    std::cerr << "Current Layer forward/backward stack is " << std::endl;
-    gLayerStackTrace.dump([](const std::string& layername){
-      std::cerr << "LayerName: " << layername << std::endl;
-    });
-    std::cerr.write(data, sz);
-  });
+  installLayerStackTracer();
   std::string line;
   for (int i = 0; i < argc; ++i) {
     line += argv[i];
diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt
index 147ee3f6d6d86775f2f8c7839c79180f1daffa76..be59a785ecf366dc38a01ac53642eb137abec798 100644
--- a/paddle/utils/tests/CMakeLists.txt
+++ b/paddle/utils/tests/CMakeLists.txt
@@ -2,3 +2,13 @@ add_simple_unittest(test_CommandLineParser)
 add_simple_unittest(test_Logging)
 add_simple_unittest(test_Thread)
 add_simple_unittest(test_StringUtils)
+add_simple_unittest(test_CustomStackTrace)
+
+add_executable(
+    test_CustomStackTracePrint
+    test_CustomStackTracePrint.cpp
+)
+link_paddle_exe(test_CustomStackTracePrint)
+add_test(NAME test_CustomStackTracePrint
+    COMMAND ${PROJ_ROOT}/paddle/utils/tests/test_CustomStackTracePrint.sh
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..26ca4c678a650df50d372b0fbb4c3e03d52f91df
--- /dev/null
+++ b/paddle/utils/tests/test_CustomStackTrace.cpp
@@ -0,0 +1,95 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <chrono>
+
+#include "paddle/utils/CustomStackTrace.h"
+#include "paddle/utils/CommandLineParser.h"
+#include "paddle/utils/Util.h"
+#include "paddle/utils/Locks.h"
+
+P_DEFINE_int32(test_thread_num, 10, "testing thread number");
+
+void testNormalImpl(const std::function<void(
+                      paddle::CustomStackTrace<std::string>&,
+                      size_t, size_t,
+                      paddle::ThreadBarrier&,
+                      paddle::ThreadBarrier&)>& callback) {
+  paddle::CustomStackTrace<std::string> tracer;
+  paddle::ThreadBarrier doneBarrier(FLAGS_test_thread_num + 1);
+  paddle::ThreadBarrier startBarrier(FLAGS_test_thread_num + 1);
+  constexpr size_t countDown = 10;
+  constexpr size_t layerSize = 1000;
+  std::vector<std::unique_ptr<std::thread>> threads;
+  threads.reserve(FLAGS_test_thread_num);
+
+  for (int32_t i=0; i < FLAGS_test_thread_num; ++i) {
+    threads.emplace_back(new std::thread([&tracer, &countDown, &layerSize,
+                                         &startBarrier, &doneBarrier,
+                                         &callback]{
+      callback(tracer, countDown, layerSize, startBarrier, doneBarrier);
+    }));
+  }
+  size_t cntDown = countDown;
+  while (cntDown-- > 0) {
+    startBarrier.wait();
+    doneBarrier.wait();
+    ASSERT_TRUE(tracer.empty());
+  }
+
+  for (auto& thread : threads) {
+    thread->join();
+  }
+}
+
+
+TEST(CustomStackTrace, normalTrain) {
+  testNormalImpl([](paddle::CustomStackTrace<std::string>& tracer,
+                 size_t countDown, size_t layerSize,
+                 paddle::ThreadBarrier& start, paddle::ThreadBarrier& finish){
+    while (countDown-- > 0) {
+      start.wait();
+      for (size_t i=0; i < layerSize; ++i) {
+        tracer.push("layer_" + std::to_string(i));
+      }
+      tracer.pop("");
+      for (size_t i=0; i < layerSize; ++i) {
+        tracer.pop("layer_" + std::to_string(layerSize - 1 - i));
+      }
+      finish.wait();
+    }
+  });
+}
+
+TEST(CustomStackTrace, normalTest) {
+  testNormalImpl([] (paddle::CustomStackTrace<std::string>& tracer,
+                 size_t countDown, size_t layerSize,
+                 paddle::ThreadBarrier& start, paddle::ThreadBarrier& finish){
+    while (countDown-- > 0) {
+      start.wait();
+      for (size_t i=0; i < layerSize; ++i) {
+        tracer.push("layer_" + std::to_string(i));
+      }
+      tracer.clear();  // in forward test, tracer will clear after forward.
+      finish.wait();
+    }
+  });
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  paddle::initMain(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/utils/tests/test_CustomStackTracePrint.cpp b/paddle/utils/tests/test_CustomStackTracePrint.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c19c98614e6a7d6285990aa19849131579f7307b
--- /dev/null
+++ b/paddle/utils/tests/test_CustomStackTracePrint.cpp
@@ -0,0 +1,29 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/utils/Util.h"
+#include "paddle/utils/CustomStackTrace.h"
+
+int main(int argc, char** argv) {
+  paddle::initMain(argc, argv);
+
+  for (size_t i=0; i < 1000; ++i) {
+    paddle::gLayerStackTrace.push("layer_" + std::to_string(i));
+    if (i == 998) {
+      throw "Unhandle exception";
+    }
+  }
+
+  return 0;
+}
diff --git a/paddle/utils/tests/test_CustomStackTracePrint.sh b/paddle/utils/tests/test_CustomStackTracePrint.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b5543485f365adee49629578d470a14e0c742547
--- /dev/null
+++ b/paddle/utils/tests/test_CustomStackTracePrint.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+echo "Test Custom Stack Trace print correct result when fail"
+./test_CustomStackTracePrint >customStackTraceLog 2>&1
+if [ $? -eq 0 ]; then
+  exit 1
+else
+  set -e
+  TEXT=""
+  for ((i=0; i<=998; i++))
+  do
+    TEXT="layer_$i, "$TEXT
+  done
+  TEXT="Forwarding "$TEXT
+  grep -q "$TEXT" customStackTraceLog
+fi
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 68cc40247041093d3eec6dc93b22d358f4cbbaa1..fd9a003bb018c87fb8e8e2992390f27edfd72f4b 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -22,6 +22,8 @@ find_python_module(pip REQUIRED)
 find_python_module(wheel REQUIRED)
 find_python_module(google.protobuf REQUIRED)
 
+add_subdirectory(paddle/trainer_config_helpers/tests)
+
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/
     DESTINATION opt/paddle/share/wheels
 )
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index aed317df67bd0880e23d8fb98cb3073d2be7030c..7a6053940178db472999ed7a0d4c49cce40680e4 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1623,7 +1623,7 @@ class BatchNormLayer(LayerBase):
         # Also based on cudnn version.
         use_cudnn = use_gpu and batch_norm_type != "batch_norm" and \
             ((not parallel_nn) or self.config.device > -1) and \
-            cudnn_version >= 4000
+            cudnn_version >= 4007
         self.layer_type = "cudnn_batch_norm" if use_cudnn else "batch_norm"
         super(BatchNormLayer, self).__init__(name, self.layer_type, 0,
                                              active_type=active_type,
@@ -2273,6 +2273,9 @@ class ConvexCombinationLayer(LayerBase):
            name, 'convex_comb', size, inputs=inputs, device=device)
         config_assert(len(self.inputs) == 2,
           'ConvexCombinationLayer must have 2 inputs')
+        config_assert(
+            size * self.get_input_layer(0).size == self.get_input_layer(1).size,
+            'Wrong input size for ConvexCombinationLayer')
         self.set_layer_size(size)
 
 @config_layer('interpolation')
@@ -2322,6 +2325,9 @@ class CosSimVecMatLayer(LayerBase):
         self.config.cos_scale = cos_scale
         config_assert(len(self.inputs) == 2,
           'CosSimVecMatLayer must have 2 inputs')
+        config_assert(
+            size * self.get_input_layer(0).size == self.get_input_layer(1).size,
+            'Wrong input size for CosSimVecMatLayer')
 
 @config_layer('sampling_id')
 class SamplingIdLayer(LayerBase):
@@ -2370,6 +2376,7 @@ class CosSimLayer(LayerBase):
             self,
             name,
             inputs,
+            cos_scale=5,
             device=None):
         super(CosSimLayer, self).__init__(
             name, 'cos', 1, inputs=inputs, device=device)
@@ -2377,6 +2384,7 @@ class CosSimLayer(LayerBase):
         config_assert(
             self.get_input_layer(0).size == self.get_input_layer(1).size,
             'inputs of CosSimLayer must have same dim')
+        self.config.cos_scale = cos_scale
 
 
 @config_layer('tensor')
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index b7e5f566bb8c39fa6ea9ed491f28fa046bba71ee..f3f0077f9798f7e2097ae8cd4f39ce270a49b28f 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -47,6 +47,7 @@ __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel",
            'BaseGeneratedInput', 'conv_operator', 'conv_shift_layer',
            'tensor_layer', 'selective_fc_layer', 'sampling_id_layer',
            'slope_intercept_layer', 'trans_full_matrix_projection',
+           'linear_comb_layer',
            'convex_comb_layer', 'ctc_layer', 'crf_layer', 'crf_decoding_layer',
            'cross_entropy_with_selfnorm', 'cross_entropy',
            'multi_binary_label_cross_entropy',
@@ -70,7 +71,8 @@ class LayerType(object):
     POOLING_AVG = 'average'
     FC_LAYER = "fc"
     COST = 'cost'
-    COSINE_SIM = 'cos_vm'
+    COSINE_SIM_VEC = 'cos_vm'
+    COSINE_SIM = 'cos'
     HSIGMOID = 'hsigmoid'
     CONV_LAYER = "conv"
     POOL_LAYER = "pool"
@@ -102,7 +104,7 @@ class LayerType(object):
     SEL_FC_LAYER = "selective_fc"
     SAMPLING_ID_LAYER = "sampling_id"
     SLOPE_INTERCEPT_LAYER = "slope_intercept"
-    CONVEX_COMBINATION_LAYER = "convex_comb"
+    LINEAR_COMBINATION_LAYER = "convex_comb"
     BLOCK_EXPAND = "blockexpand"
 
     CTC_LAYER = "ctc"
@@ -171,6 +173,8 @@ class LayerOutput(object):
         assert LayerType.is_layer_type(layer_type)
         self.name = name
         self.layer_type = layer_type
+        if parents is not None and type(parents) != list:
+            parents = [parents]
         self.parents = [] if parents is None else parents
         self.activation = activation
         self.num_filters = num_filters
@@ -512,7 +516,7 @@ class MixedLayerType(LayerOutput):
         :rtype: MixedLayerType
         """
         if not self.finalized:
-            assert isinstance(other, Projection)
+            assert isinstance(other, Projection) or isinstance(other, Operator)
             self.inputs.append(other)
             self.parents.append(other.origin)
             return self
@@ -1169,13 +1173,16 @@ def power_layer(input, weight, name=None, layer_attr=None):
 @layer_support()
 def scaling_layer(input, weight, name=None, layer_attr=None):
     """
-    A layer for each row of a matrix, multiplying with a element of a vector.
+    A layer for multiplying input vector by weight scalar.
 
     .. math::
-       y.row[i] = w[i] * x.row[i]
+       y  = w x
 
-    where :math:`x` is (batchSize x dataDim) input, :math:`w` is
-    (batchSize x 1) weight vector, and :math:`y` is (batchSize x dataDim) output.
+    where :math:`x` is size=dataDim input, :math:`w` is size=1 weight,
+    and :math:`y` is size=dataDim output.
+
+    Note that the above computation is for one sample. Multiple samples are
+    processed in one batch.
 
     The example usage is:
 
@@ -1249,11 +1256,14 @@ def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None):
 
     ..  math::
         similarity = cos(\\theta) = {\\mathbf{a} \\cdot \\mathbf{b}
-        \\over \\|\\mathbf{b}\\| \\|\\mathbf{b}\\|}
+        \\over \\|\\mathbf{a}\\| \\|\\mathbf{b}\\|}
+
+    The size of a is M, size of b is M*N,
+    Similarity will be calculated N times by step M. The output size is
+    N. The scale will be multiplied to similarity.
 
-    And the input dimension is :math:`a \in R^M`, :math:`b \in R^{MN}`. The
-    similarity will be calculated N times by step M. The output dimension is
-    :math:`R^N`. The scale will be multiplied to similarity.
+    Note that the above computation is for one sample. Multiple samples are
+    processed in one batch.
 
     :param name: layer name
     :type name: basestring
@@ -1270,14 +1280,23 @@ def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None):
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
-    Layer(
-        name=name,
-        type=LayerType.COSINE_SIM,
-        size=size,
-        cos_scale=scale,
-        inputs=[a.name, b.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr)
-    )
+    if size == 1:
+        Layer(
+            name=name,
+            type=LayerType.COSINE_SIM,
+            cos_scale=scale,
+            inputs=[a.name, b.name],
+            **ExtraLayerAttribute.to_kwargs(layer_attr)
+        )
+    else:
+        Layer(
+            name=name,
+            type=LayerType.COSINE_SIM_VEC,
+            size=size,
+            cos_scale=scale,
+            inputs=[a.name, b.name],
+            **ExtraLayerAttribute.to_kwargs(layer_attr)
+        )
     return LayerOutput(name, LayerType.COSINE_SIM, parents=[a, b])
 
 @wrap_name_default()
@@ -2909,29 +2928,37 @@ def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0):
 
 
 @wrap_name_default()
-def convex_comb_layer(input, size, name=None):
+def linear_comb_layer(weights, vectors, size, name=None):
     """
-    A layer for convex weighted average of vectors takes two inputs.
-      - Input: a vector containing the convex weights (batchSize x weightdim),
-               and a matrix in a vector form (batchSize x (weightdim * datadim)).
-      - Output: a vector (batchSize * datadim).
+    A layer for weighted sum of vectors takes two inputs.
+      - Input: size of weights is M
+               size of vectors is M*N
+      - Output: a vector of size=N
 
     .. math::
 
-       y[i][j] = \sum_{j}(x_{1}(i, j) * x_{2}(i,j + i * dataDim)),
+       z(i) = \sum_{j=0}^{M-1} x(j) y(i+Nj)
+    where :math:`0 \le i \le N-1`
+
+    Or in the matrix notation:
+
+    .. math::
 
-                   i = 0,1,...,(batchSize-1); j = 0, 1,...,(dataDim-1)
+       z = x^T Y
 
     In this formular:
-      - :math:`x_{1}`: the first input.
-      - :math:`x_{2}`: the second input.
-      - :math:`y`: the output.
+      - :math:`x`: weights
+      - :math:`y`: vectors.
+      - :math:`z`: the output.
+
+    Note that the above computation is for one sample. Multiple samples are
+    processed in one batch.
 
     The simple usage is:
 
     .. code-block:: python
 
-       convex_comb = convex_comb_layer(input=inputs,
+       linear_comb = linear_comb_layer(weighs=weight, vectors=vectors,
                                        size=elem_dim)
 
     :param input: The input layers.
@@ -2944,15 +2971,16 @@ def convex_comb_layer(input, size, name=None):
     :rtype: LayerOutput
     """
 
-    assert isinstance(input, list) or isinstance(input, tuple)
-    assert len(input) == 2
     Layer(
         name=name,
-        type=LayerType.CONVEX_COMBINATION_LAYER,
+        type=LayerType.LINEAR_COMBINATION_LAYER,
         size=size,
-        inputs=[Input(input[0].name), Input(input[1].name)],
+        inputs=[Input(weights.name), Input(vectors.name)],
     )
-    return LayerOutput(name, LayerType.CONVEX_COMBINATION_LAYER, input, size=size)
+    return LayerOutput(name, LayerType.LINEAR_COMBINATION_LAYER,
+                       [weights, vectors], size=size)
+
+convex_comb_layer = linear_comb_layer
 
 @wrap_name_default()
 def block_expand_layer(input,
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..611fb855a8c9ad6679167105dd737c995b23c209
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -0,0 +1,5 @@
+#################### test_config_parser #########################
+add_test(NAME layers_test
+  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+        python ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py
+    WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
diff --git a/python/paddle/trainer_config_helpers/tests/layers_test.py b/python/paddle/trainer_config_helpers/tests/layers_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b55667354750066a7d3ab3a0af59eb9e7d47d86
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/layers_test.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer.config_parser import parse_config_and_serialize
+
+if __name__ == '__main__':
+    parse_config_and_serialize(
+        'trainer_config_helpers/tests/layers_test_config.py', '')
diff --git a/python/paddle/trainer_config_helpers/tests/layers_test_config.py b/python/paddle/trainer_config_helpers/tests/layers_test_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec171fc6013f454da78570c96e64240017e849b9
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/layers_test_config.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+num_classes = 5
+
+x = data_layer(name="input1", size=3)
+y = data_layer(name="input2", size=5)
+
+x1 = fc_layer(input=x, size=5)
+y1 = fc_layer(input=y, size=5)
+y2 = fc_layer(input=y, size=15)
+
+cos1 = cos_sim(a=x1, b=y1)
+cos3 = cos_sim(a=x1, b=y2, size=3)
+
+linear_comb = linear_comb_layer(weights=x1, vectors=y2, size=3)
+
+out = fc_layer(input=[cos1, cos3, linear_comb],
+               size=num_classes,
+               act=SoftmaxActivation())
+
+outputs(classification_cost(out, data_layer(name="label", size=num_classes)))
+
+settings(
+    batch_size=10,
+    learning_rate=2e-3,
+    learning_method=AdamOptimizer(),
+    regularization=L2Regularization(8e-4),
+    gradient_clipping_threshold=25
+)