diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index 2f85dd3c3b69d21cffede49b001298c6629900a6..3c2df52fed4f86675ce8f1ead6a3b66e4babde34 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -605,7 +605,7 @@ public:
     int batchSize = input->getHeight();
     int size = 1;
     resizeOutput(batchSize, size);
-    output_.value->sumRows(*input);
+    output_.value->sumRows(*input, /* scaleSum= */1, /* scaleDest= */0);
   }
 
   virtual void backward(const UpdateCallback& callback = nullptr) {
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index c3c425a23dc25e8ed7bb6705189510482639c12d..54448bdb5a9bb4f665f28f973eada30a07fb5eee 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -1473,6 +1473,21 @@ int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
   return 0;
 }
 
+template<>
+template <class Agg>
+int BaseMatrixT<real>::applyRow(
+     Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) {
+  if (scaleDest != 0) {  // nonzero dest scale: use the add2 saver
+    applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b);
+  } else {  // presumably so stale dest values are never read; confirm
+    applyRow(agg, base::binary::second(), b);  // second(), not add2
+    if (scaleAgg != 1) {  // skip the multiply when the factor is 1
+      mulScalar(scaleAgg);  // apply scaleAgg to this matrix afterwards
+    }
+  }
+  return 0;  // always 0; inner applyRow results are not propagated
+}
+
 template<>
 template <class Agg, class Op, class Saver>
 int BaseMatrixT<real>::applyRow(Agg agg, Op op, Saver sv,
@@ -1490,6 +1505,21 @@ int BaseMatrixT<real>::applyRow(Agg agg, Op op, Saver sv,
   return 0;
 }
 
+template<>
+template <class Agg, class Op>
+int BaseMatrixT<real>::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg,
+                                BaseMatrixT& b, BaseMatrixT& c) {
+  if (scaleDest != 0) {  // nonzero dest scale: use the add2 saver
+    applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c);
+  } else {  // presumably so stale dest values are never read; confirm
+    applyRow(agg, op, base::binary::second(), b, c);  // second(), not add2
+    if (scaleAgg != 1) {  // skip the multiply when the factor is 1
+      mulScalar(scaleAgg);  // apply scaleAgg to this matrix afterwards
+    }
+  }
+  return 0;  // always 0; inner applyRow results are not propagated
+}
+
 template<>
 template <class Agg>
 int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
@@ -1518,9 +1548,24 @@ int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
   return 0;
 }
 
+template<>
+template <class Agg>
+int BaseMatrixT<real>::applyCol(
+     Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) {
+  if (scaleDest != 0) {  // nonzero dest scale: use the add2 saver
+    applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b);
+  } else {  // presumably so stale dest values are never read; confirm
+    applyCol(agg, base::binary::second(), b);  // second(), not add2
+    if (scaleAgg != 1) {  // skip the multiply when the factor is 1
+      mulScalar(scaleAgg);  // apply scaleAgg to this matrix afterwards
+    }
+  }
+  return 0;  // always 0; inner applyCol results are not propagated
+}
+
 template<>
 void BaseMatrixT<real>::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) {
-  applyRow(aggregate::sum(), base::binary::add2(scaleDest, scaleSum), b);
+  applyRow(aggregate::sum(), scaleDest, scaleSum, b);  // dest scale first
 }
 
 template<>
@@ -1550,21 +1595,21 @@ void BaseMatrixT<real>::minCols(BaseMatrixT& b) {
 
 template<>
 void BaseMatrixT<real>::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) {
-  applyCol(aggregate::sum(), base::binary::add2(scaleDest, scaleSum), b);
+  applyCol(aggregate::sum(), scaleDest, scaleSum, b);  // dest scale first
 }
 
 template<>
 void BaseMatrixT<real>::sumOfSquaredDiffs(
     BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) {
   applyRow(aggregate::sum(), base::binary::squaredDiff(),
-           base::binary::add2(scaleDest, scaleSum), b, c);
+           scaleDest, scaleSum, b, c);  // applyRow takes the dest scale first
 }
 
 template<>
 void BaseMatrixT<real>::sumOfProducts(
     BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) {
   applyRow(aggregate::sum(), base::binary::mul(),
-           base::binary::add2(scaleDest, scaleSum), b, c);
+           scaleDest, scaleSum, b, c);  // applyRow takes the dest scale first
 }
 
 template class BaseMatrixT<real>;
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index fd1604b985dd5cdce8f69a5e4f9a07b5760f9a0d..3a91fdc3c30c5332866a97c256b018eb0982260f 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -317,6 +317,11 @@ public:
   template <class Agg, class Op, class Saver>
   int applyRow(Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c);
 
+  // Same as above, with special handling of sv=add2(scaleDest, scaleAgg)
+  template <class Agg, class Op>
+  int applyRow(Agg agg, Op op, real scaleDest, real scaleAgg,
+               BaseMatrixT& b, BaseMatrixT& c);
+
   /**
    * a aggregate expression that apply each row of matrix b.
    *
@@ -329,6 +334,10 @@ public:
   template <class Agg, class Saver>
   int applyRow(Agg agg, Saver sv, BaseMatrixT& b);
 
+  // Same as above, with special handling of sv=add2(scaleDest, scaleAgg)
+  template <class Agg>
+  int applyRow(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b);
+
   /**
    * a aggregate expression that apply each column of matrix b.
    *
@@ -352,6 +361,10 @@ public:
   template <class Agg, class Saver>
   int applyCol(Agg agg, Saver sv, BaseMatrixT& b);
 
+  // Same as above, with special handling of sv=add2(scaleDest, scaleAgg)
+  template <class Agg>
+  int applyCol(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b);
+
   bool useGpu() const { return useGpu_; }
 
   const T* rowBuf(size_t row) const { return data_ + width_ * row; }
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index ca5ab68c5c2b4bbf61110978f824b631d6d78331..b5e10ef81009a00e76b0c4147b404ba0aaba72b3 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -29,7 +29,6 @@ except ImportError:
     import pickle
 import copy
 
-<<<<<<< 0ba0f02c685e52b14632f6b9bfca4321494505c7
 __all__ = [
     "full_matrix_projection",
     "AggregateLevel",
@@ -1456,11 +1455,11 @@ def bilinear_interp_layer(input,
     .. code-block:: python
 
        bilinear = bilinear_interp_layer(input=layer1, out_size_x=64, out_size_y=64)
-    
+
     :param   input:        A input layer.
     :type    input:        LayerOutput.
     :param   out_size_x:   bilinear interpolation output width.
-    :type    out_size_x:   int|None 
+    :type    out_size_x:   int|None
     :param   out_size_y:   bilinear interpolation output height.
     :type    out_size_y:   int|None
     :param   name:         The layer's name, which cna not be specified.
@@ -1772,11 +1771,11 @@ def img_conv_layer(input,
     The details of convolution layer, please refer UFLDL's `convolution
     <http://ufldl.stanford.edu/tutorial/supervised/
     FeatureExtractionUsingConvolution/>`_ .
-    
-    Convolution Transpose (deconv) layer for image. Paddle only support square 
+
+    Convolution Transpose (deconv) layer for image. Paddle only support square
     input currently and thus input image's width equals height.
 
-    The details of convolution transpose layer, 
+    The details of convolution transpose layer,
     please refer to the following explanation and references therein
     <http://datascience.stackexchange.com/questions/6107/
     what-are-deconvolutional-layers/>`_ .
@@ -4422,7 +4421,7 @@ def cross_entropy(input, label, name=None, coeff=1.0, layer_attr=None):
 
     .. code-block:: python
 
-       cost = cross_entropy(input=input_layer, 
+       cost = cross_entropy(input=input_layer,
                             label=label_layer)
 
     :param input: The first input layer.
@@ -4462,7 +4461,7 @@ def cross_entropy_with_selfnorm(input,
 
     .. code-block:: python
 
-       cost = cross_entropy_with_selfnorm(input=input_layer, 
+       cost = cross_entropy_with_selfnorm(input=input_layer,
                                           label=label_layer)
 
     :param input: The first input layer.
@@ -4532,7 +4531,7 @@ def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None):
 
     .. code-block:: python
 
-       cost = huber_cost(input=input_layer, 
+       cost = huber_cost(input=input_layer,
                          label=label_layer)
 
     :param input: The first input layer.
@@ -4572,7 +4571,7 @@ def multi_binary_label_cross_entropy(input,
 
     .. code-block:: python
 
-       cost = multi_binary_label_cross_entropy(input=input_layer, 
+       cost = multi_binary_label_cross_entropy(input=input_layer,
                                                label=label_layer)
 
     :param input: The first input layer.