Commit ffbf00a0 authored by Luo Tao

Merge branch 'develop' into group

......@@ -13,8 +13,6 @@
# The document of clang-format is
# http://clang.llvm.org/docs/ClangFormat.html
# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
#
# TODO(yuyang18): Add code style for python and other languages
---
Language: Cpp
BasedOnStyle: Google
......@@ -22,8 +20,9 @@ IndentWidth: 2
TabWidth: 2
ContinuationIndentWidth: 4
AccessModifierOffset: -2 # private/protected/public are not indented inside a class
PointerAlignment: Left # int* p/int& p, not int *p/int &p
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
...
- repo: https://github.com/Lucas-C/pre-commit-hooks.git
sha: c25201a00e6b0514370501050cf2a8538ac12270
hooks:
- id: remove-crlf
- repo: https://github.com/reyoung/mirrors-yapf.git
sha: v0.13.2
hooks:
- id: yapf
- repo: https://github.com/pre-commit/pre-commit-hooks
sha: 4ef03c4223ad322c7adaa6c6c0efb26b57df3b71
hooks:
- id: check-added-large-files
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
- id: end-of-file-fixer
# TODO(yuyang): trailing whitespace has some bugs on markdown
# files right now; please do not add it to the pre-commit hooks yet
# - id: trailing-whitespace
#
# TODO(yuyang): debug-statements is not a good fit for Paddle, because
# not all of our python code is runnable. Some of it is used for
# documentation
# - id: debug-statements
This folder contains scripts used in the PaddlePaddle introduction demo.
- use `bash train.sh` to train a simple linear regression model
- use `python evaluate_model.py` to read model parameters. You can see that `w` and `b` are very close to [2, 0.3].
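
As a quick sanity check of the pattern this demo learns, here is a self-contained numpy sketch (an illustration, not the demo's own code) that recovers `w` and `b` by least squares from synthetic data generated with `y = 2x + 0.3`:

```python
import numpy as np

# Synthetic data following the demo's target pattern: y = 2*x + 0.3 (plus noise).
rng = np.random.RandomState(0)
x = rng.uniform(-1, 1, size=(1000, 1))
y = 2.0 * x + 0.3 + rng.normal(scale=0.01, size=x.shape)

# Least squares on [x, 1] recovers the weight and the bias.
features = np.hstack([x, np.ones_like(x)])
solution, _, _, _ = np.linalg.lstsq(features, y, rcond=None)
w, b = solution.ravel()
print(w, b)  # approximately 2.0 and 0.3
```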
......@@ -19,4 +19,3 @@ done
cd $DIR
rm -f *.list
python generate_list.py
......@@ -14,4 +14,3 @@
"fields": ["id", "title", "genres"]
}
}
......@@ -37,4 +37,3 @@ paddle train \
--use_gpu=false \
--config_args=is_test=1 \
2>&1 | tee 'test.log'
......@@ -24,4 +24,3 @@ paddle train \
--show_parameter_stats_period=10 \
--test_all_data_in_one_period=1 \
2>&1 | tee 'train.log'
......@@ -8,7 +8,7 @@ User Guide
* [Build and Installation](build/index.rst)
* [Contribute Code](build/contribute_to_paddle.md)
* [User Interface](ui/index.md)
* [Model Config Interface](ui/api/trainer_config_helpers/index.md)
* [Model Config Interface](ui/api/trainer_config_helpers/index.rst)
* [Example and Demo](demo/index.md)
* [Cluster Train](cluster/index.md)
......
......@@ -98,4 +98,3 @@ There, you have recovered the underlying pattern between `X` and `Y` only from o
- <a href="../build/index.html"> Build and Installation </a>
- <a href="../demo/quick_start/index_en.html">Quick Start</a>
- <a href="../demo/index.html">Example and Demo</a>
===========
Activations
===========
BaseActivation
==============
......@@ -102,4 +106,3 @@ STanhActivation
.. automodule:: paddle.trainer_config_helpers.activations
:members: STanhActivation
:noindex:
Activations
===========
.. toctree::
:maxdepth: 3
activations.rst
==========
Evaluators
==========
Base
====
.. automodule:: paddle.trainer_config_helpers.evaluators
......
Evaluators
==========
.. toctree::
:maxdepth: 3
evaluators.rst
# Model Config Interface
* [Optimizer](optimizers_index.rst)
* [Data Source](data_sources.rst)
* [Layers](layers_index.rst)
* [Activations](activations_index.rst)
* [Poolings](poolings_index.rst)
* [Networks](networks_index.rst)
* [Evaluators](evaluators_index.rst)
* [Parameter and Extra Layer Attribute](attrs.rst)
Model Config Interface
======================
.. toctree::
:maxdepth: 1
optimizers.rst
data_sources.rst
layers.rst
activations.rst
poolings.rst
networks.rst
evaluators.rst
attrs.rst
======
Layers
======
Base
======
......@@ -47,7 +51,7 @@ conv_operator
:noindex:
conv_projection
-------------
---------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: conv_projection
:noindex:
......@@ -187,6 +191,12 @@ embedding_layer
:members: embedding_layer
:noindex:
scaling_projection
------------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: scaling_projection
:noindex:
dotmul_projection
-----------------
.. automodule:: paddle.trainer_config_helpers.layers
......
Layers
======
.. toctree::
:maxdepth: 3
layers.rst
========
Networks
========
The networks module contains pieces of neural networks that combine multiple layers.
NLP
===
......@@ -111,4 +117,3 @@ outputs
.. automodule:: paddle.trainer_config_helpers.networks
:members: outputs
:noindex:
Networks
========
The networks module contains pieces of neural networks that combine multiple layers.
.. toctree::
:maxdepth: 3
networks.rst
==========
Optimizers
==========
BaseSGDOptimizer
================
.. automodule:: paddle.trainer_config_helpers.optimizers
......@@ -51,4 +55,3 @@ settings
.. automodule:: paddle.trainer_config_helpers.optimizers
:members: settings
:noindex:
Optimizers
==========
.. toctree::
:maxdepth: 3
optimizers.rst
========
Poolings
========
BasePoolingType
===============
.. automodule:: paddle.trainer_config_helpers.poolings
......@@ -27,4 +31,3 @@ SquareRootNPooling
.. automodule:: paddle.trainer_config_helpers.poolings
:members: SquareRootNPooling
:noindex:
Poolings
========
These pooling types are used for sequence input, not for images.
.. toctree::
:maxdepth: 3
poolings.rst
......@@ -4,7 +4,7 @@ Basic Concepts of Using PaddlePaddle
PaddlePaddle is a neural network learning framework. Its single-machine process is :code:`paddle train`, and all device usage on a single machine is scheduled within that process. The multi-machine helper process :code:`paddle pserver` coordinates communication among multiple single-machine processes so that the computing resources of a cluster can be fully utilized. PaddlePaddle also exposes, through its :code:`swig api`, methods for predicting with trained models and for customizing the training flow.
Below we introduce some concepts used in the main process :code:`paddle train`; they are helpful for understanding how to use PaddlePaddle. As a prerequisite, the reader should already be familiar with `basic neural network / machine learning principles and concepts <nn.rst>`_ . For concepts specific to PaddlePaddle's implementation, please refer to `basic concepts in PaddlePaddle programming <program_concepts.rst>`_ .
Below we introduce some concepts used in the main process :code:`paddle train`; they are helpful for understanding how to use PaddlePaddle. As a prerequisite, the reader should already be familiar with `basic neural network / machine learning principles and concepts <nn.html>`_ . For concepts specific to PaddlePaddle's implementation, please refer to `basic concepts in PaddlePaddle programming <program_concepts.html>`_ .
.. contents::
......@@ -184,8 +184,8 @@ The classic way to use PaddlePaddle on multiple machines is to use a :code:`Parameter Server` to
For a detailed explanation, refer to `集群训练Paddle`_ .
.. _PyDataProvider: ../ui/data_provider/pydataprovider2.rst
.. _settings: ../../doc/ui/api/trainer_config_helpers/optimizers.rst
.. _mixed_layer: ../../doc/ui/api/trainer_config_helpers/layers.rst
.. _PyDataProvider: ../ui/data_provider/pydataprovider2.html
.. _settings: ../../doc/ui/api/trainer_config_helpers/optimizers.html#settings
.. _mixed_layer: ../../doc/ui/api/trainer_config_helpers/layers.html#mixed-layer
.. _masking-gpu: http://www.acceleware.com/blog/cudavisibledevices-masking-gpus
.. _集群训练Paddle: ../cluster/index.rst
.. _集群训练Paddle: ../cluster/index.html
......@@ -17,5 +17,3 @@ endif()
if(WITH_SWIG_PY)
add_subdirectory(api)
endif()
......@@ -65,4 +65,3 @@ struct ArgumentsPrivate {
return *(std::shared_ptr<T>*)(rawPtr);
}
};
......@@ -69,8 +69,8 @@ class TestMatrix(unittest.TestCase):
def test_numpy(self):
numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat)
self.assertEqual(
(int(m.getHeight()), int(m.getWidth())), numpy_mat.shape)
self.assertEqual((int(m.getHeight()), int(m.getWidth())),
numpy_mat.shape)
# the numpy matrix and paddle matrix shared the same memory.
numpy_mat[0, 1] = 342.23
......
......@@ -254,4 +254,3 @@ extern __thread cudaStream_t default_stream;
#endif /* __NVCC__ */
#endif /* HL_BASE_H_ */
......@@ -199,4 +199,3 @@ inline void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
real *savedInvVar) {}
#endif // HL_CUDA_CUDNN_STUB_H_
......@@ -718,4 +718,3 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
*s = _mm256_xor_ps(xmm1, sign_bit_sin);
*c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
......@@ -605,7 +605,7 @@ public:
int batchSize = input->getHeight();
int size = 1;
resizeOutput(batchSize, size);
output_.value->sumRows(*input);
output_.value->sumRows(*input, /* scaleSum= */1, /* scaleDest= */0);
}
virtual void backward(const UpdateCallback& callback = nullptr) {
......
......@@ -52,7 +52,9 @@ void FullMatrixProjection::backward(const UpdateCallback& callback) {
}
hl_set_sync_flag(syncFlag);
if (weight_->getWGrad()) {
parameter_->incUpdate(callback);
}
}
} // namespace paddle
......@@ -48,4 +48,3 @@ public:
};
} // namespace paddle
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "Projection.h"
namespace paddle {
class ScalingProjection : public Projection {
public:
ScalingProjection(const ProjectionConfig& config,
const ParameterPtr& parameter, bool useGpu)
: Projection(config, parameter, useGpu) {
CHECK_EQ(parameter->getSize(), 1UL);
weight_.reset(new Weight(1, 1, parameter));
}
void forward() {
CHECK(in_->value);
out_->value->add(*in_->value, weight_->getW()->getElement(0, 0));
}
void backward(const UpdateCallback& callback) {
if (weight_->getWGrad()) {
auto sum = Matrix::create(in_->value->getHeight(), 1, false, useGpu_);
sum->sumOfProducts(*in_->value, *out_->grad,
/* scaleSum= */1, /* scaleDest= */0);
weight_->getWGrad()->sumCols(*sum,
/* scaleSum= */1, /* scaleDest= */1);
parameter_->incUpdate(callback);
}
if (in_->grad) {
in_->grad->add(*out_->grad, weight_->getW()->getElement(0, 0));
}
}
protected:
std::unique_ptr<Weight> weight_;
};
REGISTER_PROJECTION(scaling, ScalingProjection);
} // namespace paddle
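
For readers skimming the diff, a minimal numpy sketch of what this projection computes (a standalone illustration under the semantics shown above, not the Paddle API):

```python
import numpy as np

def scaling_forward(inp, w, out):
    # Mirrors ScalingProjection::forward above: out += w * in.
    out += w * inp

def scaling_backward(inp, w, out_grad):
    # dL/dw aggregates the elementwise product of the input and the
    # output gradient; dL/din scales the output gradient by w.
    w_grad = np.sum(inp * out_grad)
    in_grad = w * out_grad
    return w_grad, in_grad
```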
......@@ -135,6 +135,17 @@ TEST(Projection, identity) {
}
}
TEST(Projection, scaling) {
ProjectionConfig conf;
conf.set_type("scaling");
conf.set_input_size(10);
conf.set_output_size(10);
for (auto useGpu : {false}) {
testProjectionGrad(conf, INPUT_DATA, /* parameterSize */ 1,
/* batchSize */ 100, useGpu);
}
}
#ifndef PADDLE_ONLY_CPU
TEST(Projection, conv) {
const int NUM_FILTERS = 16;
......
......@@ -1451,6 +1451,8 @@ int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
MatrixOffset offset(0, 0, 0, 0, 0, 0);
int numRows = b.height_;
int numCols = b.width_;
CHECK_EQ(height_, numRows);
CHECK_EQ(width_, 1UL);
aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows,
numCols, offset, false_type(), true_type() /*aAsColVector*/);
......@@ -1463,18 +1465,69 @@ int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
MatrixOffset offset(0, 0, 0, 0, 0, 0);
int numRows = b.height_;
int numCols = b.width_;
CHECK_EQ(height_, numRows);
CHECK_EQ(width_, 1UL);
aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset,
false_type(), true_type() /*aAsColVector*/);
return 0;
}
template<>
template <class Agg>
int BaseMatrixT<real>::applyRow(
Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) {
if (scaleDest != 0) {
applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b);
} else {
applyRow(agg, base::binary::second(), b);
if (scaleAgg != 1) {
mulScalar(scaleAgg);
}
}
return 0;
}
template<>
template <class Agg, class Op, class Saver>
int BaseMatrixT<real>::applyRow(Agg agg, Op op, Saver sv,
BaseMatrixT& b, BaseMatrixT& c) {
MatrixOffset offset(0, 0, 0, 0, 0, 0);
int numRows = b.height_;
int numCols = b.width_;
CHECK_EQ(height_, numRows);
CHECK_EQ(width_, 1UL);
CHECK_EQ(c.height_, numRows);
CHECK_EQ(c.width_, numCols);
aggregate(agg, op, sv,
b, c, numRows, numCols, offset,
false_type(), true_type() /*aAsColVector*/);
return 0;
}
template<>
template <class Agg, class Op>
int BaseMatrixT<real>::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg,
BaseMatrixT& b, BaseMatrixT& c) {
if (scaleDest != 0) {
applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c);
} else {
applyRow(agg, op, base::binary::second(), b, c);
if (scaleAgg != 1) {
mulScalar(scaleAgg);
}
}
return 0;
}
template<>
template <class Agg>
int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
MatrixOffset offset(0, 0, 0, 0, 0, 0);
int numRows = b.height_;
int numCols = b.width_;
CHECK_EQ(width_, numCols);
CHECK_EQ(height_, 1UL);
aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows,
numCols, offset, true_type() /*aAsRowVector*/, false_type());
......@@ -1487,6 +1540,8 @@ int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
MatrixOffset offset(0, 0, 0, 0, 0, 0);
int numRows = b.height_;
int numCols = b.width_;
CHECK_EQ(width_, numCols);
CHECK_EQ(height_, 1UL);
aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset,
true_type() /*aAsRowVector*/, false_type());
......@@ -1494,8 +1549,23 @@ int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
}
template<>
void BaseMatrixT<real>::sumRows(BaseMatrixT& b) {
applyRow(aggregate::sum(), b);
template <class Agg>
int BaseMatrixT<real>::applyCol(
Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) {
if (scaleDest != 0) {
applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b);
} else {
applyCol(agg, base::binary::second(), b);
if (scaleAgg != 1) {
mulScalar(scaleAgg);
}
}
return 0;
}
template<>
void BaseMatrixT<real>::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) {
applyRow(aggregate::sum(), scaleDest, scaleSum, b);
}
template<>
......@@ -1524,18 +1594,22 @@ void BaseMatrixT<real>::minCols(BaseMatrixT& b) {
}
template<>
void BaseMatrixT<real>::sumCols(BaseMatrixT& b, real scale) {
applyCol(aggregate::sum(), base::binary::add2(1.0, scale), b);
void BaseMatrixT<real>::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) {
applyCol(aggregate::sum(), scaleDest, scaleSum, b);
}
template<>
void BaseMatrixT<real>::sumOfSquares(BaseMatrixT& b, BaseMatrixT& c) {
int numRows = b.height_;
int numCols = b.width_;
MatrixOffset offset(0, 0, 0, 0, 0, 0);
aggregate(aggregate::sum(), base::binary::squaredDiff(), base::binary::add(),
b, c, numRows, numCols, offset, false_type(),
true_type() /*aAsColVector*/);
void BaseMatrixT<real>::sumOfSquaredDiffs(
BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) {
applyRow(aggregate::sum(), base::binary::squaredDiff(),
scaleDest, scaleSum, b, c);
}
template<>
void BaseMatrixT<real>::sumOfProducts(
BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) {
applyRow(aggregate::sum(), base::binary::mul(),
scaleDest, scaleSum, b, c);
}
template class BaseMatrixT<real>;
......
......@@ -305,6 +305,23 @@ public:
template <class Agg>
int applyRow(Agg agg, BaseMatrixT& b);
/**
* An aggregate expression applied to each row of matrix b.
*
* @code
* for each row i & 0 <= j < b.width_, do:
* dst = agg(op(b[i*ldb + j], c[i*ldc + j]))
* this[i] = sv(this[i], dst)
* @endcode
*/
template <class Agg, class Op, class Saver>
int applyRow(Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c);
// Same as the above, with the special handling of sv=add2(scaleDest, scaleAgg)
template <class Agg, class Op>
int applyRow(Agg agg, Op op, real scaleDest, real scaleAgg,
BaseMatrixT& b, BaseMatrixT& c);
/**
* An aggregate expression applied to each row of matrix b.
*
......@@ -317,6 +334,10 @@ public:
template <class Agg, class Saver>
int applyRow(Agg agg, Saver sv, BaseMatrixT& b);
// Same as the above, with the special handling of sv=add2(scaleDest, scaleAgg)
template <class Agg>
int applyRow(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b);
/**
* An aggregate expression applied to each column of matrix b.
*
......@@ -340,6 +361,10 @@ public:
template <class Agg, class Saver>
int applyCol(Agg agg, Saver sv, BaseMatrixT& b);
// Same as the above, with the special handling of sv=add2(scaleDest, scaleAgg)
template <class Agg>
int applyCol(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b);
bool useGpu() const { return useGpu_; }
const T* rowBuf(size_t row) const { return data_ + width_ * row; }
......@@ -920,7 +945,9 @@ public:
void addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
/// calculate the sum of each row of the matrix b.
void sumRows(BaseMatrixT& b);
/// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij}
void sumRows(BaseMatrixT& b, T scaleSum, T scaleDest);
/// calculate the maximum value of each row of the matrix b.
void maxRows(BaseMatrixT& b);
/// calculate the minimum value of each row of the matrix b.
......@@ -932,10 +959,18 @@ public:
void maxCols(BaseMatrixT& b);
/// calculate the minimum value of each column of the matrix b.
void minCols(BaseMatrixT& b);
void sumCols(BaseMatrixT& b, T scale);
/// calculate the sum of each row of (b - c)^2.
void sumOfSquares(BaseMatrixT& b, BaseMatrixT& c);
/// calculate the sum of each column of the matrix b.
/// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji}
void sumCols(BaseMatrixT& b, T scaleSum, T scaleDest);
/// this_i = scaleDest * this_i + scaleSum * \sum_j (b_{ij} - c_{ij})^2
void sumOfSquaredDiffs(BaseMatrixT& b, BaseMatrixT& c,
T scaleSum, T scaleDest);
/// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij}
void sumOfProducts(BaseMatrixT& b, BaseMatrixT& c,
T scaleSum, T scaleDest);
/**
* @code
......
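
To make the scaleSum/scaleDest convention concrete, here is a hedged numpy rendering of the formulas documented above (illustration only, not Paddle code):

```python
import numpy as np

def sum_rows(dest, b, scale_sum, scale_dest):
    # this_i = scaleDest * this_i + scaleSum * sum_j b_{ij}
    return scale_dest * dest + scale_sum * b.sum(axis=1)

def sum_of_products(dest, b, c, scale_sum, scale_dest):
    # this_i = scaleDest * this_i + scaleSum * sum_j b_{ij} * c_{ij}
    return scale_dest * dest + scale_sum * (b * c).sum(axis=1)
```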
......@@ -80,4 +80,3 @@ void vTanh(const int n, const T* a, T* r);
} // namespace paddle
#endif // MATHFUNCTIONS_H_
......@@ -242,7 +242,7 @@ real GpuMatrix::getSum() {
void GpuMatrix::accumulateColSum(Matrix& src) {
CHECK_EQ(getWidth(), src.getWidth());
CHECK_EQ(getHeight(), (size_t)1);
sumCols(src, 1.0);
sumCols(src, 1.0, 1.0);
}
real GpuMatrix::getAbsSum() {
......@@ -389,7 +389,7 @@ void GpuMatrix::collectBias(Matrix& a, real scale) {
CHECK_EQ(width_, a.getWidth());
GpuSparseMatrix* sMatPtr = dynamic_cast<GpuSparseMatrix*>(&a);
if (!sMatPtr) {
sumCols(a, scale);
sumCols(a, /* scaleSum= */scale, /* scaleDest= */1);
} else {
real* data = getData();
hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get();
......@@ -589,7 +589,7 @@ void GpuMatrix::addToRows(Matrix& table, IVector& ids) {
void GpuMatrix::colMerge(Matrix& src) {
CHECK(src.height_ == height_);
if (!trans_ && !src.trans_) {
sumRows(src);
sumRows(src, /* scaleSum= */1, /* scaleDest= */0);
} else {
LOG(FATAL) << "Is not supported";
}
......@@ -599,7 +599,7 @@ void GpuMatrix::rowSum(Matrix& sum) {
CHECK_EQ(sum.getHeight(), getHeight());
CHECK_EQ(sum.getWidth(), (size_t)1);
sum.sumRows(*this);
sum.sumRows(*this, /* scaleSum= */1, /* scaleDest= */0);
}
void GpuMatrix::rowMax(Matrix& max) {
......@@ -790,7 +790,8 @@ void GpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
LOG(FATAL) << "not supported: GpuSparseMatrix as label";
}
BaseMatrix::sumOfSquares(output, label);
BaseMatrix::sumOfSquaredDiffs(output, label,
/* scaleSum= */1, /* scaleDest= */1);
}
void GpuMatrix::sumOfSquaresBp(Matrix& outputV, Matrix& label) {
......@@ -1501,7 +1502,7 @@ void CpuMatrix::accumulateColSum(Matrix& src) {
CHECK_EQ(getWidth(), src.getWidth());
CHECK_EQ(getHeight(), (size_t)1);
sumCols(src, 1.0);
sumCols(src, /* scaleSum= */1, /* scaleDest= */1);
}
real CpuMatrix::getAbsSum() {
......@@ -2188,7 +2189,7 @@ void CpuMatrix::collectBias(Matrix& a, real scale) {
CHECK_EQ(width_, a.getWidth());
CpuSparseMatrix* aptr = dynamic_cast<CpuSparseMatrix*>(&a);
if (!aptr) {
sumCols(a, scale);
sumCols(a, /* scaleSum= */scale, /* scaleDest= */1);
} else {
size_t nnz = aptr->getElementCnt();
int* cols = aptr->getCols();
......@@ -2227,7 +2228,7 @@ void CpuMatrix::sequenceAvgForward(Matrix& a,
real* dst = getData();
real* src = a.getData();
const int* starts = startsPos.getData();
MatrixPtr outMtx = Matrix::create(1, 1, false, false);
MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false);
MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false);
for (size_t i = 0; i < height; i++) {
int sequenceLength = starts[i + 1] - starts[i];
......@@ -2239,13 +2240,15 @@ void CpuMatrix::sequenceAvgForward(Matrix& a,
dataMtx->setData(src + starts[i] * width, sequenceLength, width);
if (mode == 0) {
// plain average
outMtx->sumCols(*dataMtx, (real)1 / (real)sequenceLength);
outMtx->sumCols(*dataMtx, (real)1 / (real)sequenceLength,
/* scaleDest= */1);
} else if (mode == 1) {
// sum instead of average
outMtx->sumCols(*dataMtx, (real)1);
outMtx->sumCols(*dataMtx, /* scaleSum= */1, /* scaleDest= */1);
} else if (mode == 2) {
// divide by square root of sequenceLength
outMtx->sumCols(*dataMtx, (real)1 / std::sqrt(sequenceLength));
outMtx->sumCols(*dataMtx, (real)1 / std::sqrt(sequenceLength),
/* scaleDest= */1);
} else {
LOG(FATAL) << "should not reach here";
}
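
The three modes above can be summarized in a short numpy sketch (a hypothetical illustration, not Paddle code): mode 0 averages each sequence, mode 1 sums it, and mode 2 divides the sum by the square root of the sequence length.

```python
import numpy as np

def sequence_avg_forward(data, starts, mode):
    # data: (total_timesteps, width); starts: offsets delimiting sequences.
    out = np.empty((len(starts) - 1, data.shape[1]))
    for i in range(len(starts) - 1):
        seq = data[starts[i]:starts[i + 1]]
        if mode == 0:
            out[i] = seq.mean(axis=0)                     # plain average
        elif mode == 1:
            out[i] = seq.sum(axis=0)                      # sum instead of average
        elif mode == 2:
            out[i] = seq.sum(axis=0) / np.sqrt(len(seq))  # sqrt-of-length scaling
        else:
            raise ValueError("unsupported mode")
    return out
```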
......@@ -2932,7 +2935,7 @@ void CpuMatrix::rowSum(Matrix& sum) {
CHECK_EQ(sum.getHeight(), getHeight());
CHECK_EQ(sum.getWidth(), (size_t)1);
sum.sumRows(*this);
sum.sumRows(*this, /* scaleSum= */1, /* scaleDest= */0);
}
void CpuMatrix::rowMaxId(IVector& maxIds) {
......@@ -3485,7 +3488,8 @@ void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
}
}
BaseMatrix::sumOfSquares(output, label);
BaseMatrix::sumOfSquaredDiffs(output, label,
/* scaleSum= */1, /* scaleDest= */1);
}
/* calculate the error of outputV according to label */
......
This diff is collapsed.
......@@ -33,5 +33,3 @@ cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON -DWITH_AVX=OFF -DCUDNN_ROOT=/usr/
make -j `nproc`
cpack -D CPACK_GENERATOR='DEB' ..
mv *.deb ~/dist/gpu-noavx
......@@ -58,4 +58,3 @@ m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=ON \
-DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \
-DPADDLE_WITH_AVX=OFF \
Dockerfile.m4 > Dockerfile.gpu-noavx-demo
......@@ -2,4 +2,3 @@
set -e
mkdir -p ../../../build
cd ../../../build
......@@ -998,4 +998,3 @@ from IN B-PP
Friday NNP B-NP
's POS B-NP
Tokyo NNP I-NP
......@@ -4998,4 +4998,3 @@ However RB B-ADVP
the DT B-NP
disclosure NN I-NP
of IN B-PP
......@@ -109,4 +109,3 @@ int main(int argc, char** argv) {
}
#endif
......@@ -410,8 +410,8 @@ def RecurrentLayerGroupEnd(name):
"RecurrentLayerGroup not begin")
for pair in g_current_submodel.memories: #check exist
layer = g_layer_map[pair.layer_name]
config_assert(layer is not None, "memory declare wrong name:%s" %
pair.layer_name)
config_assert(layer is not None,
"memory declare wrong name:%s" % pair.layer_name)
memory_link = g_layer_map[pair.link_name]
config_assert(layer.size == memory_link.size,
"memory declare wrong size:%d" % memory_link.size)
......@@ -592,6 +592,20 @@ class DotMulProjection(Projection):
def calc_parameter_dims(self, input_size, output_size):
return [1, output_size]
# ScalingProjection
@config_class
class ScalingProjection(Projection):
type = 'scaling'
def calc_output_size(self, input_layer_config):
return input_layer_config.size
def calc_parameter_size(self, input_size, output_size):
return 1
def calc_parameter_dims(self, input_size, output_size):
return [1, 1]
@config_class
class TableProjection(Projection):
......@@ -672,8 +686,8 @@ class ConvProjection(Projection):
parse_conv(conv_conf, input_layer_name, self.proj_conf.conv_conf,
num_filters)
# TODO: support rectangle input
self.proj_conf.output_size = (self.proj_conf.conv_conf.output_x**
2) * num_filters
self.proj_conf.output_size = (self.proj_conf.conv_conf.output_x
**2) * num_filters
def calc_output_size(self, input_layer_config):
return self.proj_conf.output_size
......@@ -2779,8 +2793,8 @@ class ConcatenateLayer2(LayerBase):
@config_layer('recurrent')
class RecurrentLayer(LayerBase):
def __init__(self, name, inputs, reversed=False, bias=True, **xargs):
super(RecurrentLayer, self).__init__(name, 'recurrent', 0, inputs, **
xargs)
super(RecurrentLayer, self).__init__(name, 'recurrent', 0, inputs,
**xargs)
config_assert(len(self.inputs) == 1, 'RecurrentLayer must have 1 input')
input_layer = self.get_input_layer(0)
size = input_layer.size
......@@ -2862,22 +2876,22 @@ class MDLstmLayer(LayerBase):
active_state_type="sigmoid",
bias=True,
**xargs):
super(MDLstmLayer, self).__init__(name, 'mdlstmemory', 0, inputs, **
xargs)
super(MDLstmLayer, self).__init__(name, 'mdlstmemory', 0, inputs,
**xargs)
config_assert(len(self.inputs) == 1, 'MDLstmLayer must have 1 input')
input_layer = self.get_input_layer(0)
dim_num = len(directions)
#check that input_layer.size is divisible by (3 + dim_num)
config_assert(input_layer.size %
(3 + dim_num) == 0, "size % (dim_num) should be 0!")
config_assert(input_layer.size % (3 + dim_num) == 0,
"size % (dim_num) should be 0!")
size = input_layer.size / (3 + dim_num)
self.set_layer_size(size)
self.config.active_gate_type = active_gate_type
self.config.active_state_type = active_state_type
for i in xrange(len(directions)):
self.config.directions.append(int(directions[i]))
self.create_input_parameter(0, size * size *
(3 + dim_num), [size, size, 3 + dim_num])
self.create_input_parameter(0, size * size * (3 + dim_num),
[size, size, 3 + dim_num])
#bias includes 3 kinds of peephole, 3+dim_num+2+dim_num
self.create_bias_parameter(bias, size * (5 + 2 * dim_num))
......@@ -2915,8 +2929,8 @@ class GruStepLayer(LayerBase):
active_gate_type="sigmoid",
bias=True,
**xargs):
super(GruStepLayer, self).__init__(name, 'gru_step', size, inputs, **
xargs)
super(GruStepLayer, self).__init__(name, 'gru_step', size, inputs,
**xargs)
config_assert(len(self.inputs) == 2, 'GruStepLayer must have 2 input')
input_layer0 = self.get_input_layer(0)
input_layer1 = self.get_input_layer(1)
......
......@@ -65,6 +65,7 @@ __all__ = [
'StaticInput',
'expand_layer',
'scaling_layer',
'scaling_projection',
'power_layer',
'interpolation_layer',
'bilinear_interp_layer',
......@@ -458,7 +459,7 @@ def identity_projection(input, offset=None):
:type input: LayerOutput
:param offset: Offset, None if use default.
:type offset: int
:return: A IdentityProjection or IdentityOffsetProjection Object
:return: An IdentityProjection or IdentityOffsetProjection object
:rtype: IdentityProjection or IdentityOffsetProjection
"""
if offset is None:
......@@ -471,6 +472,34 @@ def identity_projection(input, offset=None):
return proj
@wrap_param_attr_default()
def scaling_projection(input, param_attr=None):
"""
scaling_projection multiplies the input by a scalar parameter and adds the
result to the output.
.. math::
out += w * in
The example usage is:
.. code-block:: python
proj = scaling_projection(input=layer)
:param input: Input Layer.
:type input: LayerOutput
:param param_attr: Parameter config, None if use default.
:type param_attr: ParameterAttribute
:return: A ScalingProjection object
:rtype: ScalingProjection
"""
proj = ScalingProjection(input_layer_name=input.name,
**param_attr.attr)
proj.origin = input
return proj
@wrap_param_attr_default()
def dotmul_projection(input, param_attr=None):
"""
......
......@@ -26,6 +26,7 @@ with mixed_layer() as m5:
with mixed_layer() as m6:
m6 += dotmul_operator(a=m3, b=m4)
m6 += scaling_projection(m3)
img = data_layer(name='img', size=32 * 32)
flt = data_layer(name='filter', size=3 * 3 * 1 * 64)
......
......@@ -111,13 +111,23 @@ layers {
inputs {
input_layer_name: "__mixed_2__"
}
inputs {
input_layer_name: "__mixed_2__"
input_parameter_name: "___mixed_5__.w1"
proj_conf {
type: "scaling"
name: "___mixed_5__.w1"
input_size: 100
output_size: 100
}
}
inputs {
input_layer_name: "__mixed_3__"
}
operator_confs {
type: "dot_mul"
input_indices: 0
input_indices: 1
input_indices: 2
input_sizes: 100
input_sizes: 100
output_size: 100
......@@ -258,6 +268,16 @@ parameters {
initial_strategy: 0
initial_smart: false
}
parameters {
name: "___mixed_5__.w1"
size: 1
initial_mean: 0.0
initial_std: 1.0
dims: 1
dims: 1
initial_strategy: 0
initial_smart: true
}
parameters {
name: "___mixed_7__.w0"
size: 30000
......