Merge pull request #421 from emailweixu/scaling_projection

Add ScalingProjection

Merge pull request #421 from emailweixu/scaling_projection
Add ScalingProjection
2b841ec8 · Tao Luo · GitHub · 0ba0f02c · a6ad9a16 · 2b841ec8
12 changed files
--- a/doc/ui/api/trainer_config_helpers/layers.rst
+++ b/doc/ui/api/trainer_config_helpers/layers.rst
@@ -191,6 +191,12 @@ embedding_layer
    :members: embedding_layer
    :noindex:
+scaling_projection
+------------------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: scaling_projection
+    :noindex:
 dotmul_projection
 -----------------
 ..  automodule:: paddle.trainer_config_helpers.layers

--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -605,7 +605,7 @@ public:
    int batchSize = input->getHeight();
    int size = 1;
    resizeOutput(batchSize, size);
-    output_.value->sumRows(*input);
+    output_.value->sumRows(*input, /* scaleSum= */1, /* scaleDest= */0);
  }
  virtual void backward(const UpdateCallback& callback = nullptr) {

--- a/paddle/gserver/layers/FullMatrixProjection.cpp
+++ b/paddle/gserver/layers/FullMatrixProjection.cpp
@@ -52,7 +52,9 @@ void FullMatrixProjection::backward(const UpdateCallback& callback) {
  }
  hl_set_sync_flag(syncFlag);
+  if (weight_->getWGrad()) {
    parameter_->incUpdate(callback);
+  }
 }
 }  // namespace paddle
--- a/paddle/gserver/layers/ScalingProjection.cpp
+++ b/paddle/gserver/layers/ScalingProjection.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "Projection.h"
+namespace paddle {
+class ScalingProjection : public Projection {  // out += w * in, with a single scalar weight w
+public:
+  ScalingProjection(const ProjectionConfig& config,
+                    const ParameterPtr& parameter, bool useGpu)
+      : Projection(config, parameter, useGpu) {
+    CHECK_EQ(parameter->getSize(), 1UL);  // the parameter must hold exactly one element
+    weight_.reset(new Weight(1, 1, parameter));  // wrap that element as a 1 x 1 weight
+  }
+  void forward() {  // accumulates the scaled input into the output value
+    CHECK(in_->value);
+    out_->value->add(*in_->value, weight_->getW()->getElement(0, 0));  // out += w * in
+  }
+  void backward(const UpdateCallback& callback) {  // accumulates gradients for w and for the input
+    if (weight_->getWGrad()) {  // skip the weight update when there is no weight gradient
+      auto sum = Matrix::create(in_->value->getHeight(), 1, false, useGpu_);  // one entry per input row
+      sum->sumOfProducts(*in_->value, *out_->grad,
+                         /* scaleSum= */1, /* scaleDest= */0);  // sum_i = \sum_j in_ij * outGrad_ij
+      weight_->getWGrad()->sumCols(*sum,
+                                   /* scaleSum= */1, /* scaleDest= */1);  // wGrad += \sum_i sum_i
+      parameter_->incUpdate(callback);
+    }
+    if (in_->grad) {  // propagate to the input only if it has a gradient buffer
+      in_->grad->add(*out_->grad, weight_->getW()->getElement(0, 0));  // inGrad += w * outGrad
+    }
+  }
+protected:
+  std::unique_ptr<Weight> weight_;  // the 1 x 1 scaling weight created in the constructor
+};
+REGISTER_PROJECTION(scaling, ScalingProjection);
+}  // namespace paddle
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -135,6 +135,17 @@ TEST(Projection, identity) {
  }
 }
+TEST(Projection, scaling) {  // runs testProjectionGrad on the "scaling" projection
+  ProjectionConfig conf;
+  conf.set_type("scaling");
+  conf.set_input_size(10);
+  conf.set_output_size(10);  // same as the input size
+  for (auto useGpu : {false}) {  // NOTE(review): only useGpu == false is exercised here
+    testProjectionGrad(conf, INPUT_DATA, /* parameterSize */ 1,
+                       /* batchSize */ 100, useGpu);
+  }
+}
 #ifndef PADDLE_ONLY_CPU
 TEST(Projection, conv) {
  const int NUM_FILTERS = 16;

--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -1451,6 +1451,8 @@ int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
  int numRows = b.height_;
  int numCols = b.width_;
+  CHECK_EQ(height_, numRows);
+  CHECK_EQ(width_, 1UL);
  aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows,
            numCols, offset, false_type(), true_type() /*aAsColVector*/);
@@ -1463,18 +1465,69 @@ int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
  int numRows = b.height_;
  int numCols = b.width_;
+  CHECK_EQ(height_, numRows);
+  CHECK_EQ(width_, 1UL);
  aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset,
            false_type(), true_type() /*aAsColVector*/);
  return 0;
 }
+template<>
+template <class Agg>
+int BaseMatrixT<real>::applyRow(  // this[i] = scaleDest * this[i] + scaleAgg * agg(row i of b)
+     Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) {
+  if (scaleDest != 0) {
+    applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b);  // blend with the existing contents
+  } else {
+    applyRow(agg, base::binary::second(), b);  // same saver the unscaled applyRow(agg, b) uses
+    if (scaleAgg != 1) {  // skip the extra pass when no scaling is needed
+      mulScalar(scaleAgg);
+    }
+  }
+  return 0;  // always 0
+}
+template<>
+template <class Agg, class Op, class Saver>
+int BaseMatrixT<real>::applyRow(Agg agg, Op op, Saver sv,  // this[i] = sv(this[i], agg_j op(b_ij, c_ij))
+                                BaseMatrixT& b, BaseMatrixT& c) {
+  MatrixOffset offset(0, 0, 0, 0, 0, 0);
+  int numRows = b.height_;
+  int numCols = b.width_;
+  CHECK_EQ(height_, numRows);  // this needs one entry per row of b ...
+  CHECK_EQ(width_, 1UL);  // ... and must be a single column
+  CHECK_EQ(c.height_, numRows);  // c must have the same shape as b
+  CHECK_EQ(c.width_, numCols);
+  aggregate(agg, op, sv,
+            b, c, numRows, numCols, offset,
+            false_type(), true_type() /*aAsColVector*/);
+  return 0;  // always 0
+}
+template<>
+template <class Agg, class Op>
+int BaseMatrixT<real>::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg,  // scaled two-input form
+                                BaseMatrixT& b, BaseMatrixT& c) {
+  if (scaleDest != 0) {
+    applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c);  // blend with existing contents
+  } else {
+    applyRow(agg, op, base::binary::second(), b, c);  // scaleDest == 0: second() saver, no add2
+    if (scaleAgg != 1) {  // skip the extra pass when no scaling is needed
+      mulScalar(scaleAgg);
+    }
+  }
+  return 0;  // always 0
+}
 template<>
 template <class Agg>
 int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
  int numRows = b.height_;
  int numCols = b.width_;
+  CHECK_EQ(width_, numCols);
+  CHECK_EQ(height_, 1UL);
  aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows,
            numCols, offset, true_type() /*aAsRowVector*/, false_type());
@@ -1487,6 +1540,8 @@ int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
  int numRows = b.height_;
  int numCols = b.width_;
+  CHECK_EQ(width_, numCols);
+  CHECK_EQ(height_, 1UL);
  aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset,
            true_type() /*aAsRowVector*/, false_type());
@@ -1494,8 +1549,23 @@ int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
 }
 template<>
-void BaseMatrixT<real>::sumRows(BaseMatrixT& b) {
+template <class Agg>
-  applyRow(aggregate::sum(), b);
+int BaseMatrixT<real>::applyCol(
+     Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) {
+  if (scaleDest != 0) {
+    applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b);
+  } else {
+    applyCol(agg, base::binary::second(), b);
+    if (scaleAgg != 1) {
+      mulScalar(scaleAgg);
+    }
+  }
+  return 0;
+}
+template<>
+void BaseMatrixT<real>::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) {
+  applyRow(aggregate::sum(), scaleDest, scaleSum, b);
 }
 template<>
@@ -1524,18 +1594,22 @@ void BaseMatrixT<real>::minCols(BaseMatrixT& b) {
 }
 template<>
-void BaseMatrixT<real>::sumCols(BaseMatrixT& b, real scale) {
+void BaseMatrixT<real>::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) {
-  applyCol(aggregate::sum(), base::binary::add2(1.0, scale), b);
+  applyCol(aggregate::sum(), scaleDest, scaleSum, b);
 }
 template<>
-void BaseMatrixT<real>::sumOfSquares(BaseMatrixT& b, BaseMatrixT& c) {
+void BaseMatrixT<real>::sumOfSquaredDiffs(
-  int numRows = b.height_;
+    BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) {
-  int numCols = b.width_;
+  applyRow(aggregate::sum(), base::binary::squaredDiff(),
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
+           scaleDest, scaleSum, b, c);
-  aggregate(aggregate::sum(), base::binary::squaredDiff(), base::binary::add(),
+}
-            b, c, numRows, numCols, offset, false_type(),
-            true_type() /*aAsColVector*/);
+template<>
+void BaseMatrixT<real>::sumOfProducts(
+    BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) {
+  applyRow(aggregate::sum(), base::binary::mul(),
+           scaleDest, scaleSum, b, c);
 }
 template class BaseMatrixT<real>;

--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -305,6 +305,23 @@ public:
  template <class Agg>
  int applyRow(Agg agg, BaseMatrixT& b);
+  /**
+   * An aggregate expression applied to each row of matrices b and c.
+   *
+   * @code
+   * for each row i & 0 <= j < b.width_, do:
+   *   dst = agg(op(b[i*ldb + j], c[i*ldc + j]))
+   *   this[i] = sv(this[i], dst)
+   * @endcode
+   */
+  template <class Agg, class Op, class Saver>
+  int applyRow(Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c);
+  // Same as above, with special handling of sv=add2(scaleDest, scaleAgg)
+  template <class Agg, class Op>
+  int applyRow(Agg agg, Op op, real scaleDest, real scaleAgg,
+               BaseMatrixT& b, BaseMatrixT& c);
  /**
   * a aggregate expression that apply each row of matrix b.
   *
@@ -317,6 +334,10 @@ public:
  template <class Agg, class Saver>
  int applyRow(Agg agg, Saver sv, BaseMatrixT& b);
+  // Same as above, with special handling of sv=add2(scaleDest, scaleAgg)
+  template <class Agg>
+  int applyRow(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b);
  /**
   * a aggregate expression that apply each column of matrix b.
   *
@@ -340,6 +361,10 @@ public:
  template <class Agg, class Saver>
  int applyCol(Agg agg, Saver sv, BaseMatrixT& b);
+  // Same as above, with special handling of sv=add2(scaleDest, scaleAgg)
+  template <class Agg>
+  int applyCol(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b);
  bool useGpu() const { return useGpu_; }
  const T* rowBuf(size_t row) const { return data_ + width_ * row; }
@@ -920,7 +945,9 @@ public:
  void addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
  /// calculate the sum of each row of the matrix b.
-  void sumRows(BaseMatrixT& b);
+  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij}
+  void sumRows(BaseMatrixT& b, T scaleSum, T scaleDest);
  /// calculate the maximum value of each row of the matrix b.
  void maxRows(BaseMatrixT& b);
  /// calculate the minimum value of each row of the matrix b.
@@ -932,10 +959,18 @@ public:
  void maxCols(BaseMatrixT& b);
  /// calculate the minimum value of each column of the matrix b.
  void minCols(BaseMatrixT& b);
-  void sumCols(BaseMatrixT& b, T scale);
-  /// calculate the sum of each row of (b - c)^2.
+  /// calculate the sum of each column of the matrix b.
-  void sumOfSquares(BaseMatrixT& b, BaseMatrixT& c);
+  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji}
+  void sumCols(BaseMatrixT& b, T scaleSum, T scaleDest);
+  /// this_i = scaleDest * this_i + scaleSum * \sum_j (b_{ij} - c_{ij})^2
+  void sumOfSquaredDiffs(BaseMatrixT& b, BaseMatrixT& c,
+                         T scaleSum, T scaleDest);
+  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij}
+  void sumOfProducts(BaseMatrixT& b, BaseMatrixT& c,
+                     T scaleSum, T scaleDest);
  /**
   * @code

--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -242,7 +242,7 @@ real GpuMatrix::getSum() {
 void GpuMatrix::accumulateColSum(Matrix& src) {
  CHECK_EQ(getWidth(), src.getWidth());
  CHECK_EQ(getHeight(), (size_t)1);
-  sumCols(src, 1.0);
+  sumCols(src, 1.0, 1.0);
 }
 real GpuMatrix::getAbsSum() {
@@ -389,7 +389,7 @@ void GpuMatrix::collectBias(Matrix& a, real scale) {
  CHECK_EQ(width_, a.getWidth());
  GpuSparseMatrix* sMatPtr = dynamic_cast<GpuSparseMatrix*>(&a);
  if (!sMatPtr) {
-    sumCols(a, scale);
+    sumCols(a, /* scaleSum= */scale, /* scaleDest= */1);
  } else {
    real* data = getData();
    hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get();
@@ -589,7 +589,7 @@ void GpuMatrix::addToRows(Matrix& table, IVector& ids) {
 void GpuMatrix::colMerge(Matrix& src) {
  CHECK(src.height_ == height_);
  if (!trans_ && !src.trans_) {
-    sumRows(src);
+    sumRows(src, /* scaleSum= */1, /* scaleDest= */0);
  } else {
    LOG(FATAL) << "Is not supported";
  }
@@ -599,7 +599,7 @@ void GpuMatrix::rowSum(Matrix& sum) {
  CHECK_EQ(sum.getHeight(), getHeight());
  CHECK_EQ(sum.getWidth(), (size_t)1);
-  sum.sumRows(*this);
+  sum.sumRows(*this, /* scaleSum= */1, /* scaleDest= */0);
 }
 void GpuMatrix::rowMax(Matrix& max) {
@@ -790,7 +790,8 @@ void GpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
    LOG(FATAL) << "not supported: GpuSparseMatrix as label";
  }
-  BaseMatrix::sumOfSquares(output, label);
+  BaseMatrix::sumOfSquaredDiffs(output, label,
+                                /* scaleSum= */1, /* scaleDest= */1);
 }
 void GpuMatrix::sumOfSquaresBp(Matrix& outputV, Matrix& label) {
@@ -1501,7 +1502,7 @@ void CpuMatrix::accumulateColSum(Matrix& src) {
  CHECK_EQ(getWidth(), src.getWidth());
  CHECK_EQ(getHeight(), (size_t)1);
-  sumCols(src, 1.0);
+  sumCols(src, /* scaleSum= */1, /* scaleDest= */1);
 }
 real CpuMatrix::getAbsSum() {
@@ -2188,7 +2189,7 @@ void CpuMatrix::collectBias(Matrix& a, real scale) {
  CHECK_EQ(width_, a.getWidth());
  CpuSparseMatrix* aptr = dynamic_cast<CpuSparseMatrix*>(&a);
  if (!aptr) {
-    sumCols(a, scale);
+    sumCols(a, /* scaleSum= */scale, /* scaleDest= */1);
  } else {
    size_t nnz = aptr->getElementCnt();
    int* cols = aptr->getCols();
@@ -2227,7 +2228,7 @@ void CpuMatrix::sequenceAvgForward(Matrix& a,
  real* dst = getData();
  real* src = a.getData();
  const int* starts = startsPos.getData();
-  MatrixPtr outMtx = Matrix::create(1, 1, false, false);
+  MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false);
  MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false);
  for (size_t i = 0; i < height; i++) {
    int sequenceLength = starts[i + 1] - starts[i];
@@ -2239,13 +2240,15 @@ void CpuMatrix::sequenceAvgForward(Matrix& a,
    dataMtx->setData(src + starts[i] * width, sequenceLength, width);
    if (mode == 0) {
      // plain average
-      outMtx->sumCols(*dataMtx, (real)1 / (real)sequenceLength);
+      outMtx->sumCols(*dataMtx, (real)1 / (real)sequenceLength,
+                      /* scaleDest= */1);
    } else if (mode == 1) {
      // sum instead of average
-      outMtx->sumCols(*dataMtx, (real)1);
+      outMtx->sumCols(*dataMtx,  /* scaleSum= */1, /* scaleDest= */1);
    } else if (mode == 2) {
      // divide by square root of sequenceLength
-      outMtx->sumCols(*dataMtx, (real)1 / std::sqrt(sequenceLength));
+      outMtx->sumCols(*dataMtx, (real)1 / std::sqrt(sequenceLength),
+                      /* scaleDest= */1);
    } else {
      LOG(FATAL) << "should not reach here";
    }
@@ -2932,7 +2935,7 @@ void CpuMatrix::rowSum(Matrix& sum) {
  CHECK_EQ(sum.getHeight(), getHeight());
  CHECK_EQ(sum.getWidth(), (size_t)1);
-  sum.sumRows(*this);
+  sum.sumRows(*this, /* scaleSum= */1, /* scaleDest= */0);
 }
 void CpuMatrix::rowMaxId(IVector& maxIds) {
@@ -3485,7 +3488,8 @@ void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
    }
  }
-  BaseMatrix::sumOfSquares(output, label);
+  BaseMatrix::sumOfSquaredDiffs(output, label,
+                                /* scaleSum= */1, /* scaleDest= */1);
 }
 /* calculate the error of outputV according to label */

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -592,6 +592,20 @@ class DotMulProjection(Projection):
    def calc_parameter_dims(self, input_size, output_size):
        return [1, output_size]
+# ScalingProjection: config for the 'scaling' projection, which has one scalar parameter
+@config_class
+class ScalingProjection(Projection):
+    type = 'scaling'
+    def calc_output_size(self, input_layer_config):  # output size equals the input layer's size
+        return input_layer_config.size
+    def calc_parameter_size(self, input_size, output_size):  # always one element, whatever the sizes
+        return 1
+    def calc_parameter_dims(self, input_size, output_size):  # reported as a 1 x 1 parameter
+        return [1, 1]
 @config_class
 class TableProjection(Projection):

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -65,6 +65,7 @@ __all__ = [
    'StaticInput',
    'expand_layer',
    'scaling_layer',
+    'scaling_projection',
    'power_layer',
    'interpolation_layer',
    'bilinear_interp_layer',
@@ -458,7 +459,7 @@ def identity_projection(input, offset=None):
    :type input: LayerOutput
    :param offset: Offset, None if use default.
    :type offset: int
-    :return: A IdentityProjection or IdentityOffsetProjection Object
+    :return: A IdentityProjection or IdentityOffsetProjection object
    :rtype: IdentityProjection or IdentityOffsetProjection
    """
    if offset is None:
@@ -471,6 +472,34 @@ def identity_projection(input, offset=None):
    return proj
+@wrap_param_attr_default()
+def scaling_projection(input, param_attr=None):
+    """
+    scaling_projection multiplies the input with a scalar parameter and adds
+    the result to the output.
+    .. math::
+       out += w * in
+    The example usage is:
+    .. code-block:: python
+       proj = scaling_projection(input=layer)
+    :param input: Input Layer.
+    :type input: LayerOutput
+    :param param_attr: Parameter config, None if use default.
+    :type param_attr: ParameterAttribute
+    :return: A ScalingProjection object
+    :rtype: ScalingProjection
+    """
+    proj = ScalingProjection(input_layer_name=input.name,
+                             **param_attr.attr)
+    proj.origin = input
+    return proj
 @wrap_param_attr_default()
 def dotmul_projection(input, param_attr=None):
    """

--- a/python/paddle/trainer_config_helpers/tests/configs/projections.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/projections.py
@@ -26,6 +26,7 @@ with mixed_layer() as m5:
 with mixed_layer() as m6:
    m6 += dotmul_operator(a=m3, b=m4)
+    m6 += scaling_projection(m3)
 img = data_layer(name='img', size=32 * 32)
 flt = data_layer(name='filter', size=3 * 3 * 1 * 64)

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
@@ -111,13 +111,23 @@ layers {
  inputs {
    input_layer_name: "__mixed_2__"
  }
+  inputs {
+    input_layer_name: "__mixed_2__"
+    input_parameter_name: "___mixed_5__.w1"
+    proj_conf {
+      type: "scaling"
+      name: "___mixed_5__.w1"
+      input_size: 100
+      output_size: 100
+    }
+  }
  inputs {
    input_layer_name: "__mixed_3__"
  }
  operator_confs {
    type: "dot_mul"
    input_indices: 0
-    input_indices: 1
+    input_indices: 2
    input_sizes: 100
    input_sizes: 100
    output_size: 100
@@ -258,6 +268,16 @@ parameters {
  initial_strategy: 0
  initial_smart: false
 }
+parameters {
+  name: "___mixed_5__.w1"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 1.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: true
+}
 parameters {
  name: "___mixed_7__.w0"
  size: 30000