From bf6f690f314eb170454dcbd6c3a01980c98968bb Mon Sep 17 00:00:00 2001
From: xuwei06
Date: Tue, 8 Nov 2016 14:02:58 -0800
Subject: [PATCH] Add ScalingProjection

out = w * input where w is a parameter of size 1

Change-Id: Ife682d62323ceb1a20cbbf6269421b20a862d888
---
 doc/ui/api/trainer_config_helpers/layers.rst  |  6 +++
 .../gserver/layers/FullMatrixProjection.cpp   |  4 +-
 paddle/gserver/layers/ScalingProjection.cpp   | 53 +++++++++++++++++++
 paddle/gserver/tests/test_LayerGrad.cpp       | 11 ++++
 paddle/math/BaseMatrix.cu                     | 51 ++++++++++++++----
 paddle/math/BaseMatrix.h                      | 30 +++++++++--
 paddle/math/Matrix.cpp                        | 30 ++++++-----
 python/paddle/trainer/config_parser.py        | 14 +++++
 .../paddle/trainer_config_helpers/layers.py   | 32 +++++++++++-
 .../tests/configs/projections.py              |  1 +
 .../configs/protostr/projections.protostr    | 22 +++++++-
 11 files changed, 223 insertions(+), 31 deletions(-)
 create mode 100644 paddle/gserver/layers/ScalingProjection.cpp

diff --git a/doc/ui/api/trainer_config_helpers/layers.rst b/doc/ui/api/trainer_config_helpers/layers.rst
index 8d297b0cf23..4a02af39699 100644
--- a/doc/ui/api/trainer_config_helpers/layers.rst
+++ b/doc/ui/api/trainer_config_helpers/layers.rst
@@ -191,6 +191,12 @@ embedding_layer
     :members: embedding_layer
     :noindex:
 
+scaling_projection
+------------------
+.. automodule:: paddle.trainer_config_helpers.layers
+    :members: scaling_projection
+    :noindex:
+
 dotmul_projection
 -----------------
 .. automodule:: paddle.trainer_config_helpers.layers
diff --git a/paddle/gserver/layers/FullMatrixProjection.cpp b/paddle/gserver/layers/FullMatrixProjection.cpp
index 8241cbd37ec..f17c1b05bd8 100644
--- a/paddle/gserver/layers/FullMatrixProjection.cpp
+++ b/paddle/gserver/layers/FullMatrixProjection.cpp
@@ -52,7 +52,9 @@ void FullMatrixProjection::backward(const UpdateCallback& callback) {
   }
 
   hl_set_sync_flag(syncFlag);
-  parameter_->incUpdate(callback);
+  if (weight_->getWGrad()) {
+    parameter_->incUpdate(callback);
+  }
 }
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ScalingProjection.cpp b/paddle/gserver/layers/ScalingProjection.cpp
new file mode 100644
index 00000000000..c0a7072c6a7
--- /dev/null
+++ b/paddle/gserver/layers/ScalingProjection.cpp
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Projection.h"
+
+namespace paddle {
+
+class ScalingProjection : public Projection {
+public:
+  ScalingProjection(const ProjectionConfig& config,
+                    const ParameterPtr& parameter, bool useGpu)
+      : Projection(config, parameter, useGpu) {
+    CHECK_EQ(parameter->getSize(), 1UL);
+    weight_.reset(new Weight(1, 1, parameter));
+  }
+
+  void forward() {
+    CHECK(in_->value);
+    out_->value->add(*in_->value, weight_->getW()->getElement(0, 0));
+  }
+
+  void backward(const UpdateCallback& callback) {
+    if (weight_->getWGrad()) {
+      auto sum = Matrix::create(in_->value->getHeight(), 1, false, useGpu_);
+      sum->sumOfProducts(*in_->value, *out_->grad,
+                         /* scaleSum= */1, /* scaleDest= */0);
+      weight_->getWGrad()->sumCols(*sum,
+                                   /* scaleSum= */1, /* scaleDest= */1);
+      parameter_->incUpdate(callback);
+    }
+    if (in_->grad) {
+      in_->grad->add(*out_->grad, weight_->getW()->getElement(0, 0));
+    }
+  }
+
+protected:
+  std::unique_ptr<Weight> weight_;
+};
+
+REGISTER_PROJECTION(scaling, ScalingProjection);
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index f3cd2b4faf0..a79dfe39c9b 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -135,6 +135,17 @@ TEST(Projection, identity) {
   }
 }
 
+TEST(Projection, scaling) {
+  ProjectionConfig conf;
+  conf.set_type("scaling");
+  conf.set_input_size(10);
+  conf.set_output_size(10);
+  for (auto useGpu : {false}) {
+    testProjectionGrad(conf, INPUT_DATA, /* parameterSize */ 1,
+                       /* batchSize */ 100, useGpu);
+  }
+}
+
 #ifndef PADDLE_ONLY_CPU
 TEST(Projection, conv) {
   const int NUM_FILTERS = 16;
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index d81b99e5441..c3c425a23dc 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -1451,6 +1451,8 @@ int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
   int numRows = b.height_;
   int numCols = b.width_;
+  CHECK_EQ(height_, numRows);
+  CHECK_EQ(width_, 1UL);
   aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows,
             numCols, offset, false_type(), true_type() /*aAsColVector*/);
 
@@ -1463,18 +1465,39 @@ int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
   int numRows = b.height_;
   int numCols = b.width_;
+  CHECK_EQ(height_, numRows);
+  CHECK_EQ(width_, 1UL);
   aggregate(agg, base::unary::identity(), sv,
             b, numRows, numCols, offset,
             false_type(), true_type() /*aAsColVector*/);
   return 0;
 }
 
+template<>
+template <class Agg, class Op, class Saver>
+int BaseMatrixT<real>::applyRow(Agg agg, Op op, Saver sv,
+                                BaseMatrixT& b, BaseMatrixT& c) {
+  MatrixOffset offset(0, 0, 0, 0, 0, 0);
+  int numRows = b.height_;
+  int numCols = b.width_;
+  CHECK_EQ(height_, numRows);
+  CHECK_EQ(width_, 1UL);
+  CHECK_EQ(c.height_, numRows);
+  CHECK_EQ(c.width_, numCols);
+  aggregate(agg, op, sv,
+            b, c, numRows, numCols, offset,
+            false_type(), true_type() /*aAsColVector*/);
+  return 0;
+}
+
 template<>
 template <class Agg>
 int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
   int numRows = b.height_;
   int numCols = b.width_;
+  CHECK_EQ(width_, numCols);
+  CHECK_EQ(height_, 1UL);
   aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows,
             numCols, offset, true_type() /*aAsRowVector*/, false_type());
   return 0;
@@ -1487,6 +1510,8 @@ int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
   int numRows = b.height_;
   int numCols = b.width_;
+  CHECK_EQ(width_, numCols);
+  CHECK_EQ(height_, 1UL);
   aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset,
             true_type() /*aAsRowVector*/, false_type());
 
@@ -1494,8 +1519,8 @@ int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
 }
 
 template<>
-void BaseMatrixT<real>::sumRows(BaseMatrixT& b) {
-  applyRow(aggregate::sum(), b);
+void BaseMatrixT<real>::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) {
+  applyRow(aggregate::sum(), base::binary::add2(scaleDest, scaleSum), b);
 }
 
 template<>
@@ -1524,18 +1549,22 @@ void BaseMatrixT<real>::minCols(BaseMatrixT& b) {
 }
 
 template<>
-void BaseMatrixT<real>::sumCols(BaseMatrixT& b, real scale) {
-  applyCol(aggregate::sum(), base::binary::add2(1.0, scale), b);
+void BaseMatrixT<real>::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) {
+  applyCol(aggregate::sum(), base::binary::add2(scaleDest, scaleSum), b);
 }
 
 template<>
-void BaseMatrixT<real>::sumOfSquares(BaseMatrixT& b, BaseMatrixT& c) {
-  int numRows = b.height_;
-  int numCols = b.width_;
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  aggregate(aggregate::sum(), base::binary::squaredDiff(), base::binary::add(),
-            b, c, numRows, numCols, offset, false_type(),
-            true_type() /*aAsColVector*/);
+void BaseMatrixT<real>::sumOfSquaredDiffs(
+    BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) {
+  applyRow(aggregate::sum(), base::binary::squaredDiff(),
+           base::binary::add2(scaleDest, scaleSum), b, c);
+}
+
+template<>
+void BaseMatrixT<real>::sumOfProducts(
+    BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) {
+  applyRow(aggregate::sum(), base::binary::mul(),
+           base::binary::add2(scaleDest, scaleSum), b, c);
 }
 
 template class BaseMatrixT<real>;
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index 2dd2c2c7a9b..fd1604b985d 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -305,6 +305,18 @@ public:
   template <class Agg>
   int applyRow(Agg agg, BaseMatrixT& b);
 
+  /**
+   * an aggregate expression that applies to each row of matrix b.
+   *
+   * @code
+   * for each row i & 0 <= j < b.width_, do:
+   *   dst = agg(op(b[i*ldb + j], c[i*ldc + j]))
+   *   this[i] = sv(this[i], dst)
+   * @endcode
+   */
+  template <class Agg, class Op, class Saver>
+  int applyRow(Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c);
+
   /**
    * a aggregate expression that apply each row of matrix b.
    *
@@ -920,7 +932,9 @@ public:
   void addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
 
   /// calculate the sum of each row of the matrix b.
-  void sumRows(BaseMatrixT& b);
+  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij}
+  void sumRows(BaseMatrixT& b, T scaleSum, T scaleDest);
+
   /// calculate the maximum value of each row of the matrix b.
   void maxRows(BaseMatrixT& b);
   /// calculate the minimum value of each row of the matrix b.
@@ -932,10 +946,18 @@ public:
   void maxCols(BaseMatrixT& b);
   /// calculate the minimum value of each column of the matrix b.
   void minCols(BaseMatrixT& b);
-  void sumCols(BaseMatrixT& b, T scale);
 
-  /// calculate the sum of each row of (b - c)^2.
-  void sumOfSquares(BaseMatrixT& b, BaseMatrixT& c);
+  /// calculate the sum of each column of the matrix b.
+  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji}
+  void sumCols(BaseMatrixT& b, T scaleSum, T scaleDest);
+
+  /// this_i = scaleDest * this_i + scaleSum * \sum_j (b_{ij} - c_{ij})^2
+  void sumOfSquaredDiffs(BaseMatrixT& b, BaseMatrixT& c,
+                         T scaleSum, T scaleDest);
+
+  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij}
+  void sumOfProducts(BaseMatrixT& b, BaseMatrixT& c,
+                     T scaleSum, T scaleDest);
 
   /**
    * @code
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 5ee8fbebfcf..706a598d0c3 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -242,7 +242,7 @@ real GpuMatrix::getSum() {
 void GpuMatrix::accumulateColSum(Matrix& src) {
   CHECK_EQ(getWidth(), src.getWidth());
   CHECK_EQ(getHeight(), (size_t)1);
-  sumCols(src, 1.0);
+  sumCols(src, 1.0, 1.0);
 }
 
 real GpuMatrix::getAbsSum() {
@@ -389,7 +389,7 @@ void GpuMatrix::collectBias(Matrix& a, real scale) {
   CHECK_EQ(width_, a.getWidth());
   GpuSparseMatrix* sMatPtr = dynamic_cast<GpuSparseMatrix*>(&a);
   if (!sMatPtr) {
-    sumCols(a, scale);
+    sumCols(a, /* scaleSum= */scale, /* scaleDest= */1);
   } else {
     real* data = getData();
     hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get();
@@ -589,7 +589,7 @@ void GpuMatrix::addToRows(Matrix& table, IVector& ids) {
 void GpuMatrix::colMerge(Matrix& src) {
   CHECK(src.height_ == height_);
   if (!trans_ && !src.trans_) {
-    sumRows(src);
+    sumRows(src, /* scaleSum= */1, /* scaleDest= */0);
   } else {
     LOG(FATAL) << "Is not supported";
   }
@@ -599,7 +599,7 @@ void GpuMatrix::rowSum(Matrix& sum) {
   CHECK_EQ(sum.getHeight(), getHeight());
   CHECK_EQ(sum.getWidth(), (size_t)1);
 
-  sum.sumRows(*this);
+  sum.sumRows(*this, /* scaleSum= */1, /* scaleDest= */0);
 }
 
 void GpuMatrix::rowMax(Matrix& max) {
@@ -790,7 +790,8 @@ void GpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
     LOG(FATAL) << "not supported: GpuSparseMatrix as label";
   }
 
-  BaseMatrix::sumOfSquares(output, label);
+  BaseMatrix::sumOfSquaredDiffs(output, label,
+                                /* scaleSum= */1, /* scaleDest= */1);
 }
 
 void GpuMatrix::sumOfSquaresBp(Matrix& outputV, Matrix& label) {
@@ -1501,7 +1502,7 @@ void CpuMatrix::accumulateColSum(Matrix& src) {
   CHECK_EQ(getWidth(), src.getWidth());
   CHECK_EQ(getHeight(), (size_t)1);
 
-  sumCols(src, 1.0);
+  sumCols(src, /* scaleSum= */1, /* scaleDest= */1);
 }
 
 real CpuMatrix::getAbsSum() {
@@ -2188,7 +2189,7 @@ void CpuMatrix::collectBias(Matrix& a, real scale) {
   CHECK_EQ(width_, a.getWidth());
   CpuSparseMatrix* aptr = dynamic_cast<CpuSparseMatrix*>(&a);
   if (!aptr) {
-    sumCols(a, scale);
+    sumCols(a, /* scaleSum= */scale, /* scaleDest= */1);
   } else {
     size_t nnz = aptr->getElementCnt();
     int* cols = aptr->getCols();
@@ -2227,7 +2228,7 @@ void CpuMatrix::sequenceAvgForward(Matrix& a,
   real* dst = getData();
   real* src = a.getData();
   const int* starts = startsPos.getData();
-  MatrixPtr outMtx = Matrix::create(1, 1, false, false);
+  MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false);
   MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false);
   for (size_t i = 0; i < height; i++) {
     int sequenceLength = starts[i + 1] - starts[i];
@@ -2239,13 +2240,15 @@
     dataMtx->setData(src + starts[i] * width, sequenceLength, width);
     if (mode == 0) {
       // plain average
-      outMtx->sumCols(*dataMtx, (real)1 / (real)sequenceLength);
+      outMtx->sumCols(*dataMtx, (real)1 / (real)sequenceLength,
+                      /* scaleDest= */1);
     } else if (mode == 1) {
       // sum instead of average
-      outMtx->sumCols(*dataMtx, (real)1);
+      outMtx->sumCols(*dataMtx, /* scaleSum= */1, /* scaleDest= */1);
     } else if (mode == 2) {
      // divide by square root of sequenceLength
-      outMtx->sumCols(*dataMtx, (real)1 / std::sqrt(sequenceLength));
+      outMtx->sumCols(*dataMtx, (real)1 / std::sqrt(sequenceLength),
+                      /* scaleDest= */1);
     } else {
       LOG(FATAL) << "should not reach here";
     }
@@ -2932,7 +2935,7 @@ void CpuMatrix::rowSum(Matrix& sum) {
   CHECK_EQ(sum.getHeight(), getHeight());
   CHECK_EQ(sum.getWidth(), (size_t)1);
 
-  sum.sumRows(*this);
+  sum.sumRows(*this, /* scaleSum= */1, /* scaleDest= */0);
 }
 
 void CpuMatrix::rowMaxId(IVector& maxIds) {
@@ -3485,7 +3488,8 @@ void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
     }
   }
 
-  BaseMatrix::sumOfSquares(output, label);
+  BaseMatrix::sumOfSquaredDiffs(output, label,
+                                /* scaleSum= */1, /* scaleDest= */1);
 }
 
 /* calculate the error of outputV according to label */
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 881f0b82149..3e55a9f9f56 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -592,6 +592,20 @@ class DotMulProjection(Projection):
     def calc_parameter_dims(self, input_size, output_size):
         return [1, output_size]
 
+# ScalingProjection
+@config_class
+class ScalingProjection(Projection):
+    type = 'scaling'
+
+    def calc_output_size(self, input_layer_config):
+        return input_layer_config.size
+
+    def calc_parameter_size(self, input_size, output_size):
+        return 1
+
+    def calc_parameter_dims(self, input_size, output_size):
+        return [1, 1]
+
 
 @config_class
 class TableProjection(Projection):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 796121a6413..ca5ab68c5c2 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -29,6 +29,7 @@ except ImportError:
     import pickle
 import copy
 
+<<<<<<< 0ba0f02c685e52b14632f6b9bfca4321494505c7
 __all__ = [
     "full_matrix_projection",
     "AggregateLevel",
@@ -65,6 +66,7 @@ __all__ = [
     'StaticInput',
     'expand_layer',
     'scaling_layer',
+    'scaling_projection',
     'power_layer',
     'interpolation_layer',
     'bilinear_interp_layer',
@@ -458,7 +460,7 @@ def identity_projection(input, offset=None):
    :type input: LayerOutput
    :param offset: Offset, None if use default.
    :type offset: int
-   :return: A IdentityProjection or IdentityOffsetProjection Object
+   :return: An IdentityProjection or IdentityOffsetProjection object
    :rtype: IdentityProjection or IdentityOffsetProjection
    """
    if offset is None:
@@ -471,6 +473,34 @@ def identity_projection(input, offset=None):
    return proj
 
 
+@wrap_param_attr_default()
+def scaling_projection(input, param_attr=None):
+    """
+    scaling_projection multiplies the input by a scalar parameter and adds the
+    result to the output.
+
+    .. math::
+       out += w * in
+
+    The example usage is:
+
+    .. code-block:: python
+
+       proj = scaling_projection(input=layer)
+
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param param_attr: Parameter config, None if use default.
+ :type param_attr: ParameterAttribute + :return: A ScalingProjection object + :rtype: ScalingProjection + """ + proj = ScalingProjection(input_layer_name=input.name, + **param_attr.attr) + proj.origin = input + return proj + + @wrap_param_attr_default() def dotmul_projection(input, param_attr=None): """ diff --git a/python/paddle/trainer_config_helpers/tests/configs/projections.py b/python/paddle/trainer_config_helpers/tests/configs/projections.py index 19ac6ec9061..aa4521dcd5d 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/projections.py +++ b/python/paddle/trainer_config_helpers/tests/configs/projections.py @@ -26,6 +26,7 @@ with mixed_layer() as m5: with mixed_layer() as m6: m6 += dotmul_operator(a=m3, b=m4) + m6 += scaling_projection(m3) img = data_layer(name='img', size=32 * 32) flt = data_layer(name='filter', size=3 * 3 * 1 * 64) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr index e47e531a222..2b3951c2424 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr @@ -111,13 +111,23 @@ layers { inputs { input_layer_name: "__mixed_2__" } + inputs { + input_layer_name: "__mixed_2__" + input_parameter_name: "___mixed_5__.w1" + proj_conf { + type: "scaling" + name: "___mixed_5__.w1" + input_size: 100 + output_size: 100 + } + } inputs { input_layer_name: "__mixed_3__" } operator_confs { type: "dot_mul" input_indices: 0 - input_indices: 1 + input_indices: 2 input_sizes: 100 input_sizes: 100 output_size: 100 @@ -258,6 +268,16 @@ parameters { initial_strategy: 0 initial_smart: false } +parameters { + name: "___mixed_5__.w1" + size: 1 + initial_mean: 0.0 + initial_std: 1.0 + dims: 1 + dims: 1 + initial_strategy: 0 + initial_smart: true +} parameters { name: "___mixed_7__.w0" size: 30000 -- GitLab
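
For reference, the arithmetic that ScalingProjection::forward and ScalingProjection::backward implement above, and that TEST(Projection, scaling) exercises through testProjectionGrad, can be sketched with plain NumPy. This is a minimal sketch under the assumption of dense inputs; the helper names scaling_forward and scaling_backward are invented for illustration and do not exist in the Paddle codebase.

import numpy as np

def scaling_forward(x, w, out):
    # out += w * x, mirroring out_->value->add(*in_->value, w) in forward()
    out += w * x

def scaling_backward(x, w, out_grad):
    # Mirrors backward(): the weight gradient is the sum of the elementwise
    # product of the input and the output gradient (sumOfProducts, then
    # sumCols), and the input gradient is the output gradient scaled by w.
    w_grad = np.sum(x * out_grad)
    in_grad = w * out_grad
    return w_grad, in_grad

# Finite-difference check of the weight gradient, analogous in spirit to
# what testProjectionGrad verifies for the registered "scaling" projection.
rng = np.random.default_rng(0)
x = rng.standard_normal((100, 10))
out_grad = rng.standard_normal((100, 10))
w, eps = 0.3, 1e-6
w_grad, _ = scaling_backward(x, w, out_grad)
numeric = (np.sum((w + eps) * x * out_grad) -
           np.sum((w - eps) * x * out_grad)) / (2 * eps)
assert abs(w_grad - numeric) < 1e-4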