diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/cuda/include/hl_matrix_type.cuh index 51e483d1fb2ff311ae68219d823411a3d707a30e..5691a6a6255dd29f6319cb53d4a53f3836187baf 100644 --- a/paddle/cuda/include/hl_matrix_type.cuh +++ b/paddle/cuda/include/hl_matrix_type.cuh @@ -19,7 +19,6 @@ limitations under the License. */ #include "hl_base.h" #ifdef __CUDA_ARCH__ -// typedef void* vecType; #include #ifndef PADDLE_TYPE_DOUBLE typedef float4 vecType; @@ -37,4 +36,10 @@ typedef __m128d vecType; #endif #endif -#endif /* HL_MATRIX_TYPE_CUH_ */ +#ifdef __CUDA_ARCH__ +#define INLINE __device__ inline +#else +#define INLINE inline +#endif + +#endif // HL_MATRIX_TYPE_CUH_ diff --git a/paddle/cuda/include/hl_tensor_ops.h b/paddle/cuda/include/hl_tensor_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..0912083ba0aa7c95d3a902e0e83717b1c600e822 --- /dev/null +++ b/paddle/cuda/include/hl_tensor_ops.h @@ -0,0 +1,316 @@ +/** + * hl_tensor_ops.h + * + * Author: hedaoyuan (hedaoyuan@baidu.com) + * Created on: 2016-06-06 + * + * Copyright (c) Baidu.com, Inc. All Rights Reserved + * + */ + +#ifndef HL_TENSOR_OPS_H_ +#define HL_TENSOR_OPS_H_ + +#include +#include "hl_matrix_type.cuh" + +namespace hppl { +namespace unary { + +template +class add_scale{ +private: + const T p; +public: + INLINE add_scale(const T s) : p(s) {} + INLINE T operator()(const T a) const { return a + p; } +}; + +template +class sub_scale { +private: + const T p; +public: + INLINE sub_scale(const T s) : p(s) {} + INLINE T operator()(const T a) const { return a - p; } +}; + +template +class mul_scale { +private: + const T p; +public: + INLINE mul_scale(const T s) : p(s) {} + INLINE T operator()(const T a) const { return a * p; } +}; + +template +class div_scale { +private: + const T p; +public: + INLINE div_scale(const T s) : p(s) {} + INLINE T operator()(const T a) const { return a / p; } +}; + +template +class neg { +public: + INLINE T operator()(const T a) const { return -a; } +}; + +template +class exp_op { +public: + INLINE T operator()(const T a) const { return std::exp(a); } +}; + +template +class log_op { +public: + INLINE T operator()(const T a) const { return std::log(a); } +}; + +template +class sqrt_op { +public: + INLINE T operator()(const T a) const { return std::sqrt(a); } +}; + +template +class square { +public: + INLINE T operator()(const T a) const { return a * a; } +}; + +template +class reciprocal { +public: + INLINE T operator()(const T a) const { return T(1) / a; } +}; + +template +class abs { +public: + INLINE T operator()(const T a) const { return a > 0 ? a : -a; } +}; + +template +class sign { +public: + INLINE T operator()(const T a) const { return (a > 0) - (a < 0); } +}; + +template +class min { +private: + const T p; +public: + INLINE min(const T s) : p(s) {} + INLINE T operator()(const T a) const { return a > p ? p : a; } +}; + +template +class max { +private: + const T p; +public: + INLINE max(const T s) : p(s) {} + INLINE T operator()(const T a) const { return a < p ? p : a; } +}; + +template +class pow_op { +private: + const T p; +public: + INLINE pow_op(const T s) : p(s) {} + INLINE T operator()(const T a) const { return std::pow(a, p); } +}; + +template +class constant { +private: + const T p; +public: + INLINE constant(const T s) : p(s) {} + INLINE T operator()(int i) const { return p; } + INLINE T operator()(int i, int j) const { return p; } +}; + +template +class cmp_eq { +private: + const T p; +public: + INLINE cmp_eq(const T s) : p(s) {} + INLINE bool operator()(const T a) const { return a == p; } +}; + +template +class cmp_ne { +private: + const T p; +public: + INLINE cmp_ne(const T s) : p(s) {} + INLINE bool operator()(const T a) const { return a != p; } +}; + +template +class cmp_le { +private: + const T p; +public: + INLINE cmp_le(const T s) : p(s) {} + INLINE bool operator()(const T a) const { return a <= p; } +}; + +template +class cmp_lt { +private: + const T p; +public: + INLINE cmp_lt(const T s) : p(s) {} + INLINE bool operator()(const T a) const { return a < p; } +}; + +template +class cmp_ge { +private: + const T p; +public: + INLINE cmp_ge(const T s) : p(s) {} + INLINE bool operator()(const T a) const { return a >= p; } +}; + +template +class cmp_gt { +private: + const T p; +public: + INLINE cmp_gt(const T s) : p(s) {} + INLINE bool operator()(const T a) const { return a > p; } +}; + +template +class and_op { +private: + const T p; +public: + INLINE and_op(const T s) : p(s) {} + INLINE bool operator()(const T a) const { return a && p; } +}; + +template +class or_op { +private: + const T p; +public: + INLINE or_op(const T s) : p(s) {} + INLINE bool operator()(const T a) const { return a || p; } +}; + +} // namespace unary + +namespace binary { +template +class add { +public: + INLINE T operator()(const T a, const T b) const { return a + b; } +}; + +template +class add_scale { +private: + const T p1; + const T p2; +public: + INLINE add_scale(const T s1, const T s2) : p1(s1), p2(s2) {} + INLINE T operator()(const T a, const T b) const { + return p1 * a + p2 * b; + } +}; + +template +class sub { +public: + INLINE T operator()(const T a, const T b) const { return a - b; } +}; + +template +class mul { +public: + INLINE T operator()(const T a, const T b) const { return a * b; } +}; + +template +class div { +public: + INLINE T operator()(const T a, const T b) const { return a / b; } +}; + +template +class cmp_eq { +public: + INLINE bool operator()(const T a, const T b) const { return a == b; } +}; + +template +class cmp_ne { +public: + INLINE bool operator()(const T a, const T b) const { return a != b; } +}; + +template +class cmp_le { +public: + INLINE bool operator()(const T a, const T b) const { return a <= b; } +}; + +template +class cmp_lt { +public: + INLINE bool operator()(const T a, const T b) const { return a < b; } +}; + +template +class cmp_ge { +public: + INLINE bool operator()(const T a, const T b) const { return a >= b; } +}; + +template +class cmp_gt { +public: + INLINE bool operator()(const T a, const T b) const { return a > b; } +}; + +template +class and_op { +public: + INLINE bool operator()(const T a, const T b) const { return a && b; } +}; + +template +class or_op { +public: + INLINE bool operator()(const T a, const T b) const { return a || b; } +}; + +template +class min { +public: + INLINE T operator()(const T a, const T b) const { return a > b ? b : a; } +}; + +template +class max { +public: + INLINE T operator()(const T a, const T b) const { return a < b ? b : a; } +}; + +} // namespace binary +} // namespace hppl + +#endif // HL_TENSOR_OPS_H_ + diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp index 9918d20d9082ae6c07684ce05eba68c4989dd5d5..0822a32ba0b5aa143b4c87ea2e8746df5201380a 100644 --- a/paddle/gserver/activations/ActivationFunction.cpp +++ b/paddle/gserver/activations/ActivationFunction.cpp @@ -271,7 +271,7 @@ void forward(Argument& act) { /* trans */ false, useGpu(act.deviceId)); act.in->copyFrom(*act.value); - act.value->abs(*act.value); + act.value->abs2(*act.value); } void backward(Argument& act) { act.grad->absDerivative(*act.in); } @@ -290,7 +290,7 @@ void forward(Argument& act) { /* trans */ false, useGpu(act.deviceId)); act.in->copyFrom(*act.value); - act.value->square(*act.value); + act.value->square2(*act.value); } void backward(Argument& act) { act.grad->squareDerivative(*act.in); } @@ -302,7 +302,7 @@ END_DEFINE_ACTIVATION(square) * \f] */ BEGIN_DEFINE_ACTIVATION(exponential) -void forward(Argument& act) { act.value->exp(*act.value); } +void forward(Argument& act) { act.value->exp2(*act.value); } void backward(Argument& act) { act.grad->expDerivative(*act.value); } END_DEFINE_ACTIVATION(exponential) diff --git a/paddle/gserver/layers/BatchNormalizationLayer.cpp b/paddle/gserver/layers/BatchNormalizationLayer.cpp index b2921e6d40d3d5d3777fbb26fa9314aaa73f82da..f5387f8d7ea967d524ce6773387c1ab05d161183 100644 --- a/paddle/gserver/layers/BatchNormalizationLayer.cpp +++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp @@ -41,7 +41,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) { savedMean_->mulScalar(1.0 / numSamples); // E[x] tmpMat_->assign(*mat); - tmpMat_->square(); + tmpMat_->square2(); savedInvVar_->zeroMem(); savedInvVar_->accumulateColSum(*tmpMat_); savedInvVar_->mulScalar(1.0 / numSamples); // E[x^2] @@ -55,7 +55,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) { calMovingMeanAndVar(); savedInvVar_->subScalar(-EPS); - savedInvVar_->sqrt(*savedInvVar_); + savedInvVar_->sqrt2(*savedInvVar_); } void BatchNormalizationLayer::calMovingMeanAndVar() { @@ -86,7 +86,7 @@ void BatchNormalizationLayer::setMeanAndStd() { savedInvVar_->downClip(real(0.0)); savedInvVar_->subScalar(-EPS); - savedInvVar_->sqrt(*savedInvVar_); + savedInvVar_->sqrt2(*savedInvVar_); } void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) { diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp index 14ff8510f7b19dc24b7b1ba603485488ddd4979d..7dd2bcb957fa12611e9db75ed21dbc7b44e9a9a4 100644 --- a/paddle/gserver/layers/CostLayer.cpp +++ b/paddle/gserver/layers/CostLayer.cpp @@ -114,12 +114,12 @@ void MultiClassCrossEntropyWithSelfNorm::forwardImp(Matrix& output, Matrix& target) { Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_); output.rowSum(*sftMaxSum_); - sftMaxSum_->log(); + sftMaxSum_->log2(); target.oneHotCrossEntropy(output, *label.ids); target.add(*sftMaxSum_); - sftMaxSum_->square(); + sftMaxSum_->square2(); target.add(*sftMaxSum_, config_.softmax_selfnorm_alpha()); } @@ -130,12 +130,12 @@ void MultiClassCrossEntropyWithSelfNorm::backwardImp(Matrix& output, output.rowSum(*sftMaxSum_); Matrix::resizeOrCreate(sumInv_, output.getHeight(), 1, false, useGpu_); - sftMaxSum_->reciprocal(*sumInv_); + sftMaxSum_->reciprocal2(*sumInv_); outputG.oneHotCrossEntropyBp(output, *label.ids); outputG.addColumnVector(*sumInv_); - sftMaxSum_->log(); + sftMaxSum_->log2(); sumInv_->dotMul(*sumInv_, *sftMaxSum_); sumInv_->mulScalar(2 * config_.softmax_selfnorm_alpha()); diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp index 78d15c553021de6bbda210cb782c8a240cc2bf73..ed6a3537d1e207a60b46ed4ce29ea1d1f4149e2e 100644 --- a/paddle/gserver/layers/Layer.cpp +++ b/paddle/gserver/layers/Layer.cpp @@ -310,12 +310,12 @@ void Layer::showOutputStats() { auto tmpMat = dynamic_cast(outSquare.get()); min = tmpMat->getMin(); max = tmpMat->getMax(); - tmpMat->square(); + tmpMat->square2(); LOG(INFO) << "show statistics of [none zero values] in sparse matrix"; } else { min = outSquare->getMin(); max = outSquare->getMax(); - outSquare->square(); + outSquare->square2(); } real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean; std = std > 0 ? std : 0; diff --git a/paddle/gserver/layers/LinearChainCRF.cpp b/paddle/gserver/layers/LinearChainCRF.cpp index fb54fd26cf36e2b23deba7186c3dcdd0cc445870..7e699b3086e4f46700c5b4c30e228ce6e06a3074 100644 --- a/paddle/gserver/layers/LinearChainCRF.cpp +++ b/paddle/gserver/layers/LinearChainCRF.cpp @@ -61,7 +61,7 @@ real LinearChainCRF::forward(real* x, int* s, int length) { expX_->assign(*matX); // subtract max to avoid overflow or underflow expX_->mul(maxX_, ones_, (real)-1, (real)1); - expX_->exp(); + expX_->exp2(); real* a = a_->getData(); real* b = b_->getData(); @@ -70,7 +70,7 @@ real LinearChainCRF::forward(real* x, int* s, int length) { real* expX = expX_->getData(); real* maxX = maxX_->getData(); - expW_->exp(*w_); + expW_->exp2(*w_); real* expW = expW_->getData(); for (int i = 0; i < numClasses_; ++i) { diff --git a/paddle/gserver/layers/PowerLayer.cpp b/paddle/gserver/layers/PowerLayer.cpp index 44c5e6063b1aed93b3fbb175821f911ca26fac1a..967f50cda34d0d4139fdbbfd1d00938b5d4dec4f 100644 --- a/paddle/gserver/layers/PowerLayer.cpp +++ b/paddle/gserver/layers/PowerLayer.cpp @@ -100,7 +100,7 @@ void PowerLayer::backward(const UpdateCallback& callback) { Matrix::resizeOrCreate(tmpMtx, batchSize, dataDim, false, useGpu_); if (inG0) { - tmpMtx->log(*inV1); + tmpMtx->log2(*inV1); tmpMtx->dotMul(*tmpMtx, *outV); // inG0 += outG .* (log(inV1) * outV) diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu index 8b888b1ee5e46ec5cac316d9f90095a7e314ae13..62ddfe6520cc1769776502eb257b756e516a82db 100644 --- a/paddle/math/BaseMatrix.cu +++ b/paddle/math/BaseMatrix.cu @@ -355,11 +355,11 @@ void BaseMatrixT::neg() { applyUnary(unary::Neg()); } DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a)); template<> -void BaseMatrixT::exp() { applyUnary(unary::Exp()); } +void BaseMatrixT::exp2() { applyUnary(unary::Exp()); } DEFINE_MATRIX_UNARY_OP(Log, a = log(a)); template<> -void BaseMatrixT::log() { +void BaseMatrixT::log2() { if (useGpu_) { applyUnary(unary::Log()); } else { @@ -369,23 +369,23 @@ void BaseMatrixT::log() { DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a)); template<> -void BaseMatrixT::sqrt() { applyUnary(unary::Sqrt()); } +void BaseMatrixT::sqrt2() { applyUnary(unary::Sqrt()); } DEFINE_MATRIX_UNARY_OP(Square, a = a * a); template -void BaseMatrixT::square() { applyUnary(unary::Square()); } +void BaseMatrixT::square2() { applyUnary(unary::Square()); } DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a); template -void BaseMatrixT::reciprocal() { applyUnary(unary::Reciprocal()); } +void BaseMatrixT::reciprocal2() { applyUnary(unary::Reciprocal()); } DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a); template -void BaseMatrixT::abs() { applyUnary(unary::Abs()); } +void BaseMatrixT::abs2() { applyUnary(unary::Abs()); } DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0)); template -void BaseMatrixT::sign() { applyUnary(unary::Sign()); } +void BaseMatrixT::sign2() { applyUnary(unary::Sign()); } DEFINE_MATRIX_UNARY_OP(Zero, a = 0); template @@ -405,7 +405,7 @@ void BaseMatrixT::one() { applyUnary(unary::One()); } DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p)); template<> -void BaseMatrixT::pow(real p) { +void BaseMatrixT::pow2(real p) { if (useGpu_) { applyUnary(unary::Pow(p)); } else { @@ -534,7 +534,7 @@ void BaseMatrixT::add(BaseMatrixT& b, T p) { DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p)); template<> -void BaseMatrixT::pow(BaseMatrixT& b, real p) { +void BaseMatrixT::pow2(BaseMatrixT& b, real p) { if (useGpu_) { applyBinary(binary::Pow(p), b); } else { @@ -615,7 +615,7 @@ void BaseMatrixT::breluDerivative(BaseMatrixT& b) { DEFINE_MATRIX_BINARY_OP(Square, b = a * a); template -void BaseMatrixT::square(BaseMatrixT& b) { +void BaseMatrixT::square2(BaseMatrixT& b) { applyBinary(binary::Square(), b); } @@ -654,7 +654,7 @@ void BaseMatrixT::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) { DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a); template -void BaseMatrixT::reciprocal(BaseMatrixT& b) { +void BaseMatrixT::reciprocal2(BaseMatrixT& b) { applyBinary(binary::Reciprocal(), b); } @@ -666,7 +666,7 @@ void BaseMatrixT::reciprocalDerivative(BaseMatrixT& b) { DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a); template -void BaseMatrixT::abs(BaseMatrixT& b) { applyBinary(binary::Abs(), b); } +void BaseMatrixT::abs2(BaseMatrixT& b) { applyBinary(binary::Abs(), b); } DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0); template @@ -726,17 +726,19 @@ void BaseMatrixT::expDerivative(BaseMatrixT& b) { DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f); template -void BaseMatrixT::sign(BaseMatrixT& b) { applyBinary(binary::Sign(), b); } +void BaseMatrixT::sign2(BaseMatrixT& b) { + applyBinary(binary::Sign(), b); +} DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b)); template<> -void BaseMatrixT::exp(BaseMatrixT& b) { +void BaseMatrixT::exp2(BaseMatrixT& b) { applyBinary(binary::Exp(), b); } DEFINE_MATRIX_BINARY_OP(Log, a = log(b)); template<> -void BaseMatrixT::log(BaseMatrixT& b) { +void BaseMatrixT::log2(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Log(), b); } else { @@ -746,7 +748,7 @@ void BaseMatrixT::log(BaseMatrixT& b) { DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b)); template<> -void BaseMatrixT::sqrt(BaseMatrixT& b) { +void BaseMatrixT::sqrt2(BaseMatrixT& b) { applyBinary(binary::Sqrt(), b); } @@ -1062,7 +1064,7 @@ void BaseMatrixT::biggerThan(BaseMatrixT& b, DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c); template -void BaseMatrixT::max(BaseMatrixT& b, BaseMatrixT& c) { // NOLINT +void BaseMatrixT::max2(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Max(), b, c); } @@ -1165,7 +1167,7 @@ void BaseMatrixT::reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER, a = 1 / (p1 * b + p2)); template -void BaseMatrixT::reciprocal(BaseMatrixT& b, T p1, T p2) { +void BaseMatrixT::reciprocal2(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::Reciprocal2(p1, p2), b); } diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h index 2dd2c2c7a9b985924d53cb3bf8840eb1e55eee3e..7237f843322b18c1ed6d794e73a315ef75a8a799 100644 --- a/paddle/math/BaseMatrix.h +++ b/paddle/math/BaseMatrix.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include "paddle/utils/TypeDefs.h" +#include "TensorExpression.h" namespace paddle { @@ -66,7 +67,7 @@ public: }; template -class BaseMatrixT { +class BaseMatrixT : public TensorExpression, T> { public: size_t height_, width_; size_t stride_; @@ -351,14 +352,14 @@ public: * */ void neg(); - void exp(); - void pow(T p); - void log(); - void sqrt(); - void square(); - void reciprocal(); - void abs(); - void sign(); + void exp2(); + void pow2(T p); + void log2(); + void sqrt2(); + void square2(); + void reciprocal2(); + void abs2(); + void sign2(); void zero(); /** @@ -516,7 +517,7 @@ public: * b = this * this * @endcode */ - void square(BaseMatrixT& b); + void square2(BaseMatrixT& b); void squareDerivative(BaseMatrixT& b); /** @@ -540,7 +541,7 @@ public: * b = 1.0f / this * @endcode */ - void reciprocal(BaseMatrixT& b); + void reciprocal2(BaseMatrixT& b); void reciprocalDerivative(BaseMatrixT& b); /** @@ -548,7 +549,7 @@ public: * b = this > 0.0f ? this : -this * @endcode */ - void abs(BaseMatrixT& b); + void abs2(BaseMatrixT& b); void absDerivative(BaseMatrixT& b); /** @@ -566,12 +567,12 @@ public: */ void expDerivative(BaseMatrixT& b); - void sign(BaseMatrixT& b); + void sign2(BaseMatrixT& b); - void exp(BaseMatrixT& b); - void pow(BaseMatrixT& b, T p); - void log(BaseMatrixT& b); - void sqrt(BaseMatrixT& b); + void exp2(BaseMatrixT& b); + void pow2(BaseMatrixT& b, T p); + void log2(BaseMatrixT& b); + void sqrt2(BaseMatrixT& b); void addScalar(BaseMatrixT& b, T p); void subScalar(BaseMatrixT& b, T p); void mulScalar(BaseMatrixT& b, T p); @@ -742,7 +743,7 @@ public: * this = b>c ? b : c * @endcode */ - void max(BaseMatrixT& b, BaseMatrixT& c); // NOLINT + void max2(BaseMatrixT& b, BaseMatrixT& c); /** * @code @@ -837,7 +838,7 @@ public: * this = 1 / (p1 * b + p2) * @endcode */ - void reciprocal(BaseMatrixT& b, T p1, T p2); + void reciprocal2(BaseMatrixT& b, T p1, T p2); /** * @code @@ -953,6 +954,32 @@ public: virtual bool isSparse() const { return false; } + + template + void operator=(const ExpressionType& expr) { + if (useGpu_) { + TensorGpuApply(*this, expr); + } else { + TensorCpuApply(*this, expr); + } + } + + template + void operator+=(const ExpressionType& expr) { + (*this) = (*this) + expr; + } + template + void operator-=(const ExpressionType& expr) { + (*this) = (*this) - expr; + } + template + void operator*=(const ExpressionType& expr) { + (*this) = (*this) * expr; + } + template + void operator/=(const ExpressionType& expr) { + (*this) = (*this) / expr; + } }; typedef BaseMatrixT BaseMatrix; diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt index db305812a7c036177022836d877661c8f83e999f..1bb271ba80e1aa899f967afe27d6f5b8cf71eecd 100644 --- a/paddle/math/CMakeLists.txt +++ b/paddle/math/CMakeLists.txt @@ -16,10 +16,12 @@ file(GLOB MATH_HEADERS . *.h) file(GLOB MATH_SOURCES . *.cpp) set(MATH_SOURCES "${PROJ_ROOT}/paddle/math/BaseMatrix.cu" + "${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu" ${MATH_SOURCES}) if(NOT WITH_GPU) # then compile BaseMatrix.cu as c++ file compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/BaseMatrix.cu") + compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu") add_library(paddle_math STATIC ${MATH_SOURCES}) else() diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h index fd3b5030bea7ac937d9cf828e29d3441446a65f6..93f62fa9487417251eb7d8bf8bc6f62c486fb23b 100644 --- a/paddle/math/CpuSparseMatrix.h +++ b/paddle/math/CpuSparseMatrix.h @@ -125,7 +125,7 @@ public: return sum; } - virtual void square() { + virtual void square2() { CHECK(isContiguous()); if (valueType_ == NO_VALUE) { return; diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index 25104fe1c6d70afbf39ab47a17ce0bf21a121427..06e72fc34d7d0de29c52cbf34c023443a4f7eca6 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -930,6 +930,15 @@ public: virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { LOG(FATAL) << "Not implemented"; } + + template + void operator=(const ExpressionType& expr) { + if (useGpu_) { + TensorGpuApply(*this, expr); + } else { + TensorCpuApply(*this, expr); + } + } }; inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) { @@ -1191,6 +1200,11 @@ public: int contextLength, int contextStart, int totalPad, size_t beginPad); + + template + void operator=(const ExpressionType& expr) { + TensorGpuApply(*this, expr); + } }; class CpuMatrix : public Matrix { @@ -1469,6 +1483,11 @@ public: void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); void classificationErrorMulti(Matrix& output, Matrix& label, real threshold); + + template + void operator=(const ExpressionType& expr) { + TensorCpuApply(*this, expr); + } }; class SharedCpuMatrix : public CpuMatrix { @@ -1504,6 +1523,7 @@ public: void add(real p1, real p2); private: + using Matrix::mul; void initShared(int blockNum); void initBlock(int blockNum); diff --git a/paddle/math/TensorApply.h b/paddle/math/TensorApply.h new file mode 100644 index 0000000000000000000000000000000000000000..4f19d5232a4d30195e679114c2547a9572e4d13b --- /dev/null +++ b/paddle/math/TensorApply.h @@ -0,0 +1,226 @@ +/** + * TensorApply.h + * + * Author: hedaoyuan (hedaoyuan@baidu.com) + * Created on: 2016-06-06 + * + * Copyright (c) Baidu.com, Inc. All Rights Reserved + * + */ + +#pragma once + +namespace paddle { + +/** + * \brief The tensor evaluator classes. + */ +template +class TensorApply { +public: + explicit INLINE TensorApply(const Derived& p) + : data_(p.data_), stride_(p.stride_), + height_(p.height_), width_(p.width_), useGpu_(p.useGpu_) {} + + INLINE T apply(int i, int j) const { + return data_[i * stride_ + j]; + } + INLINE T apply(int index) const { + return data_[index]; + } + INLINE T& applyRef(int i, int j) { + return data_[i * stride_ + j]; + } + INLINE T& applyRef(int index) { + return data_[index]; + } + + INLINE size_t getWidth() const { return width_; } + INLINE size_t getHeight() const { return height_; } + INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; } + INLINE bool useGpu() const { return useGpu_; } + + T* data_; + size_t stride_; + size_t height_; + size_t width_; + bool useGpu_; +}; + +/** + * \brief The tensor evaluator classes. + * + * evaluator for rvalues + */ +template +class TensorApply { +public: + explicit INLINE TensorApply(const Derived& p) + : data_(p.data_), stride_(p.stride_), + height_(p.height_), width_(p.width_), useGpu_(p.useGpu_) {} + + INLINE T apply(int i, int j) const { + return data_[i * stride_ + j]; + } + INLINE T apply(int index) const { + return data_[index]; + } + + INLINE size_t getWidth() const { return width_; } + INLINE size_t getHeight() const { return height_; } + INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; } + INLINE bool useGpu() const { return useGpu_; } + + const T* data_; + size_t stride_; + size_t height_; + size_t width_; + bool useGpu_; +}; + +template +class TensorApply, T> { +public: + explicit TensorApply(const TensorExpression& expr) + : expr_(expr.derived()) {} + + INLINE T apply(int i, int j) const { + return expr_.apply(i, j); + } + INLINE T apply(int index) const { + return expr_.apply(index); + } + + INLINE size_t getWidth() const { return expr_.getWidth(); } + INLINE size_t getHeight() const { return expr_.getHeight(); } + INLINE bool isContiguous() const { return expr_.isContiguous(); } + INLINE bool useGpu() const { return expr_.useGpu(); } + + TensorApply expr_; +}; + +/** + * \brief The unary expression evaluator classes. + */ +template +class TensorApply, T> { +public: + explicit INLINE TensorApply(const TensorUnaryOp& expr) + : op_(expr.op_), expr_(expr.expr_) {} + + INLINE T apply(int i, int j) const { + return op_(expr_.apply(i, j)); + } + INLINE T apply(int index) const { + return op_(expr_.apply(index)); + } + + INLINE size_t getWidth() const { return expr_.getWidth(); } + INLINE size_t getHeight() const { return expr_.getHeight(); } + INLINE bool isContiguous() const { return expr_.isContiguous(); } + INLINE bool useGpu() const { return expr_.useGpu(); } + + const OP op_; + TensorApply expr_; +}; + +/** + * \brief The binary expression evaluator classes. + */ +template +class TensorApply, T> { +public: + explicit INLINE TensorApply( + const TensorBinaryOp& expr) + : op_(expr.op_), lhs_(expr.lhs_), rhs_(expr.rhs_) { + #ifndef __CUDA_ARCH__ + CHECK_EQ(lhs_.getWidth(), rhs_.getWidth()); + CHECK_EQ(lhs_.getHeight(), rhs_.getHeight()); + CHECK_EQ(lhs_.useGpu(), rhs_.useGpu()); + #endif + } + + INLINE T apply(int i, int j) const { + return op_(lhs_.apply(i, j), rhs_.apply(i, j)); + } + INLINE T apply(int index) const { + return op_(lhs_.apply(index), rhs_.apply(index)); + } + + INLINE size_t getWidth() const { return lhs_.getWidth(); } + INLINE size_t getHeight() const { return rhs_.getHeight(); } + INLINE bool isContiguous() const { + return lhs_.isContiguous() && rhs_.isContiguous(); + } + INLINE bool useGpu() const { return lhs_.useGpu(); } + + const OP op_; + TensorApply lhs_; + TensorApply rhs_; +}; + +/** + * \brief The ternary expression evaluator classes. + */ +template +class TensorApply, T> { +public: + explicit INLINE TensorApply( + const TensorTernaryOp& expr) + : expr1_(expr.expr1_), expr2_(expr.expr2_), expr3_(expr.expr3_) { + #ifndef __CUDA_ARCH__ + CHECK_EQ(expr1_.getWidth(), expr2_.getWidth()); + CHECK_EQ(expr1_.getWidth(), expr3_.getWidth()); + CHECK_EQ(expr1_.getHeight(), expr2_.getHeight()); + CHECK_EQ(expr1_.getHeight(), expr3_.getHeight()); + CHECK_EQ(expr1_.useGpu(), expr2_.useGpu()); + CHECK_EQ(expr1_.useGpu(), expr3_.useGpu()); + #endif + } + + INLINE T apply(int i, int j) const { + return expr1_.apply(i, j) ? expr2_.apply(i, j) : expr3_.apply(i, j); + } + INLINE T apply(int index) const { + return expr1_.apply(index) ? expr2_.apply(index) : expr3_.apply(index); + } + + INLINE size_t getWidth() const { return expr1_.getWidth(); } + INLINE size_t getHeight() const { return expr1_.getHeight(); } + INLINE bool isContiguous() const { + return expr1_.isContiguous() && + expr2_.isContiguous() && expr3_.isContiguous(); + } + INLINE bool useGpu() const { return expr1_.useGpu(); } + + TensorApply expr1_; + TensorApply expr2_; + TensorApply expr3_; +}; + +/** + * \brief The const expression evaluator classes. + */ +template +class TensorApply, T> { +public: + explicit INLINE TensorApply(const TensorConstant& expr) + : op_(expr.op_), expr_(expr.expr_) {} + + INLINE T apply(int i, int j) const { + return op_(i, j); + } + INLINE T apply(int index) const { + return op_(index); + } + + INLINE size_t getWidth() const { return expr_.getWidth(); } + INLINE size_t getHeight() const { return expr_.getHeight(); } + INLINE bool isContiguous() const { return true; } + INLINE bool useGpu() const { return expr_.useGpu(); } + + const OP op_; + TensorApply expr_; +}; + +} // namespace paddle diff --git a/paddle/math/TensorEvaluate.h b/paddle/math/TensorEvaluate.h new file mode 100644 index 0000000000000000000000000000000000000000..87f32996e7897524ac9ee1733d06e9bff616119b --- /dev/null +++ b/paddle/math/TensorEvaluate.h @@ -0,0 +1,104 @@ +/** + * TensorEvaluate.h + * + * Author: hedaoyuan (hedaoyuan@baidu.com) + * Created on: 2016-06-06 + * + * Copyright (c) Baidu.com, Inc. All Rights Reserved + * + */ + +#pragma once + +#include +#include "paddle/utils/Logging.h" +#include "hl_base.h" + +namespace paddle { + +/** + * \brief The tensor cpu evaluate api. + */ +template +inline void TensorCpuApply(LeftType& lhs, const RightType& rhs) { + TensorApply lhs_(lhs); + TensorApply rhs_(rhs); + CHECK_EQ(lhs_.getWidth(), rhs_.getWidth()); + CHECK_EQ(lhs_.getHeight(), rhs_.getHeight()); + CHECK_EQ(lhs_.useGpu(), rhs_.useGpu()); + + if (lhs_.isContiguous() && rhs_.isContiguous()) { + int size = lhs_.getHeight() * lhs_.getWidth(); + for (int index = 0; index < size; index++) { + lhs_.applyRef(index) = rhs_.apply(index); + } + } else { + for (size_t i = 0; i < lhs_.getHeight(); i++) { + for (size_t j = 0; j < lhs_.getWidth(); j++) { + lhs_.applyRef(i, j) = rhs_.apply(i, j); + } + } + } +} + +#ifdef __NVCC__ +template +__global__ +void TensorElementWiseOp(LeftType lhs, RightType rhs, const int border) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < border) { + lhs.applyRef(idx) = rhs.apply(idx); + } +} + +template +__global__ void TensorElementWiseOp(LeftType lhs, RightType rhs) { + const int colIdx = blockIdx.x * blockDim.x + threadIdx.x; + const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y; + for (int i = rowIdx; i < lhs.getHeight(); i += gridDim.y * blockDim.y) { + for (int j = colIdx; j < lhs.getWidth(); j += gridDim.x * blockDim.x) { + lhs.applyRef(i, j) = rhs.apply(i, j); + } + } +} + +/** + * \brief The tensor gpu evaluate api. + */ +template +inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) { + TensorApply lhs_(lhs); + TensorApply rhs_(rhs); + CHECK_EQ(lhs_.getWidth(), rhs_.getWidth()); + CHECK_EQ(lhs_.getHeight(), rhs_.getHeight()); + CHECK_EQ(lhs_.useGpu(), rhs_.useGpu()); + + int dimM = lhs_.getHeight(); + int dimN = lhs_.getWidth(); + + if (lhs_.isContiguous() && rhs_.isContiguous()) { + int size = dimM * dimN; + int blockSize = size <= 1024 ? size : 1024; + int gridSize = (size + 1024 - 1) / 1024; + TensorElementWiseOp + <<>>(lhs_, rhs_, size); + } else { + int blockSizeY = std::min(32, dimM); + int blockSizeX = (32 / blockSizeY) * 32; + int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX); + int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY); + dim3 threads(blockSizeX, blockSizeY); + dim3 grid(gridSizeX, gridSizeY); + TensorElementWiseOp + <<>>(lhs_, rhs_); + } + + CHECK_SYNC("TensorGpuApply failed"); +} +#else +template +inline void TensorGpuApply(LeftType& lhs, RightType& rhs) { +} +#endif + +} // namespace paddle diff --git a/paddle/math/TensorExpression.h b/paddle/math/TensorExpression.h new file mode 100644 index 0000000000000000000000000000000000000000..f9ebd8b68fbe2f5fa51af5f50fb9fc0e4c3b1176 --- /dev/null +++ b/paddle/math/TensorExpression.h @@ -0,0 +1,409 @@ +/** + * TensorExpression.h + * + * Author: hedaoyuan (hedaoyuan@baidu.com) + * Created on: 2016-06-06 + * + * Copyright (c) Baidu.com, Inc. All Rights Reserved + * + */ + +#pragma once +#include +#include +#include "paddle/utils/TypeDefs.h" +#include "paddle/utils/Logging.h" +#include "hl_tensor_ops.h" + +namespace paddle { + +template class TensorConstant; +template class TensorUnaryOp; +template< +class OP, typename LhsType, typename RhsType, class T> class TensorBinaryOp; +template< +typename ExprType1, +typename ExprType2, +typename ExprType3, +class T> class TensorTernaryOp; + +/** + * \brief Tensor base class. + * + * This is the base class of all Tensor and Expression class. + */ +template +class TensorExpression { +public: + /** + * Element wise unary expression. + */ + template + const TensorUnaryOp + unaryExpression(const UnaryOp& op) const { + return TensorUnaryOp(op, derived()); + } + + const TensorUnaryOp, const Derived, T> + operator+(T p) const { + return unaryExpression(hppl::unary::add_scale(p)); + } + + const TensorUnaryOp, const Derived, T> + operator-(T p) const { + return unaryExpression(hppl::unary::sub_scale(p)); + } + + const TensorUnaryOp, const Derived, T> + operator*(T p) const { + return unaryExpression(hppl::unary::mul_scale(p)); + } + + const TensorUnaryOp, const Derived, T> + operator/(T p) const { + return unaryExpression(hppl::unary::div_scale(p)); + } + + const TensorUnaryOp, const Derived, T> + operator-() const { + return unaryExpression(hppl::unary::neg()); + } + + const TensorUnaryOp, const Derived, T> + exp() const { + return unaryExpression(hppl::unary::exp_op()); + } + + const TensorUnaryOp, const Derived, T> + log() const { + return unaryExpression(hppl::unary::log_op()); + } + + const TensorUnaryOp, const Derived, T> + sqrt() const { + return unaryExpression(hppl::unary::sqrt_op()); + } + + const TensorUnaryOp, const Derived, T> + square() const { + return unaryExpression(hppl::unary::square()); + } + + const TensorUnaryOp, const Derived, T> + reciprocal() const { + return unaryExpression(hppl::unary::reciprocal()); + } + + const TensorUnaryOp, const Derived, T> + abs() const { + return unaryExpression(hppl::unary::abs()); + } + + const TensorUnaryOp, const Derived, T> + sign() const { + return unaryExpression(hppl::unary::sign()); + } + + const TensorUnaryOp, const Derived, T> + pow(T p) const { + return unaryExpression(hppl::unary::pow_op(p)); + } + + const TensorUnaryOp, const Derived, T> + min(T p) const { + return unaryExpression(hppl::unary::min(p)); + } + + const TensorUnaryOp, const Derived, T> + max(T p) const { + return unaryExpression(hppl::unary::max(p)); + } + + const TensorUnaryOp, const Derived, T> + operator==(T p) const { + return unaryExpression(hppl::unary::cmp_eq(p)); + } + + const TensorUnaryOp, const Derived, T> + operator!=(T p) const { + return unaryExpression(hppl::unary::cmp_ne(p)); + } + + const TensorUnaryOp, const Derived, T> + operator<=(T p) const { + return unaryExpression(hppl::unary::cmp_le(p)); + } + + const TensorUnaryOp, const Derived, T> + operator<(T p) const { + return unaryExpression(hppl::unary::cmp_lt(p)); + } + + const TensorUnaryOp, const Derived, T> + operator>=(T p) const { + return unaryExpression(hppl::unary::cmp_ge(p)); + } + + const TensorUnaryOp, const Derived, T> + operator>(T p) const { + return unaryExpression(hppl::unary::cmp_gt(p)); + } + + const TensorUnaryOp, const Derived, T> + operator&&(T p) const { + return unaryExpression(hppl::unary::and_op(p)); + } + + const TensorUnaryOp, const Derived, T> + operator||(T p) const { + return unaryExpression(hppl::unary::or_op(p)); + } + + /** + * Element wise binary expression. + */ + template + const TensorBinaryOp + binaryExpression(const BinaryOp& op, const ExpressionType& expr) const { + return TensorBinaryOp( + op, derived(), expr); + } + + template + const TensorBinaryOp< + hppl::binary::cmp_eq, const Derived, const ExpressionType, T> + operator==(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::cmp_eq(), expr); + } + + template + const TensorBinaryOp< + hppl::binary::cmp_ne, const Derived, const ExpressionType, T> + operator!=(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::cmp_ne(), expr); + } + + template + const TensorBinaryOp< + hppl::binary::cmp_le, const Derived, const ExpressionType, T> + operator<=(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::cmp_le(), expr); + } + + template + const TensorBinaryOp< + hppl::binary::cmp_lt, const Derived, const ExpressionType, T> + operator<(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::cmp_lt(), expr); + } + + template + const TensorBinaryOp< + hppl::binary::cmp_ge, const Derived, const ExpressionType, T> + operator>=(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::cmp_ge(), expr); + } + + template + const TensorBinaryOp< + hppl::binary::cmp_gt, const Derived, const ExpressionType, T> + operator>(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::cmp_gt(), expr); + } + + template + const TensorBinaryOp< + hppl::binary::and_op, const Derived, const ExpressionType, T> + operator&&(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::and_op(), expr); + } + + template + const TensorBinaryOp< + hppl::binary::or_op, const Derived, const ExpressionType, T> + operator||(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::or_op(), expr); + } + + template + const TensorBinaryOp< + hppl::binary::add, const Derived, const ExpressionType, T> + operator+(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::add(), expr); + } + + template + const TensorBinaryOp< + hppl::binary::sub, const Derived, const ExpressionType, T> + operator-(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::sub(), expr); + } + + template + const TensorBinaryOp< + hppl::binary::mul, const Derived, const ExpressionType, T> + operator*(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::mul(), expr); + } + + template + const TensorBinaryOp< + hppl::binary::div, const Derived, const ExpressionType, T> + operator/(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::div(), expr); + } + + template + const TensorBinaryOp< + hppl::binary::min, const Derived, const ExpressionType, T> + min(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::min(), expr); + } + + template + const TensorBinaryOp< + hppl::binary::max, const Derived, const ExpressionType, T> + max(const ExpressionType& expr) const { + return binaryExpression(hppl::binary::max(), expr); + } + + /** + * Element wise ternary expression. + * + * ternary conditional operator(?: operator). + * The conditional expression returns one of two values depending on + * the result of derived expression. + * If derived expression evaluates to true, then expression1 is evaluated. + * If derived expression evaluates to false, then expression2 is evaluated. + */ + template + const TensorTernaryOp + condition(const ExprType1& expr1, const ExprType2& expr2) const { + return TensorTernaryOp + (derived(), expr1, expr2); + } + + template + const TensorTernaryOp< + const Derived, + const TensorConstant, const Derived, T>, + const ExprType, + T> + condition(T p, const ExprType& expr) const { + return condition(constant(p), expr); + } + + template + const TensorTernaryOp< + const Derived, + const ExprType, + const TensorConstant, const Derived, T>, + T> + condition(const ExprType& expr, T p) const { + return condition(expr, constant(p)); + } + + const TensorTernaryOp< + const Derived, + const TensorConstant, const Derived, T>, + const TensorConstant, const Derived, T>, + T> + condition(T p1, T p2) const { + return condition(constant(p1), constant(p2)); + } + + const TensorConstant, const Derived, T> + constant(T p) const { + return TensorConstant, const Derived, T> + (hppl::unary::constant(p), derived()); + } + +protected: + const Derived& derived() const { return *static_cast(this); } +}; + +/** + * \brief Unary Operator Expression + */ +template +class TensorUnaryOp + : public TensorExpression, T> { +public: + explicit TensorUnaryOp(const OP op, const ExprType& expr) + : op_(op), expr_(expr) {} + + const OP op_; + const ExprType expr_; +}; + +/** + * \brief Binary Operator Expression + */ +template +class TensorBinaryOp + : public TensorExpression, T> { +public: + explicit TensorBinaryOp(const OP op, const LhsType& lhs, const RhsType& rhs) + : op_(op), lhs_(lhs), rhs_(rhs) {} + + const OP op_; + const LhsType lhs_; + const RhsType rhs_; +}; + +/** + * \brief Ternary Operator Expression + */ +template +class TensorTernaryOp + : public TensorExpression< + TensorTernaryOp, T> { +public: + explicit TensorTernaryOp( + const ExprType1& expr1, const ExprType2& expr2, const ExprType3& expr3) + : expr1_(expr1), expr2_(expr2), expr3_(expr3) {} + + const ExprType1 expr1_; + const ExprType2 expr2_; + const ExprType3 expr3_; +}; + +/** + * \brief Constant Expression + */ +template +class TensorConstant + : public TensorExpression, T> { +public: + explicit TensorConstant(const OP op, const ExprType& expr) + : op_(op), expr_(expr) {} + + const OP op_; + const ExprType expr_; +}; + +/** + * \brief operator+ overload + * \return a unary operator expression + */ +template +const TensorUnaryOp, const Derived, T> +operator+(T p, const TensorExpression& expr) { + return expr + p; +} + +/** + * \brief operator* overload + * \return a unary operator expression + */ +template +const TensorUnaryOp, const Derived, T> +operator*(T p, const TensorExpression& expr) { + return expr * p; +} + +} // namespace paddle + +#include "TensorApply.h" +#include "TensorEvaluate.h" + diff --git a/paddle/math/TrainingAlgorithmOp.cu b/paddle/math/TrainingAlgorithmOp.cu new file mode 100644 index 0000000000000000000000000000000000000000..7fae2c7bdd7164552e12c7b5a9fd764688712910 --- /dev/null +++ b/paddle/math/TrainingAlgorithmOp.cu @@ -0,0 +1,182 @@ +/** + * TrainingAlgorithmOp.cu + * + * Author: hedaoyuan (hedaoyuan@baidu.com) + * Created on: 2016-06-29 + * + * Copyright (c) Baidu.com, Inc. All Rights Reserved + * + */ + +#include "paddle/utils/Logging.h" +#include "BaseMatrix.h" +#include "TrainingAlgorithmOp.h" + +namespace paddle { + +void sparseMomentumApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& momU, + BaseMatrix& momV, + real alpha, + real beta, + real gamma, + real tau, + real learningRate) { + /** + * \alpha_t = \alpha_{t-1} / k + * \beta_t = \beta_{t-1} / (1 + \lambda\gamma_t) + * u_t = u_{t-1} - \alpha_t \gamma_t g_t + * v_t = v_{t-1} + \tau_{t-1} \alpha_t \gamma_t g_t + * \tau_t = \tau_{t-1} + \beta_t / \alpha_t + */ + momU -= (alpha * gamma * learningRate) * grad; + momV += (tau * alpha * gamma * learningRate) * grad; + value = (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV; +} + +void adadeltaApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, + BaseMatrix& accum, + BaseMatrix& accum_update, + BaseMatrix& lr, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate) { + // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 + accum = rou * accum + ((real)1 - rou) * grad.square(); + + // learn_rate: sqrt(( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon )) + lr = ((accum_update + epsilon) / (accum + epsilon)).sqrt(); + + // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2 + accum_update = rou * accum_update + ((real)1 - rou) * (grad * lr).square(); + + mom = mom * momentum - learningRate * lr * (grad + value * decayRate); + value += mom; +} + +void adagradApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, + BaseMatrix& accum_buffer, + BaseMatrix& accum, + BaseMatrix& lr, + real epsilon, + real learningRate, + real momentum, + real decayRate) { + accum += grad.square(); + lr = (accum_buffer + accum + epsilon).sqrt().reciprocal(); + mom = mom * momentum - learningRate * lr * (grad + value * decayRate); + value += mom; +} + +void rmspropApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, + BaseMatrix& g, + BaseMatrix& f, + BaseMatrix& lr, + real accumulatedRou, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate, + bool firstTime) { + // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 + // For the first time update, make the sum be the current square + // so that the initial estimation of E(g_t^2) will not be too small. + if (firstTime) { + g = accumulatedRou * g + grad.square(); + } else { + g = accumulatedRou * g + ((real)1 - rou) * grad.square(); + } + + // E(f_t) = \rou * E(f_{t-1}) + (1-\rou) * g + f = accumulatedRou * f + ((real)1 - rou) * grad; + + // learn_rate = 1/sqrt( ( E(g_t^2) - (E(f_t))^2 + epsilon ) + // Basiclly if the sign of the gradient changes more often, + // the learning rate will be decreased. + lr = (g - f.square() + epsilon).sqrt().reciprocal(); + + mom = mom * momentum - learningRate * lr * (grad + value * decayRate); + value += mom; +} + +void decayedAdagradApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, + BaseMatrix& accum, + BaseMatrix& lr, + real accumulatedRou, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate, + bool firstTime) { + // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 + // For the first time update, make the sum be the current square + // so that the initial estimation of E(g_t^2) will not be too small. + if (firstTime) { + accum = accumulatedRou * accum + grad.square(); + } else { + accum = accumulatedRou * accum + ((real)1 - rou) * grad.square(); + } + + // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon ) + // Basiclly if the bigger the magnitude gradient is, + // the smaller the learning rate will be. + lr = (accum + epsilon).sqrt().reciprocal(); + + mom = mom * momentum - learningRate * lr * (grad + value * decayRate); + value += mom; +} + +void adamApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, // firse moment + BaseMatrix& v, // second moment + real beta1, + real beta2, + real beta1_power, + real beta2_power, + real epsilon, + real learningRate) { + real alpha = learningRate * + std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); + + // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; + mom = beta1 * mom + ((real)1 - beta1) * grad; + + // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2 + v = beta2 * v + ((real)1 - beta2) * grad.square(); + + value -= (mom * alpha) / (v.sqrt() + epsilon); +} + +void adamaxApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, // firse moment + BaseMatrix& u, // weighted infinity norm + real beta1, + real beta2, + int64_t step, + real alpha) { + // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; + mom = beta1 * mom + ((real)1 - beta1) * grad; + + // u_t = max(\beta_2*u_{t-1}, abs(g_t)) + u = (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()); + + // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t + value -= (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u); +} + +} // namespace paddle diff --git a/paddle/math/TrainingAlgorithmOp.h b/paddle/math/TrainingAlgorithmOp.h new file mode 100644 index 0000000000000000000000000000000000000000..5c413328a0ec7e7c6cd7923a208ed91aef246a10 --- /dev/null +++ b/paddle/math/TrainingAlgorithmOp.h @@ -0,0 +1,119 @@ +/** + * TrainingAlgorithmOp.h + * + * Author: hedaoyuan (hedaoyuan@baidu.com) + * Created on: 2016-06-29 + * + * Copyright (c) Baidu.com, Inc. All Rights Reserved + * + */ + +#pragma once + +#include "paddle/utils/Logging.h" +#include "BaseMatrix.h" + +namespace paddle { + +/** + * \brief Sparse Momentum optimizer. + */ +extern void sparseMomentumApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& momU, + BaseMatrix& momV, + real alpha, + real beta, + real gamma, + real tau, + real learningRate); + +/** + * \brief AdaDelta optimizer. + */ +extern void adadeltaApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& sum, + BaseMatrix& sum1, + BaseMatrix& mom, + BaseMatrix& lr, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate); + +/** + * \brief AdaGrad optimizer. + */ +extern void adagradApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& sum, + BaseMatrix& sum1, + BaseMatrix& mom, + BaseMatrix& lr, + real epsilon, + real learningRate, + real momentum, + real decayRate); + +/** + * \brief RMSProp optimizer. + */ +extern void rmspropApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& g, + BaseMatrix& f, + BaseMatrix& mom, + BaseMatrix& lr, + real accumulatedRou, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate, + bool firstTime); + +/** + * \brief Decayed AdaGrad optimizer. + */ +extern void decayedAdagradApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, + BaseMatrix& accum, + BaseMatrix& lr, + real accumulatedRou, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate, + bool firstTime); + +/** + * \brief Adam optimizer. + */ +extern void adamApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, + BaseMatrix& v, + real beta1, + real beta2, + real beta1_power, + real beta2_power, + real epsilon, + real learningRate); + +/** + * \brief AdaMax optimizer. + */ +extern void adamaxApply(BaseMatrix& value, + BaseMatrix& grad, + BaseMatrix& mom, // firse moment + BaseMatrix& u, // weighted infinity norm + real beta1, + real beta2, + int64_t step, + real alpha); + +} // namespace paddle diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h index ee0a83bf038f04ee9f7b3561639aa90da68a6e29..ba728e7578de83d902cb63618e5cafc91409207e 100644 --- a/paddle/math/Vector.h +++ b/paddle/math/Vector.h @@ -258,6 +258,15 @@ public: /// print the "idx" element of the Vector virtual void printOneElement(std::ostream& os, size_t idx) const = 0; + template + void operator=(const ExpressionType& expr) { + if (BaseVector::useGpu_) { + TensorGpuApply(*this, expr); + } else { + TensorCpuApply(*this, expr); + } + } + protected: friend class GpuVectorT; friend class CpuVectorT; @@ -315,6 +324,11 @@ public: virtual void print(std::ostream& os, size_t num) const; virtual void printOneElement(std::ostream& os, size_t idx) const; + template + void operator=(const ExpressionType& expr) { + TensorGpuApply(*this, expr); + } + protected: virtual void copyTo(CpuVectorT* dest) const; virtual void copyTo(GpuVectorT* dest) const; @@ -378,6 +392,11 @@ public: virtual T get(size_t pos); virtual void print(std::ostream& os, size_t num) const; virtual void printOneElement(std::ostream& os, size_t idx) const; + + template + void operator=(const ExpressionType& expr) { + TensorCpuApply(*this, expr); + } }; template diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index eb72f11e1c6538cd2c66bc56dbc8686a942bd308..07fd1adc1be9c969ca3500cd4e8ce5275b5b5956 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -3,6 +3,7 @@ add_simple_unittest(test_ExecViaCpu) add_simple_unittest(test_SIMDFunctions) add_simple_unittest(test_matrix) +add_simple_unittest(test_TrainingAlgorithm) # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference. add_unittest(test_matrixCompare @@ -13,3 +14,8 @@ add_simple_unittest(test_sparseMatrixCompare) add_simple_unittest(test_perturbation) add_simple_unittest(test_CpuGpuVector) add_simple_unittest(test_Allocator) +if(COMPILER_SUPPORT_CXX11) + LIST(APPEND CUDA_NVCC_FLAGS -std=c++11 -Xcompiler -fPIC --use_fast_math) + CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu) + link_paddle_test(test_Tensor) +endif() diff --git a/paddle/math/tests/OriginalOptimizerApi.h b/paddle/math/tests/OriginalOptimizerApi.h new file mode 100644 index 0000000000000000000000000000000000000000..9a2e38246f5073b0b3661a307f16d6de4ae50914 --- /dev/null +++ b/paddle/math/tests/OriginalOptimizerApi.h @@ -0,0 +1,190 @@ +/** + * OriginalOptimizerApi.h + * + * Author: hedaoyuan (hedaoyuan@baidu.com) + * Created on: 2016-06-29 + * + * Copyright (c) Baidu.com, Inc. All Rights Reserved + */ + +#pragma once + +#include "paddle/utils/GlobalConstants.h" +#include "paddle/math/Vector.h" + +using namespace paddle; // NOLINT + +void SparseMomentumParameterOptimizer(const VectorPtr vecs[], + real alpha, + real beta, + real gamma, + real tau, + real learningRate) { + vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT], + -alpha * gamma * learningRate); + vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT], + tau * alpha * gamma * learningRate); + vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT], + tau / beta + 1.0 / alpha, + *vecs[PARAMETER_MOMENTUM_VT], 1.0 / beta); +} + +void AdagradParameterOptimizer(const VectorPtr vecs[], + real epsilon, + real learningRate, + real momentum, + real decayRate) { + vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT], + 1.0f); + vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM], + *vecs[PARAMETER_GRADIENT_SQURESUM1]); + vecs[PARAMETER_LEARNING_RATE]->add(epsilon); + vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); + + vecs[PARAMETER_VALUE]->sgdUpdate( + *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_LEARNING_RATE], learningRate, + momentum, decayRate); +} + +void AdaDeltaParameterOptimizer(const VectorPtr vecs[], + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate) { + // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 + vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(*vecs[PARAMETER_GRADIENT], + rou, 1.0f - rou); + + // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) ) + vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1], + *vecs[PARAMETER_GRADIENT_SQURESUM], + epsilon, epsilon); + vecs[PARAMETER_LEARNING_RATE]->sqrt2(); + + // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2 + vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul( + *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_LEARNING_RATE], rou, + 1.0f - rou); + + vecs[PARAMETER_VALUE]->sgdUpdate( + *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_LEARNING_RATE], learningRate, + momentum, decayRate); +} + +void RMSPropParameterOptimizer(const VectorPtr vecs[], + real accumulatedRou, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate, + bool firstTime) { + // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 + // For the first time update, make the sum be the current square + // so that the initial estimation of E(g_t^2) will not be too small. + vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( + *vecs[PARAMETER_GRADIENT], accumulatedRou, + firstTime ? 1.0f : 1.0f - rou); + + // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g + vecs[PARAMETER_GRADIENT_SQURESUM1]->add(*vecs[PARAMETER_GRADIENT], + accumulatedRou, 1.0f - rou); + + // learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon ) + // Basiclly if the sign of the gradient changes more often, + // the learning rate will be decreased. + vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]); + vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1], + -1.0f); + vecs[PARAMETER_LEARNING_RATE]->add(epsilon); + vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); + + vecs[PARAMETER_VALUE]->sgdUpdate( + *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_LEARNING_RATE], learningRate, + momentum, decayRate); +} + +void DecayedAdagradParameterOptimizer(const VectorPtr vecs[], + real accumulatedRou, + real rou, + real epsilon, + real learningRate, + real momentum, + real decayRate, + bool firstTime) { + // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 + // For the first time update, make the sum be the current square + // so that the initial estimation of E(g_t^2) will not be too small. + vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( + *vecs[PARAMETER_GRADIENT], accumulatedRou, + firstTime ? 1.0f : 1.0f - rou); + + // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon ) + // Basiclly if the bigger the magnitude gradient is, + // the smaller the learning rate will be. + vecs[PARAMETER_LEARNING_RATE]->assign(epsilon); + vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]); + vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); + + vecs[PARAMETER_VALUE]->sgdUpdate( + *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_LEARNING_RATE], learningRate, + momentum, decayRate); +} + +void AdamParameterOptimizer(const VectorPtr vecs[], + real beta1, + real beta2, + real beta1_power, + real beta2_power, + real epsilon, + real learningRate) { + Vector* m = vecs[PARAMETER_MOMENTUM].get(); + Vector* g = vecs[PARAMETER_GRADIENT].get(); + Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get(); + Vector* theta = vecs[PARAMETER_VALUE].get(); + + // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; + m->add(*g, beta1, 1 - beta1); + + // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2 + g->square2(); + v->add(*g, beta2, 1 - beta2); + + // tmp = m_t / ( \sqrt{v_t} + \epsilon ) + // \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp + g->sqrt2(*v); + g->dotDiv(*m, *g, 0., epsilon); + real alpha = learningRate * + std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power); + theta->add(*theta, 1.0, *g, -alpha); +} + +void AdamaxParameterOptimizer(const VectorPtr vecs[], + real beta1, + real beta2, + int64_t step, + real alpha) { + Vector* m = vecs[PARAMETER_MOMENTUM].get(); + Vector* g = vecs[PARAMETER_GRADIENT].get(); + Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get(); + Vector* theta = vecs[PARAMETER_VALUE].get(); + + // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; + m->add(*g, beta1, 1 - beta1); + + // u_t = max(\beta_2*u_{t-1}, abs(g_t)) + u->mulScalar(beta2); + g->abs2(); + u->max2(*u, *g); + + // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t + g->dotDiv(*m, *u); + real learningRate = alpha / (1 - std::pow(beta1, step)); + theta->add(*theta, 1.0, *g, -learningRate); +} + diff --git a/paddle/math/tests/test_Tensor.cu b/paddle/math/tests/test_Tensor.cu new file mode 100644 index 0000000000000000000000000000000000000000..34a13c2e9c98fc62f5927472396c92a18ac638ff --- /dev/null +++ b/paddle/math/tests/test_Tensor.cu @@ -0,0 +1,1252 @@ +/** + * test_Tensor.cpp + * + * Author: hedaoyuan (hedaoyuan@baidu.com) + * Created on: 2016-06-06 + * + * Copyright (c) Baidu.com, Inc. All Rights Reserved + */ + +#include +#include "paddle/math/Matrix.h" +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +template +extern void TensorCheckEqual(const Tensor& tensor1, const Tensor& tensor2); + +void TensorCheckEqual(const CpuMatrix& matrix1, const CpuMatrix& matrix2) { + CHECK(matrix1.getHeight() == matrix2.getHeight()); + CHECK(matrix1.getWidth() == matrix2.getWidth()); + + int height = matrix1.getHeight(); + int width = matrix1.getWidth(); + const real* data1 = matrix1.getData(); + const real* data2 = matrix2.getData(); + int count = 0; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + if (data1[i * width + j] != data2[i * width + j]) { + count++; + } + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +void TensorCheckEqual(const GpuMatrix& matrix1, const GpuMatrix& matrix2) { + CpuMatrix cpu1(matrix1.getHeight(), matrix1.getWidth()); + CpuMatrix cpu2(matrix2.getHeight(), matrix2.getWidth()); + cpu1.copyFrom(matrix1); + cpu2.copyFrom(matrix2); + TensorCheckEqual(cpu1, cpu2); +} + +void TensorCheckErr(const CpuMatrix& matrix1, const CpuMatrix& matrix2) { + CHECK(matrix1.getHeight() == matrix2.getHeight()); + CHECK(matrix1.getWidth() == matrix2.getWidth()); +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-5; +#else + real err = 1e-10; +#endif + + int height = matrix1.getHeight(); + int width = matrix1.getWidth(); + const real* data1 = matrix1.getData(); + const real* data2 = matrix2.getData(); + int count = 0; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + real a = data1[i * width + j]; + real b = data2[i * width + j]; + if (fabs(a - b) > err) { + if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) { + count++; + } + } + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +void TensorCheckErr(const GpuMatrix& matrix1, const GpuMatrix& matrix2) { + CpuMatrix cpu1(matrix1.getHeight(), matrix1.getWidth()); + CpuMatrix cpu2(matrix2.getHeight(), matrix2.getWidth()); + cpu1.copyFrom(matrix1); + cpu2.copyFrom(matrix2); + TensorCheckErr(cpu1, cpu2); +} + +template +void TensorCheckEqual(const CpuVectorT& vector1, + const CpuVectorT& vector2) { + CHECK(vector1.getSize() == vector2.getSize()); + + const T* data1 = vector1.getData(); + const T* data2 = vector2.getData(); + size_t size = vector1.getSize(); + int count = 0; + for (size_t i = 0; i < size; i++) { + if (data1[i] != data2[i]) { + count++; + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +template +void TensorCheckEqual(const GpuVectorT& vector1, + const GpuVectorT& vector2) { + CpuVectorT cpu1(vector1.getSize()); + CpuVectorT cpu2(vector2.getSize()); + cpu1.copyFrom(vector1); + cpu2.copyFrom(vector2); + TensorCheckEqual(cpu1, cpu2); +} + +#define INIT_UNARY(A1, A2) \ + Tensor A1(height, width); \ + Tensor A2(height, width); \ + A1.randomizeUniform(); \ + A2.copyFrom(A1) +#define INIT_BINARY(A1, A2, B) \ + INIT_UNARY(A1, A2); \ + Tensor B(height, width); \ + B.randomizeUniform() +#define INIT_TERNARY(A1, A2, B, C) \ + INIT_BINARY(A1, A2, B); \ + Tensor C(height, width); \ + C.randomizeUniform() +#define INIT_QUATERNARY(A1, A2, B, C, D) \ + INIT_TERNARY(A1, A2, B, C); \ + Tensor D(height, width); \ + D.randomizeUniform() + +template +struct TestUnaryMatrix { + typedef std::function UnaryFunc; + + explicit TestUnaryMatrix(UnaryFunc testUnaryFunc) { + for (auto height : {1, 11, 73, 128, 200, 330}) { + for (auto width : {1, 32, 100, 512, 1000, 3210}) { + LOG(INFO) << " height=" << height << " width=" << width; + INIT_UNARY(A1, A2); + testUnaryFunc(A1, A2); + } + } + } +}; + +template +struct TestBinaryMatrix { + typedef std::function BinaryFunc; + + explicit TestBinaryMatrix(BinaryFunc testBinaryFunc) { + for (auto height : {1, 11, 73, 128, 200, 330}) { + for (auto width : {1, 32, 100, 512, 1000, 3210}) { + LOG(INFO) << " height=" << height << " width=" << width; + INIT_BINARY(A1, A2, B); + testBinaryFunc(A1, A2, B); + } + } + } +}; + +template +struct TestTernaryMatrix { + typedef std::function TernaryFunc; + + explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) { + for (auto height : {1, 11, 73, 128, 200, 330}) { + for (auto width : {1, 32, 100, 512, 1000, 3210}) { + LOG(INFO) << " height=" << height << " width=" << width; + INIT_TERNARY(A1, A2, B, C); + testTernaryFunc(A1, A2, B, C); + } + } + } +}; + +template +struct TestQuaternaryMatrix { + typedef std::function QuaternaryFunc; + + explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) { + for (auto height : {1, 11, 73, 128, 200, 330}) { + for (auto width : {1, 32, 100, 512, 1000, 3210}) { + LOG(INFO) << " height=" << height << " width=" << width; + INIT_QUATERNARY(A1, A2, B, C, D); + testQuaternaryFunc(A1, A2, B, C, D); + } + } + } +}; + +template +struct TestUnaryVectorT { + typedef std::function UnaryFunc; + + explicit TestUnaryVectorT(UnaryFunc testUnaryFunc) { + for (auto size : {1, 11, 73, 128, 200, 330, 512, 1000, 4210}) { + LOG(INFO) << " size=" << size; + Tensor A1(size); + Tensor A2(size); + if (typeid(T) == typeid(real)) { + A1.rand(); + } else { + A1.rand(1000); + } + A2.copyFrom(A1); + testUnaryFunc(A1, A2); + } + } +}; + +void SetTensorValue(Matrix& matrix, real value) { + int height = matrix.getHeight(); + int width = matrix.getWidth(); + int stride = matrix.getStride(); + real* data = matrix.getData(); + for (int i = 0; i < height; i++) { + int j = rand() % width; // NOLINT + if (typeid(matrix) == typeid(CpuMatrix)) { + data[i * stride + j] = value; + } else if (typeid(matrix) == typeid(GpuMatrix)) { + hl_memcpy(&data[i * stride + j], &value, sizeof(real)); + } else { + } + } +} + +template +void testTensorAddScalar(Tensor& A1, Tensor& A2) { + real p1 = 2.5; + real p2 = 3.0; + A1.add(p1); // a += p + A2 += p1; + TensorCheckEqual(A1, A2); + + A1.add(p1, p2); // a = a * p1 + p2 + A2 = A2 * p1 + p2; + TensorCheckEqual(A1, A2); +} + +template +void testTensorSubScalar(Tensor& A1, Tensor& A2) { + real p = 2.5; + A1.subScalar(p); // a -= p + A2 -= p; + TensorCheckEqual(A1, A2); +} + +template +void testTensorMulScalar(Tensor& A1, Tensor& A2) { + real p = 2.5; + A1.mulScalar(p); // a *= p + A2 *= p; + TensorCheckEqual(A1, A2); + + real learningRate = 0.7f; + real decayRate = 1.2f; + A1.applyL2(learningRate, decayRate); + A2 = A2 * (1.0f / (1.0f + learningRate * decayRate)); + TensorCheckEqual(A1, A2); +} + +template +void testTensorDivScalar(Tensor& A1, Tensor& A2) { + real p = 2.5; + A1.divScalar(p); // a /= p + A2 /= p; + TensorCheckEqual(A1, A2); +} + +template +void testTensorNeg(Tensor& A1, Tensor& A2) { + A1.neg(); // a = -a + A2 = -A2; + TensorCheckEqual(A1, A2); +} + +template +void testTensorAbs(Tensor& A1, Tensor& A2) { + A1.abs2(); // a = a > 0 ? a : -a + A2 = A2.abs(); + TensorCheckEqual(A1, A2); +} + +template +void testTensorSquare(Tensor& A1, Tensor& A2) { + A1.square2(); // a = a * a + A2 = A2.square(); + TensorCheckEqual(A1, A2); +} + +template +void testTensorReciprocal(Tensor& A1, Tensor& A2) { + A1.reciprocal2(); // a = 1.0f / a + A2 = A2.reciprocal(); + TensorCheckEqual(A1, A2); +} + +template +void testTensorSign(Tensor& A1, Tensor& A2) { + A1.sign2(); // a = (a > 0) - (a < 0) + A2 = A2.sign(); + TensorCheckEqual(A1, A2); +} + +template +void testTensorAssign(Tensor& A1, Tensor& A2) { + A1.assign(1.5); // a = p + A2 = A2.constant(1.5); + TensorCheckEqual(A1, A2); + + A1.one(); // a = 1 + A2 = A2.constant(1.0); + TensorCheckEqual(A1, A2); + + A1.zero(); // a = 0 + A2 = A2.constant(0.0); + TensorCheckEqual(A1, A2); +} + +template +void testUnaryBaseOp(Tensor& A1, Tensor& A2) { + testTensorAddScalar(A1, A2); + testTensorSubScalar(A1, A2); + testTensorMulScalar(A1, A2); + testTensorDivScalar(A1, A2); + testTensorNeg(A1, A2); + testTensorAbs(A1, A2); + testTensorSquare(A1, A2); + testTensorReciprocal(A1, A2); + testTensorSign(A1, A2); + testTensorAssign(A1, A2); +} + +template +void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) { + A1.add(2); // a += p + A2 += 2; + TensorCheckEqual(A1, A2); + + A1.add(3, 2); // a = a * p1 + p2 + A2 = A2 * 3 + 2; + TensorCheckEqual(A1, A2); + + testTensorNeg(A1, A2); + testTensorAbs(A1, A2); +} + +TEST(Unary, BaseOp) { + TestUnaryMatrix testCpuMatrix(testUnaryBaseOp); + TestUnaryVectorT testCpuVector(testUnaryBaseOp); + TestUnaryVectorT + testCpuIVector(testUnaryBaseOpInt); + +#ifndef PADDLE_ONLY_CPU + TestUnaryMatrix testGpuMatrix(testUnaryBaseOp); + TestUnaryVectorT testGpuVector(testUnaryBaseOp); + TestUnaryVectorT + testGpuIVector(testUnaryBaseOpInt); +#endif +} + +template +void testTensorExp(Tensor& A1, Tensor& A2) { + A1.exp2(); // a = exp(a) + A2 = A2.exp(); + TensorCheckErr(A1, A2); +} + +template +void testTensorLog(Tensor& A1, Tensor& A2) { + A1.log2(); // a = log(a) + A2 = A2.log(); + TensorCheckErr(A1, A2); +} + +template +void testTensorSqrt(Tensor& A1, Tensor& A2) { + A1.sqrt2(); // a = sqrt(a) + A2 = A2.sqrt(); + TensorCheckErr(A1, A2); +} + +template +void testTensorPow(Tensor& A1, Tensor& A2) { + A1.pow2(3.2); // a = pow(a, p) + A2 = A2.pow(3.2); + TensorCheckErr(A1, A2); +} + +template +void testUnayrMathOp(Tensor& A1, Tensor& A2) { + testTensorExp(A1, A2); + testTensorLog(A1, A2); + testTensorSqrt(A1, A2); + testTensorPow(A1, A2); +} + +TEST(Unary, MathOp) { + TestUnaryMatrix testCpu(testUnayrMathOp); + +#ifndef PADDLE_ONLY_CPU + TestUnaryMatrix testGpu(testUnayrMathOp); +#endif +} + +template +void testTensorClip(Tensor& A1, Tensor& A2) { + real p1 = 0.003f; + real p2 = 0.877f; + A1.clip(p1, p2); // a = a < p1 ? p1 : (a > p2 ? p2 : a) + // A2 = A2.min(0.877f).max(0.003f); + A2 = (A2 < p1).condition(p1, (A2 > p2).condition(p2, A2)); + TensorCheckEqual(A1, A2); +} + +template +void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) { + real p = 0.5f; + A1.biggerThanScalar(p); // a = a > p ? 1.0f : 0.0f + A2 = (A2 > p).condition((real)1.0, (real)0.0); + TensorCheckEqual(A1, A2); +} + +template +void testTensorapplyL1(Tensor& A1, Tensor& A2) { + /** + * T lambda = p; + * a = (a > lambda) ? (a - lambda) + * : (a < -lambda) ? (a + lambda) : 0 + * + * p = learningRate * decayRate; + */ + real learningRate = 0.7f; + real decayRate = 0.6f; + A1.applyL1(learningRate, decayRate); + A2 = (A2 > (learningRate * decayRate)).condition( + (A2 - (learningRate * decayRate)), + (A2 < -(learningRate * decayRate)).condition( + (A2 + (learningRate * decayRate)), (real)0.0)); + TensorCheckEqual(A1, A2); +} + +template +void testUnayrCompareOp(Tensor& A1, Tensor& A2) { + testTensorClip(A1, A2); + testTensorBiggerThanScalar(A1, A2); + + A1.randomizeUniform(); + A1.subScalar(0.5f); + A2.copyFrom(A1); + testTensorapplyL1(A1, A2); +} + +TEST(Unary, CompareOp) { + TestUnaryMatrix testCpu(testUnayrCompareOp); + +#ifndef PADDLE_ONLY_CPU + TestUnaryMatrix testGpu(testUnayrCompareOp); +#endif +} + +template +void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) { + real p1 = 2.5; + real p2 = 3.2; + A1.add(B); // a += b + A2 += B; + TensorCheckEqual(A1, A2); + + A1.add(B, p1); // a += b * p + A2 += B * p1; + TensorCheckEqual(A1, A2); + + A1.add(B, p1, p2); // a = p1 * a + p2 * b + A2 = A2 * p1 + B * p2; + TensorCheckEqual(A1, A2); + + A1.addScalar(B, p1); // a = b + p + A2 = B + p1; + TensorCheckEqual(A1, A2); + + A1.addSquare(B, p1); // a += p * b * b + A2 += B.constant(p1) * B * B; + TensorCheckEqual(A1, A2); + + A1.decayAddSquare(B, p1, p2); // a = p1 * a + p2 * b * b + A2 = A2 * p1 + B.constant(p2) * B * B; + TensorCheckEqual(A1, A2); +} + +template +void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) { + real p = 2.5; + A1.sub(B); // a -= b + A2 -= B; + TensorCheckEqual(A1, A2); + + A1.sub(B, p); // a -= b * p + A2 -= B * p; + TensorCheckEqual(A1, A2); + + A1.subScalar(B, p); // a = b - p + A2 = B - p; + TensorCheckEqual(A1, A2); +} + +template +void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) { + real p = 2.5; + A1.mulScalar(B, p); // a = b * p + A2 = B * p; + TensorCheckEqual(A1, A2); + + A1.dotMulSquare(B); // a *= b * b + A2 *= B * B; + TensorCheckEqual(A1, A2); + + A1.dotSquareMul(B); // a = a * a * b + A2 = A2 * A2 * B; + TensorCheckEqual(A1, A2); + + A1.dotMul(B); // a *= b + A2 *= B; + TensorCheckEqual(A1, A2); +} + +template +void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) { + real p = 2.5; + A1.divScalar(B, p); // a = b / p + A2 = B / p; + TensorCheckEqual(A1, A2); + + A1.scalarDiv(B, p); // a = p / b + A2 = B.constant(p) / B; + TensorCheckEqual(A1, A2); +} + +template +void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) { + A1.assign(B); // a = b + A2 = B; + TensorCheckEqual(A1, A2); +} + +template +void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) { + B.square2(A1); // b = a * a + A2 = B.square(); + TensorCheckEqual(A1, A2); +} + +template +void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + A1.squareDerivative(B); // a *= 2.0 * b + A2 = A2 * (real)2.0 * B; + TensorCheckEqual(A1, A2); +} + +template +void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) { + B.reciprocal2(A1); // b = 1.0f / a + A2 = B.reciprocal(); + TensorCheckEqual(A1, A2); + + real p1 = 0.58; + real p2 = 0.32; + A1.reciprocal2(B, p1, p2); // a = 1 / (p1 * b + p2) + A2 = (B * p1 + p2).reciprocal(); + TensorCheckEqual(A1, A2); + + real learningRate = 0.7f; + real decayRate = 1.2f; + A1.applyL2(B, learningRate, decayRate); // a *= (1.0f / (1.0f + p * b)) + A2 *= (B.constant(1.0f) + + B.constant(learningRate * decayRate) * B).reciprocal(); + TensorCheckEqual(A1, A2); +} + +template +void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + A1.reciprocalDerivative(B); // a *= -b * b + A2 *= (-B) * B; + TensorCheckEqual(A1, A2); +} + +template +void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) { + B.sign2(A1); // b = a > 0.0f ? 1.0f : -1.0f + A2 = B.sign(); + TensorCheckEqual(A1, A2); +} + +template +void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) { + B.abs2(A1); // b = a > 0.0f ? a : -a + A2 = B.abs(); + TensorCheckEqual(A1, A2); +} + +template +void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) { + testTensorAdd(A1, A2, B); + testTensorSub(A1, A2, B); + testTensorMul(A1, A2, B); + testTensorDiv(A1, A2, B); + testTensorSquare(A1, A2, B); + testTensorSquareDerivative(A1, A2, B); + testTensorReciprocal(A1, A2, B); + testTensorReciprocalDerivative(A1, A2, B); + testTensorAbs(A1, A2, B); + testTensorSign(A1, A2, B); + testTensorAssign(A1, A2, B); +} + +TEST(Binary, BaseOp) { + TestBinaryMatrix testCpu(testBinaryBaseOp); + +#ifndef PADDLE_ONLY_CPU + TestBinaryMatrix testGpu(testBinaryBaseOp); +#endif +} + +template +void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) { + // a = exp(b) + A1.exp2(B); + A2 = B.exp(); + TensorCheckErr(A1, A2); +} + +template +void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + A1.expDerivative(B); // a *= b + A2 *= B; + TensorCheckEqual(A1, A2); +} + +template +void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) { + // a = log(b) + A1.log2(B); + A2 = B.log(); + TensorCheckErr(A1, A2); +} + +template +void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) { + // a = sqrt(b) + A1.sqrt2(B); + A2 = B.sqrt(); + TensorCheckErr(A1, A2); +} + +template +void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) { + // a = 1.0f / sqrt(b) + A1.invSqrt(B); + A2 = B.sqrt().reciprocal(); + TensorCheckErr(A1, A2); +} + +template +void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) { + A1.pow2(B, 2.5f); // a = pow(b, p) + A2 = B.pow(2.5f); + TensorCheckErr(A1, A2); +} + +template +void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) { + /* + * const T THRESHOLD = 40.0; + * b = log(1.0 + + * exp((a > THRESHOLD) ? THRESHOLD + * : ((a < -THRESHOLD) ? (-THRESHOLD) : a))) + */ + B.softrelu(A1); + + real THRESHOLD = 40.0; + A2 = (B.constant(1.0f) + + (B > THRESHOLD).condition( + THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)).exp()).log(); + TensorCheckErr(A1, A2); +} + +template +void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + /* + * const T THRESHOLD = 40.0; + * a *= (1.0 - exp(-1.0 * ((b > THRESHOLD) + * ? THRESHOLD + * : ((b < -THRESHOLD) ? (-THRESHOLD) : b))))); + */ + A1.softreluDerivative(B); + real THRESHOLD = 40.0; + A2 = A2 * (B.constant(1.0f) - + (B.constant(-1.0f) * + (B > THRESHOLD).condition( + THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))).exp()); + TensorCheckErr(A1, A2); +} + +template +void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) { + /* + const T THRESHOLD_MIN = -40.0; + const T THRESHOLD_MAX = 13.0; + T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN + : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a); + b = 1.0f / (1.0f + exp(-tmp))) + */ + B.sigmoid(A1); + + const real THRESHOLD_MIN = -40.0; + const real THRESHOLD_MAX = 13.0; + auto tmp = (B < THRESHOLD_MIN).condition( + THRESHOLD_MIN, (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B)); + A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal(); + TensorCheckErr(A1, A2); +} + +template +void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + A1.sigmoidDerivative(B); // a *= b * (1 - b) + A2 *= B * (B.constant(1.0f) - B); + TensorCheckEqual(A1, A2); +} + +template +void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) { + B.tanh(A1); // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0 + A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f; + TensorCheckErr(A1, A2); +} + +template +void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + A1.tanhDerivative(B); // a *= 1 - b * b + A2 *= B.constant(1.0f) - B * B; + TensorCheckEqual(A1, A2); +} + +template +void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) { + real p1 = 2.5; + real p2 = 3.1; + // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0) + B.scaledTanh(A1, p1, p2); + A2 = B.constant(p1) * + (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) + - (real)1.0); + TensorCheckErr(A1, A2); +} + +template +void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + real p1 = 2.5; + real p2 = 3.1; + // a *= (p2 / p1) * (p1 * p1 - b * b)); + A1.scaledTanhDerivative(B, p1, p2); + A2 = A2 * (B.constant(p2 / p1) * (B.constant(p1 * p1) - B * B)); + TensorCheckEqual(A1, A2); +} + +template +void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) { + testTensorTanhDerivative(A1, A2, B); + testTensorScaledTanhDerivative(A1, A2, B); + testTensorSigmoidDerivative(A1, A2, B); + testTensorExpDerivative(A1, A2, B); + testTensorScaledTanh(A1, A2, B); + testTensorTanh(A1, A2, B); + testTensorExp(A1, A2, B); + testTensorLog(A1, A2, B); + testTensorSqrt(A1, A2, B); + testTensorInvSqrt(A1, A2, B); + testTensorPow(A1, A2, B); + + testTensorSoftrelu(A1, A2, B); + testTensorSoftreluDerivative(A1, A2, B); + testTensorSigmoid(A1, A2, B); +} + +TEST(Binary, MathOp) { + TestBinaryMatrix testCpu(testBinaryMathOp); + +#ifndef PADDLE_ONLY_CPU + TestBinaryMatrix testGpu(testBinaryMathOp); +#endif +} + +template +void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) { + B.relu(A1); // b = a > 0.0f ? a : 0.0f + A2 = (B > (real)0.0f).condition(B, (real)0.0f); + TensorCheckEqual(A1, A2); +} + +template +void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + A1.reluDerivative(B); // a *= (b > 0.0f ? 1.0f : 0.0f) + A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0); + TensorCheckEqual(A1, A2); +} + +template +void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) { + /* + * b = a > p1 ? a : p1 + * b = b < p2 ? b : p2 + * int p1 = 0, p2 = 24; + */ + SetTensorValue(B, 32.0f); + B.brelu(A1); + auto tmp = (B > (real)0.0f).condition(B, (real)0.0f); + A2 = (tmp < (real)24.0f).condition(tmp, (real)24.0f); + TensorCheckEqual(A1, A2); +} + +template +void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + SetTensorValue(B, 32.0f); + /* + * a *= (b > p1 && b < p2) ? 1.0 : 0.0 + * int p1 = 0, p2 = 24; + */ + A1.breluDerivative(B); + A2 *= (B > (real)0.0f && B < (real)24.0f).condition((real)1.0f, (real)0.0f); + TensorCheckEqual(A1, A2); +} + +template +void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) { + A1.absDerivative(B); // a = (b > 0) ? a : (b < 0) ? -a : 0 + A2 = (B > (real)0.0f).condition(A2, + (B < (real)0.0f).condition(-A2, (real)0.0f)); + TensorCheckEqual(A1, A2); +} + +template +void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) { + real p = 0.613; + SetTensorValue(B, p); + A1.isEqualTo(B, p); // a = (b == p) + A2 = (B == p); + TensorCheckEqual(A1, A2); +} + +template +void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) { + /** + * T lambda = p * b; + * a = (a > lambda) ? (a - lambda) + * : (a < -lambda) ? (a + lambda) : 0 + * + * p = learningRate * decayRate; + */ + real learningRate = 0.7f; + real decayRate = 0.6f; + A1.applyL1(B, learningRate, decayRate); + auto lambda = B.constant(learningRate * decayRate) * B; + A2 = (A2 > lambda).condition( + (A2 - lambda), (A2 < -lambda).condition((A2 + lambda), (real)0.0f)); + TensorCheckEqual(A1, A2); +} + +template +void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) { + B.subScalar(0.5f); + SetTensorValue(B, 0.0f); + testTensorReluDerivative(A1, A2, B); + + A1.randomizeUniform(); + A2.copyFrom(A1); + testTensorBreluDerivative(A1, A2, B); + + testTensorAbsDerivative(A1, A2, B); + testTensorRelu(A1, A2, B); + testTensorBrelu(A1, A2, B); + testTensorIsEqualTo(A1, A2, B); +} + +TEST(Binary, CompareOp) { + TestBinaryMatrix testCpu(testBinaryCompareOp); + +#ifndef PADDLE_ONLY_CPU + TestBinaryMatrix testGpu(testBinaryCompareOp); +#endif +} + +template +void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + A1.add(B, C); // a = b + c + A2 = B + C; + TensorCheckEqual(A1, A2); + + real p1 = 1.5; + real p2 = 2.5; + real p3 = 3.8; + A1.add(B, p1, C, p2); // a = p1 * b + p2 * c + A2 = B * p1 + C * p2; + TensorCheckEqual(A1, A2); + + A1.add2(B, C); // a = a + b + c + A2 = A2 + B + C; + TensorCheckEqual(A1, A2); + + A1.add2(B, C, p1, p2, p3); // a = p1 * a + p2 * b + p3 * c + A2 = A2 * p1 + B * p2 + C * p3; + TensorCheckEqual(A1, A2); + + A1.decayAddSquareMul(B, C, p1, p2); // a = p1 * a + p2 * b * b * c * c + A2 = A2 * p1 + B.constant(p2) * B * B * C * C; + TensorCheckEqual(A1, A2); +} + +template +void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + A1.sub(B, C); // a = b - c + A2 = B - C; + TensorCheckEqual(A1, A2); + + real p1 = 1.5; + real p2 = 2.5; + A1.sub(B, p1, C, p2); // a = p1 * b - p2 * c + A2 = B * p1 - C * p2; + TensorCheckEqual(A1, A2); +} + +template +void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + A1.dotMul(B, C); // a = b * c + A2 = B * C; + TensorCheckEqual(A1, A2); + + A1.dotMulSquare(B, C); // a = b * c * c + A2 = B * C * C; + TensorCheckEqual(A1, A2); + + A1.dotSquareSquare(B, C); // a = b * b * c * c + A2 = B * B * C * C; + TensorCheckEqual(A1, A2); + + real p1 = 1.5; + real p2 = 2.5; + + /* + * T tmp = p1 * b + p2 * c; + * a *= tmp * tmp + */ + A1.dotMulSquareSum(B, C, p1, p2); + auto tmp = B * p1 + C * p2; + A2 *= tmp * tmp; + TensorCheckEqual(A1, A2); + + /* + * T tmp = p1 * b + p2 * c; + * a = tmp * tmp + */ + A1.dotSquareSum(B, C, p1, p2); + auto tmp2 = B * p1 + C * p2; + A2 = tmp2 * tmp2; + TensorCheckEqual(A1, A2); + + // a *= p1 * b + p2 * c + A1.dotMulSum(B, C, p1, p2); + A2 *= B * p1 + C * p2; + TensorCheckEqual(A1, A2); + + // a = p1 * a + p2 * b * c + A1.addDotMul(B, C, p1, p2); + A2 = A2 * p1 + B.constant(p2) * B * C; + TensorCheckEqual(A1, A2); +} + +template +void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + A1.dotDiv(B, C); // a = (b == 0.0) ? 0.0 : b / c + A2 = (B == (real)0.0).condition((real)0.0, B / C); + TensorCheckEqual(A1, A2); + + real p1 = 1.5; + real p2 = 2.5; + A1.dotDiv(B, C, p1, p2); // a = (b + p1) / (c + p2) + A2 = (B + p1) / (C + p2); + TensorCheckEqual(A1, A2); +} + +template +void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + real p1 = 1.5; + real p2 = 2.5; + real p3 = 3.5; + A1.reciprocalSum(B, C, p1, p2, p3); // a = 1 / (p1 * b + p2 * c + p3) + A2 = (B * p1 + C * p2 + p3).reciprocal(); + TensorCheckEqual(A1, A2); +} + +template +void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + A1.softCrossEntropy(B, C); // a = -c * log(b) - (1 - c) * log(1 - b) + A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log(); + TensorCheckErr(A1, A2); +} + +template +void testTensorSoftCrossEntropyBp(Tensor& A1, + Tensor& A2, + Tensor& B, + Tensor& C) { + A1.softCrossEntropyBp(B, C); // a += (b - c) / (b * (1 - b)) + A2 += (B - C) / (B * (B.constant(1.0f) - B)); + TensorCheckEqual(A1, A2); +} + +template +void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + testTensorAdd(A1, A2, B, C); + testTensorSub(A1, A2, B, C); + testTensorMul(A1, A2, B, C); + testTensorDiv(A1, A2, B, C); + testTensorReciprocal(A1, A2, B, C); + testTensorSoftCrossEntropyBp(A1, A2, B, C); + + testTensorSoftCrossEntropy(A1, A2, B, C); +} + +TEST(Ternary, BaseOp) { + TestTernaryMatrix testCpu(testTernaryBaseOp); + +#ifndef PADDLE_ONLY_CPU + TestTernaryMatrix testGpu(testTernaryBaseOp); +#endif +} + +template +void testTensorBinaryLabelCrossEntropy(Tensor& A1, + Tensor& A2, + Tensor& B, + Tensor& C) { + A1.binaryLabelCrossEntropy(B, C); // a = c > 0.5 ? -log(b) : -log(1.0 - b) + A2 = (C > (real)0.5).condition( + -(B.log()), -((B.constant(1.0f) - B).log())); + TensorCheckErr(A1, A2); +} + +template +void testTensorBinaryLabelCrossEntropyBp(Tensor& A1, + Tensor& A2, + Tensor& B, + Tensor& C) { + // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b) + A1.binaryLabelCrossEntropyBp(B, C); + A2 += (C > (real)0.5).condition( + (B.constant(-1.0f) / B), (B.constant(1.0f) - B).reciprocal()); + TensorCheckErr(A1, A2); +} + +template +void testTensorLogisticRegressionLoss(Tensor& A1, + Tensor& A2, + Tensor& B, + Tensor& C) { + SetTensorValue(B, 50.0f); + SetTensorValue(B, -50.0f); + /** + * const T THRESHOLD = 40.0; + * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) + * ? -THRESHOLD + * : b; + * a = log(1 + exp(x)) - c * x + */ + A1.logisticRegressionLoss(B, C); + real THRESHOLD = 40.0; + auto tmp = (B > THRESHOLD).condition( + THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); + A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp; + TensorCheckErr(A1, A2); +} + +template +void testTensorLogisticRegressionLossBp(Tensor& A1, + Tensor& A2, + Tensor& B, + Tensor& C) { + SetTensorValue(B, 50.0f); + SetTensorValue(B, -50.0f); + /** + * const T THRESHOLD = 40.0; + * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) + * ? -THRESHOLD + * : b; + * x = exp(x); a = x / (1 + x) - c + */ + A1.logisticRegressionLossBp(B, C); + real THRESHOLD = 40.0; + auto tmp = (B > THRESHOLD).condition( + THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)); + auto tmp2 = tmp.exp(); + A2 = tmp2 / (C.constant(1.0) + tmp2) - C; + TensorCheckErr(A1, A2); +} + +template +void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + A1.biggerThan(B, C); // a = (b > c) ? 1.0f : 0.0f + A2 = (B > C).condition((real)1.0f, (real)0.0f); + TensorCheckEqual(A1, A2); +} + +template +void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + A1.max2(B, C); // a = (b > c) ? b : c + A2 = (B > C).condition(B, C); + TensorCheckEqual(A1, A2); +} + +template +void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { + testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C); + testTensorBinaryLabelCrossEntropy(A1, A2, B, C); + testTensorBiggerThan(A1, A2, B, C); + testTensorMax(A1, A2, B, C); + + testTensorLogisticRegressionLoss(A1, A2, B, C); + testTensorLogisticRegressionLossBp(A1, A2, B, C); +} + +TEST(Ternary, CompareOp) { + TestTernaryMatrix testCpu(testTernaryCompareOp); + +#ifndef PADDLE_ONLY_CPU + TestTernaryMatrix testGpu(testTernaryCompareOp); +#endif +} + +template +void testQuaternaryAdd(Tensor& A1, + Tensor& A2, + Tensor& B, + Tensor& C, + Tensor& D) { + // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f); // a = p1 * b + p2 * c + p3 * d + // A2 = B * 1.5f + C * 2.5f + D * 3.5f; + // TensorCheckEqual(A1, A2); + + /* + * T tmp = p1 * b + p2 * c + p3 * d; + * a += tmp * tmp + */ + real p1 = 1.5f; + real p2 = 2.5f; + real p3 = 3.5f; + A1.addSquareSum(B, C, D, p1, p2, p3); + auto tmp = B * p1 + C * p2 + D * p3; + A2 += tmp * tmp; + TensorCheckEqual(A1, A2); +} + +TEST(Quaternary, BaseOp) { + TestQuaternaryMatrix testCpu(testQuaternaryAdd); + +#ifndef PADDLE_ONLY_CPU + TestQuaternaryMatrix testGpu(testQuaternaryAdd); +#endif +} + +template +void testTensorBiggerThan(Tensor& A1, + Tensor& A2, + Tensor& B, + Tensor& C, + Tensor& D) { + // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f); + A1.biggerThan(B, C, D); + A2 = ((B > C && D > (real)0.5) + || (B < C && D < (real)0.5)).condition((real)1.0, (real)0.0); + TensorCheckEqual(A1, A2); +} + +template +void testTensorRankLoss(Tensor& A1, + Tensor& A2, + Tensor& B, + Tensor& C, + Tensor& D) { + /** + * const T THRESHOLD = 40.0; a = b - c; + * a = (a > THRESHOLD) + * ? THRESHOLD + * : ((a < -THRESHOLD) ? (-THRESHOLD) : a); + * a = log(1 + exp(a)) - a * d + */ + A1.rankLoss(B, C, D); + + real THRESHOLD = 40.0; + auto tmp = B - C; + auto tmp2 = (tmp > THRESHOLD).condition( + THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); + A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D; + + TensorCheckErr(A1, A2); +} + +template +void testTensorRankLossBp(Tensor& A1, + Tensor& A2, + Tensor& B, + Tensor& C, + Tensor& D) { + /** + * const T THRESHOLD = 40.0; a = b - c; + * a = (a > THRESHOLD) + * ? THRESHOLD + * : ((a < -THRESHOLD) ? (-THRESHOLD) : a); + * a = exp(a); a = (a / (1 + a) - d) + */ + A1.rankLossBp(B, C, D); + real THRESHOLD = 40.0; + auto tmp = B - C; + auto tmp2 = (tmp > THRESHOLD).condition( + THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp)); + auto tmp3 = tmp2.exp(); + A2 = tmp3 / (D.constant(1.0f) + tmp3) - D; + + TensorCheckErr(A1, A2); +} + +template +void testQuaternaryCompareOp(Tensor& A1, + Tensor& A2, + Tensor& B, + Tensor& C, + Tensor& D) { + testTensorBiggerThan(A1, A2, B, C, D); + testTensorRankLoss(A1, A2, B, C, D); + testTensorRankLossBp(A1, A2, B, C, D); +} + +TEST(Quaternary, CompareOp) { + TestQuaternaryMatrix testCpu(testQuaternaryCompareOp); + +#ifndef PADDLE_ONLY_CPU + TestQuaternaryMatrix testGpu(testQuaternaryCompareOp); +#endif +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + hl_start(); + hl_init(0); + return RUN_ALL_TESTS(); +} + diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b465ae6c26264317515dfdab027305fa391c1834 --- /dev/null +++ b/paddle/math/tests/test_TrainingAlgorithm.cpp @@ -0,0 +1,426 @@ +/** + * test_TrainingAlgorithm.cpp + * + * Author: hedaoyuan (hedaoyuan@baidu.com) + * Created on: 2016-06-29 + * + * Copyright (c) Baidu.com, Inc. All Rights Reserved + */ + +#include +#include "paddle/utils/Util.h" +#include "paddle/math/TrainingAlgorithmOp.h" +#include "OriginalOptimizerApi.h" + +using namespace paddle; // NOLINT + +#ifndef PADDLE_TYPE_DOUBLE +P_DEFINE_double(max_diff, 1e-5, "max diff allowed"); +#else +P_DEFINE_double(max_diff, 1e-13, "max diff allowed"); +#endif + +class SetMaxDiff { +public: + explicit SetMaxDiff(double max_diff) { + max_diff_ = FLAGS_max_diff; + FLAGS_max_diff = max_diff; + } + ~SetMaxDiff() { + FLAGS_max_diff = max_diff_; + } +private: + double max_diff_; +}; + +int VectorCheckErr(const Vector& vector1, const Vector& vector2) { + CHECK(vector1.getSize() == vector2.getSize()); + + const real* data1 = vector1.getData(); + const real* data2 = vector2.getData(); + size_t size = vector1.getSize(); + int count = 0; + for (size_t i = 0; i < size; i++) { + real a = data1[i]; + real b = data2[i]; + if (fabs(a - b) > FLAGS_max_diff) { + if ((fabsf(a - b) / fabsf(a)) > (FLAGS_max_diff / 10.0f)) { + count++; + } + } + } + + return count; +} + +#define COPY_VECTOR_TO_CPU(cpuVec, vector) \ + do {\ + if (vector->useGpu()) {\ + cpuVec = Vector::create(vector->getSize(), false);\ + cpuVec->copyFrom(*vector);\ + } else {\ + cpuVec = vector;\ + }\ + } while (0) + +int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) { + VectorPtr tmp1; + VectorPtr tmp2; + COPY_VECTOR_TO_CPU(tmp1, vector1); + COPY_VECTOR_TO_CPU(tmp2, vector2); + return VectorCheckErr(*tmp1, *tmp2); +} + +#ifdef PADDLE_DISABLE_TIMER + +#define CHECK_VECTORPTR(vector1, vector2) \ + EXPECT_EQ(VectorCheckErr(vector1, vector2), 0) + +#define EXPRESSION_PERFORMANCE(expression) \ + expression; + +#else + +#include "paddle/common/Stat.h" + +#define CHECK_VECTORPTR(vector1, vector2) + +#define EXPRESSION_PERFORMANCE(expression) \ + do {\ + char expr[30];\ + strncpy(expr, #expression, 30);\ + if (expr[29] != '\0') {\ + expr[27] = '.'; expr[28] = '.'; expr[29] = '\0';\ + }\ + expression;\ + for (int i = 0; i < 20; i++) {\ + REGISTER_TIMER(expr);\ + expression;\ + }\ + LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')\ + << *globalStat.getStat(expr);\ + globalStat.reset();\ + } while (0) + +#endif + +typedef std::function testMatrixFunc; + +void testCase(testMatrixFunc matrixFunc) { + for (auto useGpu : {false, true}) { + for (auto size : {1, 32, 64, 128, 512, 1024, 4096, 32768, 65536, 131072, + 262144, 524288, 1048576, 2097152}) { + LOG(INFO) << " size=" << size << " useGpu=" << useGpu; + matrixFunc(size, useGpu); + } + } +} + +#define INIT_VECTOR(vec1, vec2, type, size, useGpu) \ + vec1[type] = Vector::create(size, useGpu); \ + vec2[type] = Vector::create(size, useGpu); \ + vec1[type]->rand(); \ + vec2[type]->copyFrom(*vec1[type]); + +void testAdagrad(size_t size, bool useGpu) { + VectorPtr bufs1[NUM_PARAMETER_TYPES]; + VectorPtr bufs2[NUM_PARAMETER_TYPES]; + INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu); + + real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT + real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT + real momentum = (real)rand() / (real)RAND_MAX; // NOLINT + real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT + + EXPRESSION_PERFORMANCE(AdagradParameterOptimizer(bufs1, + epsilon, learningRate, momentum, decayRate)); + + BaseMatrix& value = *bufs2[PARAMETER_VALUE]; + BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; + BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; + BaseMatrix& accum_buffer = *bufs2[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM1]; + BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE]; + + EXPRESSION_PERFORMANCE(adagradApply(value, grad, mom, accum_buffer, accum, lr, + epsilon, learningRate, momentum, decayRate)); + + CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); + CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1], + bufs2[PARAMETER_GRADIENT_SQURESUM1]); + CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE], + bufs2[PARAMETER_LEARNING_RATE]); +} + +TEST(Training, Adagrad) { + testCase(testAdagrad); +} + +void testAdaDelta(size_t size, bool useGpu) { + VectorPtr bufs1[NUM_PARAMETER_TYPES]; + VectorPtr bufs2[NUM_PARAMETER_TYPES]; + INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu); + + real rou = (real)rand() / (real)RAND_MAX; // NOLINT + real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT + real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT + real momentum = (real)rand() / (real)RAND_MAX; // NOLINT + real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT + + EXPRESSION_PERFORMANCE(AdaDeltaParameterOptimizer(bufs1, + rou, epsilon, learningRate, momentum, decayRate)); + + BaseMatrix& value = *bufs2[PARAMETER_VALUE]; + BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; + BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; + BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& accum_update = *bufs2[PARAMETER_GRADIENT_SQURESUM1]; + BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE]; + + EXPRESSION_PERFORMANCE(adadeltaApply(value, grad, mom, accum, accum_update, + lr, rou, epsilon, learningRate, momentum, decayRate)); + + CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); + CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM], + bufs2[PARAMETER_GRADIENT_SQURESUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1], + bufs2[PARAMETER_GRADIENT_SQURESUM1]); + CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE], + bufs2[PARAMETER_LEARNING_RATE]); +} + +TEST(Training, AdaDelta) { + testCase(testAdaDelta); +} + +template +void testRMSProp(size_t size, bool useGpu) { + VectorPtr bufs1[NUM_PARAMETER_TYPES]; + VectorPtr bufs2[NUM_PARAMETER_TYPES]; + INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu); + + /* make sure 'g - f.square()' greater than 0 */ + bufs1[PARAMETER_GRADIENT_SQURESUM]->add(1.0); + bufs2[PARAMETER_GRADIENT_SQURESUM]->copyFrom( + *bufs1[PARAMETER_GRADIENT_SQURESUM]); + + real rou = (real)rand() / (real)RAND_MAX; // NOLINT + real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT + real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT + real momentum = (real)rand() / (real)RAND_MAX; // NOLINT + real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT + real accumulatedRou = rou; + + EXPRESSION_PERFORMANCE(RMSPropParameterOptimizer(bufs1, + accumulatedRou, rou, epsilon, learningRate, momentum, decayRate, + isFirstTime)); + + BaseMatrix& value = *bufs2[PARAMETER_VALUE]; + BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; + BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; + BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& sum1 = *bufs2[PARAMETER_GRADIENT_SQURESUM1]; + BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE]; + + EXPRESSION_PERFORMANCE(rmspropApply(value, grad, mom, sum, sum1, lr, + accumulatedRou, rou, epsilon, learningRate, momentum, decayRate, + isFirstTime)); + + CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); + CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM], + bufs2[PARAMETER_GRADIENT_SQURESUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1], + bufs2[PARAMETER_GRADIENT_SQURESUM1]); + CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE], + bufs2[PARAMETER_LEARNING_RATE]); +} + +TEST(Training, RMSProp) { + testCase(testRMSProp); + testCase(testRMSProp); +} + +template +void testDecayedAdagrad(size_t size, bool useGpu) { + VectorPtr bufs1[NUM_PARAMETER_TYPES]; + VectorPtr bufs2[NUM_PARAMETER_TYPES]; + INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu); + + real rou = (real)rand() / (real)RAND_MAX; // NOLINT + real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT + real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT + real momentum = (real)rand() / (real)RAND_MAX; // NOLINT + real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT + real accumulatedRou = rou; + + if (isFirstTime) { + bufs1[PARAMETER_GRADIENT_SQURESUM]->zeroMem(); + bufs2[PARAMETER_GRADIENT_SQURESUM]->zeroMem(); + } + + EXPRESSION_PERFORMANCE(DecayedAdagradParameterOptimizer(bufs1, + accumulatedRou, rou, epsilon, learningRate, momentum, decayRate, + isFirstTime)); + + BaseMatrix& value = *bufs2[PARAMETER_VALUE]; + BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; + BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; + BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE]; + + EXPRESSION_PERFORMANCE(decayedAdagradApply(value, grad, mom, sum, lr, + accumulatedRou, rou, epsilon, learningRate, momentum, decayRate, + isFirstTime)); + + CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); + CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM], + bufs2[PARAMETER_GRADIENT_SQURESUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE], + bufs2[PARAMETER_LEARNING_RATE]); +} + +TEST(Training, DecayedAdagrad) { + testCase(testDecayedAdagrad); + testCase(testDecayedAdagrad); +} + +void testAdam(size_t size, bool useGpu) { + VectorPtr bufs1[NUM_PARAMETER_TYPES]; + VectorPtr bufs2[NUM_PARAMETER_TYPES]; + INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_SECOND_MOMENTUM, size, useGpu); + + real beta1 = (real)rand() / (real)RAND_MAX; // NOLINT + real beta2 = (real)rand() / (real)RAND_MAX; // NOLINT + real beta1_power = (real)rand() / (real)RAND_MAX; // NOLINT + real beta2_power = (real)rand() / (real)RAND_MAX; // NOLINT + real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT + real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT + + EXPRESSION_PERFORMANCE(AdamParameterOptimizer(bufs1, + beta1, beta2, beta1_power, beta2_power, epsilon, learningRate)); + + BaseMatrix& value = *bufs2[PARAMETER_VALUE]; + BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; + BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; + BaseMatrix& v = *bufs2[PARAMETER_SECOND_MOMENTUM]; + + EXPRESSION_PERFORMANCE(adamApply(value, grad, mom, v, + beta1, beta2, beta1_power, beta2_power, epsilon, learningRate)); + + CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); + CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_SECOND_MOMENTUM], + bufs2[PARAMETER_SECOND_MOMENTUM]); +} + +TEST(Training, Adam) { + testCase(testAdam); +} + +void testAdamax(size_t size, bool useGpu) { + VectorPtr bufs1[NUM_PARAMETER_TYPES]; + VectorPtr bufs2[NUM_PARAMETER_TYPES]; + INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_WEIGHTED_INFINITY_NORM, size, useGpu); + + real beta1 = (real)rand() / (real)RAND_MAX; // NOLINT + real beta2 = (real)rand() / (real)RAND_MAX; // NOLINT + real alpha = (real)rand() / (real)RAND_MAX; // NOLINT + int64_t step = 2; + + EXPRESSION_PERFORMANCE(AdamaxParameterOptimizer(bufs1, + beta1, beta2, step, alpha)); + + BaseMatrix& value = *bufs2[PARAMETER_VALUE]; + BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; + BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM]; + BaseMatrix& u = *bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]; + + EXPRESSION_PERFORMANCE(adamaxApply(value, grad, mom, u, + beta1, beta2, step, alpha)); + + CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); + CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]); + CHECK_VECTORPTR(bufs1[PARAMETER_WEIGHTED_INFINITY_NORM], + bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]); +} + +TEST(Training, Adamax) { +#ifndef PADDLE_TYPE_DOUBLE + SetMaxDiff diff(1e-4); +#endif + testCase(testAdamax); +} + +void testSparseMomentum(size_t size, bool useGpu) { + VectorPtr bufs1[NUM_PARAMETER_TYPES]; + VectorPtr bufs2[NUM_PARAMETER_TYPES]; + INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_UT, size, useGpu); + INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_VT, size, useGpu); + + real alpha = (real)rand() / (real)RAND_MAX; // NOLINT + real beta = (real)rand() / (real)RAND_MAX; // NOLINT + real gamma = (real)rand() / (real)RAND_MAX; // NOLINT + real tau = (real)rand() / (real)RAND_MAX; // NOLINT + real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT + + EXPRESSION_PERFORMANCE(SparseMomentumParameterOptimizer(bufs1, + alpha, beta, gamma, tau, learningRate)); + + BaseMatrix& value = *bufs2[PARAMETER_VALUE]; + BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT]; + BaseMatrix& momU = *bufs2[PARAMETER_MOMENTUM_UT]; + BaseMatrix& momV = *bufs2[PARAMETER_MOMENTUM_VT]; + + EXPRESSION_PERFORMANCE(sparseMomentumApply(value, grad, momU, momV, + alpha, beta, gamma, tau, learningRate)); + + CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]); + CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_UT], + bufs2[PARAMETER_MOMENTUM_UT]); + CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_VT], + bufs2[PARAMETER_MOMENTUM_VT]); +} + +TEST(Training, SparseMomentum) { + testCase(testSparseMomentum); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + hl_start(); + hl_init(FLAGS_gpu_id); + return RUN_ALL_TESTS(); +} + diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index e1bda79a8acb16ffb9025ff92afa2bb24d76c4fe..1483e04d0d061937bd6be636bda84f18283d89cd 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -381,8 +381,8 @@ void testMatrixSqrt(int height, int width) { cpuA->randomizeUniform(); gpuA->copyFrom(*cpuA); - cpuA->sqrt(); - gpuA->sqrt(); + cpuA->sqrt2(); + gpuA->sqrt2(); MatrixPtr outputCheck = std::make_shared(height, width); outputCheck->copyFrom(*gpuA); diff --git a/paddle/parameter/FirstOrderOptimizer.cpp b/paddle/parameter/FirstOrderOptimizer.cpp index bb46a51d1e02c6d7e96e33c2cac0585055f026a1..5fd04396c46c5e8538c66d27c4aff965c84db8d5 100644 --- a/paddle/parameter/FirstOrderOptimizer.cpp +++ b/paddle/parameter/FirstOrderOptimizer.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/utils/Util.h" #include "paddle/utils/Flags.h" - +#include "paddle/math/TrainingAlgorithmOp.h" #include "FirstOrderOptimizer.h" #include @@ -113,17 +113,20 @@ void SparseMomentumParameterOptimizer::finishBatch() { void AdagradParameterOptimizer::update(const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId) const { - vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT], - 1.0f); - vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM], - *vecs[PARAMETER_GRADIENT_SQURESUM1]); - vecs[PARAMETER_LEARNING_RATE]->add(optConfig_.ada_epsilon()); - vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); - - vecs[PARAMETER_VALUE]->sgdUpdate( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], learningRate_ * config.learning_rate(), - config.momentum(), applyDecay_ ? config.decay_rate() : 0); + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& accum_buffer = *vecs[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM1]; + BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; + + real epsilon = optConfig_.ada_epsilon(); + real learningRate = learningRate_ * config.learning_rate(); + real momentum = config.momentum(); + real decayRate = applyDecay_ ? config.decay_rate() : 0; + + adagradApply(value, grad, mom, accum_buffer, accum, lr, + epsilon, learningRate, momentum, decayRate); } ParameterOptimizer::TraverseCallback @@ -147,32 +150,32 @@ void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId) const { CHECK(sparseId == -1LU) << "Sparse update is not supported"; - // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 - vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(*vecs[PARAMETER_GRADIENT], - rou_, 1.0f - rou_); - - // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) ) - vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1], - *vecs[PARAMETER_GRADIENT_SQURESUM], - epsilon_, epsilon_); - vecs[PARAMETER_LEARNING_RATE]->sqrt(); - - // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2 - vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_LEARNING_RATE], rou_, - 1.0f - rou_); - - vecs[PARAMETER_VALUE]->sgdUpdate( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], learningRate_ * config.learning_rate(), - config.momentum(), applyDecay_ ? config.decay_rate() : 0); + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& accum_update = *vecs[PARAMETER_GRADIENT_SQURESUM1]; + BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; + + real learningRate = learningRate_ * config.learning_rate(); + real momentum = config.momentum(); + real decayRate = applyDecay_ ? config.decay_rate() : 0; + + adadeltaApply(value, grad, mom, accum, accum_update, lr, + rou_, epsilon_, learningRate, momentum, decayRate); } void RMSPropParameterOptimizer::update(const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId) const { - real accumulatedRou = rou_; + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& sum1 = *vecs[PARAMETER_GRADIENT_SQURESUM1]; + BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; + real accumulatedRou = rou_; bool firstTime = timer_ == 0; if (sparseId != -1LU) { CHECK_LT(sparseId, t0Vec_.size()); @@ -181,37 +184,26 @@ void RMSPropParameterOptimizer::update(const VectorPtr vecs[], t0Vec_[sparseId] = timer_ + 1; } - // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 - // For the first time update, make the sum be the current square - // so that the initial estimation of E(g_t^2) will not be too small. - vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( - *vecs[PARAMETER_GRADIENT], accumulatedRou, - firstTime ? 1.0f : 1.0f - rou_); - - // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g - vecs[PARAMETER_GRADIENT_SQURESUM1]->add(*vecs[PARAMETER_GRADIENT], - accumulatedRou, 1.0f - rou_); - - // learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon ) - // Basiclly if the sign of the gradient changes more often, - // the learning rate will be decreased. - vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]); - vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1], - -1.0f); - vecs[PARAMETER_LEARNING_RATE]->add(optConfig_.ada_epsilon()); - vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); - - vecs[PARAMETER_VALUE]->sgdUpdate( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], learningRate_ * config.learning_rate(), - config.momentum(), applyDecay_ ? config.decay_rate() : 0); + real epsilon = optConfig_.ada_epsilon(); + real learningRate = learningRate_ * config.learning_rate(); + real momentum = config.momentum(); + real decayRate = applyDecay_ ? config.decay_rate() : 0; + + rmspropApply(value, grad, mom, sum, sum1, lr, + accumulatedRou, rou_, epsilon, learningRate, momentum, decayRate, + firstTime); } void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId) const { - real accumulatedRou = rou_; + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM]; + BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE]; + real accumulatedRou = rou_; bool firstTime = timer_ == 0; if (sparseId != -1LU) { CHECK_LT(sparseId, t0Vec_.size()); @@ -220,77 +212,48 @@ void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[], t0Vec_[sparseId] = timer_ + 1; } - // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 - // For the first time update, make the sum be the current square - // so that the initial estimation of E(g_t^2) will not be too small. - vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( - *vecs[PARAMETER_GRADIENT], accumulatedRou, - firstTime ? 1.0f : 1.0f - rou_); - - // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon ) - // Basiclly if the bigger the magnitude gradient is, - // the smaller the learning rate will be. - vecs[PARAMETER_LEARNING_RATE]->assign(optConfig_.ada_epsilon()); - vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]); - vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); - - vecs[PARAMETER_VALUE]->sgdUpdate( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], learningRate_ * config.learning_rate(), - config.momentum(), applyDecay_ ? config.decay_rate() : 0); + real epsilon = optConfig_.ada_epsilon(); + real learningRate = learningRate_ * config.learning_rate(); + real momentum = config.momentum(); + real decayRate = applyDecay_ ? config.decay_rate() : 0; + + decayedAdagradApply(value, grad, mom, sum, lr, + accumulatedRou, rou_, epsilon, learningRate, momentum, decayRate, + firstTime); } void AdamParameterOptimizer::update(const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId) const { CHECK(sparseId == -1UL) << "Sparse update is not supported"; - Vector* m = vecs[PARAMETER_MOMENTUM].get(); - Vector* g = vecs[PARAMETER_GRADIENT].get(); - Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get(); - Vector* theta = vecs[PARAMETER_VALUE].get(); - - // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; - m->add(*g, beta1_, 1 - beta1_); - - // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2 - g->square(); - v->add(*g, beta2_, 1 - beta2_); - - // tmp = m_t / ( \sqrt{v_t} + \epsilon ) - // \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp - g->sqrt(*v); - g->dotDiv(*m, *g, 0., epsilon_); - real alpha = config.learning_rate() * learningRate_; - alpha = alpha * std::sqrt(1 - std::pow(beta2_, step_)) / - (1 - std::pow(beta1_, step_)); - theta->add(*theta, 1.0, *g, -alpha); + real beta1_power = std::pow(beta1_, step_); + real beta2_power = std::pow(beta2_, step_); + real learningRate = config.learning_rate() * learningRate_; + + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& v = *vecs[PARAMETER_SECOND_MOMENTUM]; + + adamApply(value, grad, mom, v, + beta1_, beta2_, beta1_power, beta2_power, epsilon_, learningRate); } void AdamaxParameterOptimizer::update(const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId) const { CHECK(sparseId == -1UL) << "Sparse update is not supported"; - Vector* m = vecs[PARAMETER_MOMENTUM].get(); - Vector* g = vecs[PARAMETER_GRADIENT].get(); - Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get(); - Vector* theta = vecs[PARAMETER_VALUE].get(); - - // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; - m->add(*g, beta1_, 1 - beta1_); + real learningRate = config.learning_rate() * learningRate_; - // u_t = max(\beta_2*u_{t-1}, abs(g_t)) - u->mulScalar(beta2_); - g->abs(); - u->max(*u, *g); + BaseMatrix& value = *vecs[PARAMETER_VALUE]; + BaseMatrix& grad = *vecs[PARAMETER_GRADIENT]; + BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM]; + BaseMatrix& u = *vecs[PARAMETER_WEIGHTED_INFINITY_NORM]; - // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t - g->dotDiv(*m, *u); - real learningRate = config.learning_rate() * learningRate_; - learningRate /= (1 - std::pow(beta1_, step_)); - theta->add(*theta, 1.0, *g, -learningRate); + adamaxApply(value, grad, mom, u, + beta1_, beta2_, step_, learningRate); } - void OptimizerWithGradientClipping::update(const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId) const {