提交 6216b595 编写于 作者: Y Yi Wang

Resolve conflicts

......@@ -19,8 +19,8 @@ automatically runs the following commands:
.. code-block:: base
docker build -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
docker build -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
docker build -t paddle:cpu -f paddle/scripts/docker/Dockerfile .
docker build -t paddle:gpu -f paddle/scripts/docker/Dockerfile.gpu .
To run the CPU-only image as an interactive container:
......@@ -81,3 +81,25 @@ source code:
cd Paddle
docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
Documentation
-------------
Paddle Docker images include an HTML version of C++ source code
generated using `woboq code browser
<https://github.com/woboq/woboq_codebrowser>`_. This makes it easy
for users to browse and understand the C++ source code.
As long as we give the Paddle Docker container a name, we can run an
additional nginx Docker container to serve the volume from the Paddle
container:
.. code-block:: bash
docker run -d --name paddle-cpu-doc paddle:cpu
docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
Then we can direct our Web browser to the HTML version of source code
at http://localhost:8088/paddle/
......@@ -20,11 +20,7 @@ popd > /dev/null
cd $SCRIPTPATH
if [ ! -f ../../dist/*.whl ] ; then # Swig not compiled.
exit 0
fi
rm .test_env -rf
rm -rf .test_env
virtualenv .test_env
source .test_env/bin/activate
......
......@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_MATRIX_TYPE_CUH_
#define HL_MATRIX_TYPE_CUH_
#include "hl_base.h"
#ifdef __CUDA_ARCH__
// typedef void* vecType;
#include <vector_types.h>
#ifndef PADDLE_TYPE_DOUBLE
typedef float4 vecType;
......@@ -37,4 +35,10 @@ typedef __m128d vecType;
#endif
#endif
#endif /* HL_MATRIX_TYPE_CUH_ */
#ifdef __CUDA_ARCH__
#define INLINE __device__ inline
#else
#define INLINE inline
#endif
#endif // HL_MATRIX_TYPE_CUH_
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_TENSOR_OPS_H_
#define HL_TENSOR_OPS_H_
#include <cmath>
#include "hl_matrix_type.cuh"
namespace hppl {
namespace unary {
template <class T>
class add_scale {
private:
const T p;
public:
INLINE add_scale(const T s) : p(s) {}
INLINE T operator()(const T a) const { return a + p; }
};
template <class T>
class sub_scale {
private:
const T p;
public:
INLINE sub_scale(const T s) : p(s) {}
INLINE T operator()(const T a) const { return a - p; }
};
template <class T>
class mul_scale {
private:
const T p;
public:
INLINE mul_scale(const T s) : p(s) {}
INLINE T operator()(const T a) const { return a * p; }
};
template <class T>
class div_scale {
private:
const T p;
public:
INLINE div_scale(const T s) : p(s) {}
INLINE T operator()(const T a) const { return a / p; }
};
template <class T>
class neg {
public:
INLINE T operator()(const T a) const { return -a; }
};
template <class T>
class exp_op {
public:
INLINE T operator()(const T a) const { return std::exp(a); }
};
template <class T>
class log_op {
public:
INLINE T operator()(const T a) const { return std::log(a); }
};
template <class T>
class sqrt_op {
public:
INLINE T operator()(const T a) const { return std::sqrt(a); }
};
template <class T>
class square {
public:
INLINE T operator()(const T a) const { return a * a; }
};
template <class T>
class reciprocal {
public:
INLINE T operator()(const T a) const { return T(1) / a; }
};
template <class T>
class abs {
public:
INLINE T operator()(const T a) const { return a > 0 ? a : -a; }
};
template <class T>
class sign {
public:
INLINE T operator()(const T a) const { return (a > 0) - (a < 0); }
};
template <class T>
class min {
private:
const T p;
public:
INLINE min(const T s) : p(s) {}
INLINE T operator()(const T a) const { return a > p ? p : a; }
};
template <class T>
class max {
private:
const T p;
public:
INLINE max(const T s) : p(s) {}
INLINE T operator()(const T a) const { return a < p ? p : a; }
};
template <class T>
class pow_op {
private:
const T p;
public:
INLINE pow_op(const T s) : p(s) {}
INLINE T operator()(const T a) const { return std::pow(a, p); }
};
template <class T>
class constant {
private:
const T p;
public:
INLINE constant(const T s) : p(s) {}
INLINE T operator()(int i) const { return p; }
INLINE T operator()(int i, int j) const { return p; }
};
template <class T>
class cmp_eq {
private:
const T p;
public:
INLINE cmp_eq(const T s) : p(s) {}
INLINE bool operator()(const T a) const { return a == p; }
};
template <class T>
class cmp_ne {
private:
const T p;
public:
INLINE cmp_ne(const T s) : p(s) {}
INLINE bool operator()(const T a) const { return a != p; }
};
template <class T>
class cmp_le {
private:
const T p;
public:
INLINE cmp_le(const T s) : p(s) {}
INLINE bool operator()(const T a) const { return a <= p; }
};
template <class T>
class cmp_lt {
private:
const T p;
public:
INLINE cmp_lt(const T s) : p(s) {}
INLINE bool operator()(const T a) const { return a < p; }
};
template <class T>
class cmp_ge {
private:
const T p;
public:
INLINE cmp_ge(const T s) : p(s) {}
INLINE bool operator()(const T a) const { return a >= p; }
};
template <class T>
class cmp_gt {
private:
const T p;
public:
INLINE cmp_gt(const T s) : p(s) {}
INLINE bool operator()(const T a) const { return a > p; }
};
template <class T>
class and_op {
private:
const T p;
public:
INLINE and_op(const T s) : p(s) {}
INLINE bool operator()(const T a) const { return a && p; }
};
template <class T>
class or_op {
private:
const T p;
public:
INLINE or_op(const T s) : p(s) {}
INLINE bool operator()(const T a) const { return a || p; }
};
} // namespace unary
namespace binary {
template <class T>
class add {
public:
INLINE T operator()(const T a, const T b) const { return a + b; }
};
template <class T>
class add_scale {
private:
const T p1;
const T p2;
public:
INLINE add_scale(const T s1, const T s2) : p1(s1), p2(s2) {}
INLINE T operator()(const T a, const T b) const { return p1 * a + p2 * b; }
};
template <class T>
class sub {
public:
INLINE T operator()(const T a, const T b) const { return a - b; }
};
template <class T>
class mul {
public:
INLINE T operator()(const T a, const T b) const { return a * b; }
};
template <class T>
class div {
public:
INLINE T operator()(const T a, const T b) const { return a / b; }
};
template <class T>
class cmp_eq {
public:
INLINE bool operator()(const T a, const T b) const { return a == b; }
};
template <class T>
class cmp_ne {
public:
INLINE bool operator()(const T a, const T b) const { return a != b; }
};
template <class T>
class cmp_le {
public:
INLINE bool operator()(const T a, const T b) const { return a <= b; }
};
template <class T>
class cmp_lt {
public:
INLINE bool operator()(const T a, const T b) const { return a < b; }
};
template <class T>
class cmp_ge {
public:
INLINE bool operator()(const T a, const T b) const { return a >= b; }
};
template <class T>
class cmp_gt {
public:
INLINE bool operator()(const T a, const T b) const { return a > b; }
};
template <class T>
class and_op {
public:
INLINE bool operator()(const T a, const T b) const { return a && b; }
};
template <class T>
class or_op {
public:
INLINE bool operator()(const T a, const T b) const { return a || b; }
};
template <class T>
class min {
public:
INLINE T operator()(const T a, const T b) const { return a > b ? b : a; }
};
template <class T>
class max {
public:
INLINE T operator()(const T a, const T b) const { return a < b ? b : a; }
};
} // namespace binary
} // namespace hppl
#endif // HL_TENSOR_OPS_H_
......@@ -289,7 +289,7 @@ void forward(Argument& act) {
useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->abs(*act.value);
act.value->abs2(*act.value);
}
void backward(Argument& act) { act.grad->absDerivative(*act.in); }
......@@ -311,7 +311,7 @@ void forward(Argument& act) {
useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->square(*act.value);
act.value->square2(*act.value);
}
void backward(Argument& act) { act.grad->squareDerivative(*act.in); }
......@@ -324,7 +324,7 @@ END_DEFINE_ACTIVATION(square)
* \f]
*/
BEGIN_DEFINE_ACTIVATION(exponential)
void forward(Argument& act) { act.value->exp(*act.value); }
void forward(Argument& act) { act.value->exp2(*act.value); }
void backward(Argument& act) { act.grad->expDerivative(*act.value); }
END_DEFINE_ACTIVATION(exponential)
......@@ -345,7 +345,7 @@ void forward(Argument& act) {
useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->log(*act.value);
act.value->log2(*act.value);
}
void backward(Argument& act) { act.grad->dotDiv(*act.grad, *act.in); }
......
......@@ -40,7 +40,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
savedMean_->mulScalar(1.0 / numSamples); // E[x]
tmpMat_->assign(*mat);
tmpMat_->square();
tmpMat_->square2();
savedInvVar_->zeroMem();
savedInvVar_->accumulateColSum(*tmpMat_);
savedInvVar_->mulScalar(1.0 / numSamples); // E[x^2]
......@@ -54,7 +54,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
calMovingMeanAndVar();
savedInvVar_->subScalar(-EPS);
savedInvVar_->sqrt(*savedInvVar_);
savedInvVar_->sqrt2(*savedInvVar_);
}
void BatchNormalizationLayer::calMovingMeanAndVar() {
......@@ -85,7 +85,7 @@ void BatchNormalizationLayer::setMeanAndStd() {
savedInvVar_->downClip(real(0.0));
savedInvVar_->subScalar(-EPS);
savedInvVar_->sqrt(*savedInvVar_);
savedInvVar_->sqrt2(*savedInvVar_);
}
void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) {
......
......@@ -115,12 +115,12 @@ void MultiClassCrossEntropyWithSelfNorm::forwardImp(Matrix& output,
Matrix& target) {
Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_);
output.rowSum(*sftMaxSum_);
sftMaxSum_->log();
sftMaxSum_->log2();
target.oneHotCrossEntropy(output, *label.ids);
target.add(*sftMaxSum_);
sftMaxSum_->square();
sftMaxSum_->square2();
target.add(*sftMaxSum_, config_.softmax_selfnorm_alpha());
}
......@@ -131,12 +131,12 @@ void MultiClassCrossEntropyWithSelfNorm::backwardImp(Matrix& output,
output.rowSum(*sftMaxSum_);
Matrix::resizeOrCreate(sumInv_, output.getHeight(), 1, false, useGpu_);
sftMaxSum_->reciprocal(*sumInv_);
sftMaxSum_->reciprocal2(*sumInv_);
outputG.oneHotCrossEntropyBp(output, *label.ids);
outputG.addColumnVector(*sumInv_);
sftMaxSum_->log();
sftMaxSum_->log2();
sumInv_->dotMul(*sumInv_, *sftMaxSum_);
sumInv_->mulScalar(2 * config_.softmax_selfnorm_alpha());
......
......@@ -316,12 +316,12 @@ void Layer::showOutputStats() {
auto tmpMat = dynamic_cast<CpuSparseMatrix*>(outSquare.get());
min = tmpMat->getMin();
max = tmpMat->getMax();
tmpMat->square();
tmpMat->square2();
LOG(INFO) << "show statistics of [none zero values] in sparse matrix";
} else {
min = outSquare->getMin();
max = outSquare->getMax();
outSquare->square();
outSquare->square2();
}
real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean;
std = std > 0 ? std : 0;
......
......@@ -60,7 +60,7 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
expX_->assign(*matX);
// subtract max to avoid overflow or underflow
expX_->mul(maxX_, ones_, (real)-1, (real)1);
expX_->exp();
expX_->exp2();
real* a = a_->getData();
real* b = b_->getData();
......@@ -69,7 +69,7 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
real* expX = expX_->getData();
real* maxX = maxX_->getData();
expW_->exp(*w_);
expW_->exp2(*w_);
real* expW = expW_->getData();
for (int i = 0; i < numClasses_; ++i) {
......
......@@ -99,7 +99,7 @@ void PowerLayer::backward(const UpdateCallback& callback) {
Matrix::resizeOrCreate(tmpMtx, batchSize, dataDim, false, useGpu_);
if (inG0) {
tmpMtx->log(*inV1);
tmpMtx->log2(*inV1);
tmpMtx->dotMul(*tmpMtx, *outV);
// inG0 += outG .* (log(inV1) * outV)
......
......@@ -355,11 +355,11 @@ void BaseMatrixT<T>::neg() { applyUnary(unary::Neg<T>()); }
DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a));
template<>
void BaseMatrixT<real>::exp() { applyUnary(unary::Exp<real>()); }
void BaseMatrixT<real>::exp2() { applyUnary(unary::Exp<real>()); }
DEFINE_MATRIX_UNARY_OP(Log, a = log(a));
template<>
void BaseMatrixT<real>::log() {
void BaseMatrixT<real>::log2() {
if (useGpu_) {
applyUnary(unary::Log<real>());
} else {
......@@ -369,23 +369,23 @@ void BaseMatrixT<real>::log() {
DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a));
template<>
void BaseMatrixT<real>::sqrt() { applyUnary(unary::Sqrt<real>()); }
void BaseMatrixT<real>::sqrt2() { applyUnary(unary::Sqrt<real>()); }
DEFINE_MATRIX_UNARY_OP(Square, a = a * a);
template<class T>
void BaseMatrixT<T>::square() { applyUnary(unary::Square<T>()); }
void BaseMatrixT<T>::square2() { applyUnary(unary::Square<T>()); }
DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a);
template<class T>
void BaseMatrixT<T>::reciprocal() { applyUnary(unary::Reciprocal<T>()); }
void BaseMatrixT<T>::reciprocal2() { applyUnary(unary::Reciprocal<T>()); }
DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a);
template<class T>
void BaseMatrixT<T>::abs() { applyUnary(unary::Abs<T>()); }
void BaseMatrixT<T>::abs2() { applyUnary(unary::Abs<T>()); }
DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0));
template<class T>
void BaseMatrixT<T>::sign() { applyUnary(unary::Sign<T>()); }
void BaseMatrixT<T>::sign2() { applyUnary(unary::Sign<T>()); }
DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
template<class T>
......@@ -405,7 +405,7 @@ void BaseMatrixT<T>::one() { applyUnary(unary::One<T>()); }
DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p));
template<>
void BaseMatrixT<real>::pow(real p) {
void BaseMatrixT<real>::pow2(real p) {
if (useGpu_) {
applyUnary(unary::Pow<real>(p));
} else {
......@@ -534,7 +534,7 @@ void BaseMatrixT<T>::add(BaseMatrixT& b, T p) {
DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p));
template<>
void BaseMatrixT<real>::pow(BaseMatrixT& b, real p) {
void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) {
if (useGpu_) {
applyBinary(binary::Pow<real>(p), b);
} else {
......@@ -615,7 +615,7 @@ void BaseMatrixT<T>::breluDerivative(BaseMatrixT& b) {
DEFINE_MATRIX_BINARY_OP(Square, b = a * a);
template<class T>
void BaseMatrixT<T>::square(BaseMatrixT& b) {
void BaseMatrixT<T>::square2(BaseMatrixT& b) {
applyBinary(binary::Square<T>(), b);
}
......@@ -657,7 +657,7 @@ void BaseMatrixT<T>::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) {
DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a);
template<class T>
void BaseMatrixT<T>::reciprocal(BaseMatrixT& b) {
void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b) {
applyBinary(binary::Reciprocal<T>(), b);
}
......@@ -669,7 +669,7 @@ void BaseMatrixT<T>::reciprocalDerivative(BaseMatrixT& b) {
DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a);
template<class T>
void BaseMatrixT<T>::abs(BaseMatrixT& b) { applyBinary(binary::Abs<T>(), b); }
void BaseMatrixT<T>::abs2(BaseMatrixT& b) { applyBinary(binary::Abs<T>(), b); }
DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0);
template<class T>
......@@ -729,17 +729,19 @@ void BaseMatrixT<T>::expDerivative(BaseMatrixT& b) {
DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f);
template<class T>
void BaseMatrixT<T>::sign(BaseMatrixT& b) { applyBinary(binary::Sign<T>(), b); }
void BaseMatrixT<T>::sign2(BaseMatrixT& b) {
applyBinary(binary::Sign<T>(), b);
}
DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b));
template<>
void BaseMatrixT<real>::exp(BaseMatrixT& b) {
void BaseMatrixT<real>::exp2(BaseMatrixT& b) {
applyBinary(binary::Exp<real>(), b);
}
DEFINE_MATRIX_BINARY_OP(Log, a = log(b));
template<>
void BaseMatrixT<real>::log(BaseMatrixT& b) {
void BaseMatrixT<real>::log2(BaseMatrixT& b) {
if (useGpu_) {
applyBinary(binary::Log<real>(), b);
} else {
......@@ -749,7 +751,7 @@ void BaseMatrixT<real>::log(BaseMatrixT& b) {
DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b));
template<>
void BaseMatrixT<real>::sqrt(BaseMatrixT& b) {
void BaseMatrixT<real>::sqrt2(BaseMatrixT& b) {
applyBinary(binary::Sqrt<real>(), b);
}
......@@ -1065,7 +1067,7 @@ void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c);
template<class T>
void BaseMatrixT<T>::max(BaseMatrixT& b, BaseMatrixT& c) { // NOLINT
void BaseMatrixT<T>::max2(BaseMatrixT& b, BaseMatrixT& c) {
applyTernary(ternary::Max<T>(), b, c);
}
......@@ -1168,7 +1170,7 @@ void BaseMatrixT<T>::reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2,
DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER,
a = 1 / (p1 * b + p2));
template<class T>
void BaseMatrixT<T>::reciprocal(BaseMatrixT& b, T p1, T p2) {
void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b, T p1, T p2) {
applyBinary(binary::Reciprocal2<T>(p1, p2), b);
}
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <cstddef>
#include <stdint.h>
#include "paddle/utils/TypeDefs.h"
#include "TensorExpression.h"
namespace paddle {
......@@ -70,7 +71,7 @@ public:
};
template <class T>
class BaseMatrixT {
class BaseMatrixT : public TensorExpression<BaseMatrixT<T>, T> {
public:
size_t height_, width_;
size_t stride_;
......@@ -427,14 +428,14 @@ public:
*
*/
void neg();
void exp();
void pow(T p);
void log();
void sqrt();
void square();
void reciprocal();
void abs();
void sign();
void exp2();
void pow2(T p);
void log2();
void sqrt2();
void square2();
void reciprocal2();
void abs2();
void sign2();
void zero();
/**
......@@ -603,7 +604,7 @@ public:
* b = this * this
* @endcode
*/
void square(BaseMatrixT& b);
void square2(BaseMatrixT& b);
void squareDerivative(BaseMatrixT& b);
/**
......@@ -627,7 +628,7 @@ public:
* b = 1.0f / this
* @endcode
*/
void reciprocal(BaseMatrixT& b);
void reciprocal2(BaseMatrixT& b);
void reciprocalDerivative(BaseMatrixT& b);
/**
......@@ -635,7 +636,7 @@ public:
* b = this > 0.0f ? this : -this
* @endcode
*/
void abs(BaseMatrixT& b);
void abs2(BaseMatrixT& b);
void absDerivative(BaseMatrixT& b);
/**
......@@ -653,12 +654,12 @@ public:
*/
void expDerivative(BaseMatrixT& b);
void sign(BaseMatrixT& b);
void sign2(BaseMatrixT& b);
void exp(BaseMatrixT& b);
void pow(BaseMatrixT& b, T p);
void log(BaseMatrixT& b);
void sqrt(BaseMatrixT& b);
void exp2(BaseMatrixT& b);
void pow2(BaseMatrixT& b, T p);
void log2(BaseMatrixT& b);
void sqrt2(BaseMatrixT& b);
void addScalar(BaseMatrixT& b, T p);
void subScalar(BaseMatrixT& b, T p);
void mulScalar(BaseMatrixT& b, T p);
......@@ -828,7 +829,7 @@ public:
* this = b>c ? b : c
* @endcode
*/
void max(BaseMatrixT& b, BaseMatrixT& c); // NOLINT
void max2(BaseMatrixT& b, BaseMatrixT& c);
/**
* @code
......@@ -927,7 +928,7 @@ public:
* this = 1 / (p1 * b + p2)
* @endcode
*/
void reciprocal(BaseMatrixT& b, T p1, T p2);
void reciprocal2(BaseMatrixT& b, T p1, T p2);
/**
* @code
......@@ -1050,6 +1051,32 @@ public:
void rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
virtual bool isSparse() const { return false; }
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
if (useGpu_) {
TensorGpuApply<T>(*this, expr);
} else {
TensorCpuApply<T>(*this, expr);
}
}
template <typename ExpressionType>
void operator+=(const ExpressionType& expr) {
(*this) = (*this) + expr;
}
template <typename ExpressionType>
void operator-=(const ExpressionType& expr) {
(*this) = (*this) - expr;
}
template <typename ExpressionType>
void operator*=(const ExpressionType& expr) {
(*this) = (*this) * expr;
}
template <typename ExpressionType>
void operator/=(const ExpressionType& expr) {
(*this) = (*this) / expr;
}
};
typedef BaseMatrixT<real> BaseMatrix;
......
......@@ -16,10 +16,12 @@ file(GLOB MATH_HEADERS . *.h)
file(GLOB MATH_SOURCES . *.cpp)
set(MATH_SOURCES
"${PROJ_ROOT}/paddle/math/BaseMatrix.cu"
"${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu"
${MATH_SOURCES})
if(NOT WITH_GPU)
# then compile BaseMatrix.cu as c++ file
compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/BaseMatrix.cu")
compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu")
add_library(paddle_math STATIC
${MATH_SOURCES})
else()
......
......@@ -136,7 +136,7 @@ public:
return sum;
}
virtual void square() {
virtual void square2() {
CHECK(isContiguous());
if (valueType_ == NO_VALUE) {
return;
......
......@@ -1122,6 +1122,7 @@ public:
virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
LOG(FATAL) << "Not implemented";
}
virtual void bilinearForward(const Matrix& in,
const size_t inImgH,
const size_t inImgW,
......@@ -1142,6 +1143,15 @@ public:
const real ratioW) {
LOG(FATAL) << "Not implemented";
}
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
if (useGpu_) {
TensorGpuApply<real>(*this, expr);
} else {
TensorCpuApply<real>(*this, expr);
}
}
};
inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {
......@@ -1518,6 +1528,11 @@ public:
void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
TensorGpuApply<real>(*this, expr);
}
};
class CpuMatrix : public Matrix {
......@@ -1917,6 +1932,11 @@ public:
const size_t numChannels,
const real ratioH,
const real ratioW);
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
TensorCpuApply<real>(*this, expr);
}
};
class SharedCpuMatrix : public CpuMatrix {
......@@ -1957,6 +1977,7 @@ public:
void add(real p1, real p2);
private:
using Matrix::mul;
void initShared(int blockNum);
void initBlock(int blockNum);
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
/**
* \brief The tensor evaluator classes.
*/
template <typename Derived, class T>
class TensorApply {
public:
explicit INLINE TensorApply(const Derived& p)
: data_(p.data_),
stride_(p.stride_),
height_(p.height_),
width_(p.width_),
useGpu_(p.useGpu_) {}
INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
INLINE T apply(int index) const { return data_[index]; }
INLINE T& applyRef(int i, int j) { return data_[i * stride_ + j]; }
INLINE T& applyRef(int index) { return data_[index]; }
INLINE size_t getWidth() const { return width_; }
INLINE size_t getHeight() const { return height_; }
INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
INLINE bool useGpu() const { return useGpu_; }
T* data_;
size_t stride_;
size_t height_;
size_t width_;
bool useGpu_;
};
/**
* \brief The tensor evaluator classes.
* evaluator for rvalues
*/
template <typename Derived, class T>
class TensorApply<const Derived, T> {
public:
explicit INLINE TensorApply(const Derived& p)
: data_(p.data_),
stride_(p.stride_),
height_(p.height_),
width_(p.width_),
useGpu_(p.useGpu_) {}
INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
INLINE T apply(int index) const { return data_[index]; }
INLINE size_t getWidth() const { return width_; }
INLINE size_t getHeight() const { return height_; }
INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
INLINE bool useGpu() const { return useGpu_; }
const T* data_;
size_t stride_;
size_t height_;
size_t width_;
bool useGpu_;
};
template <typename Derived, class T>
class TensorApply<const TensorExpression<Derived, T>, T> {
public:
explicit TensorApply(const TensorExpression<Derived, T>& expr)
: expr_(expr.derived()) {}
INLINE T apply(int i, int j) const { return expr_.apply(i, j); }
INLINE T apply(int index) const { return expr_.apply(index); }
INLINE size_t getWidth() const { return expr_.getWidth(); }
INLINE size_t getHeight() const { return expr_.getHeight(); }
INLINE bool isContiguous() const { return expr_.isContiguous(); }
INLINE bool useGpu() const { return expr_.useGpu(); }
TensorApply<const Derived, T> expr_;
};
/**
* \brief The unary expression evaluator classes.
*/
template <class OP, typename ArgType, class T>
class TensorApply<const TensorUnaryOp<OP, ArgType, T>, T> {
public:
explicit INLINE TensorApply(const TensorUnaryOp<OP, ArgType, T>& expr)
: op_(expr.op_), expr_(expr.expr_) {}
INLINE T apply(int i, int j) const { return op_(expr_.apply(i, j)); }
INLINE T apply(int index) const { return op_(expr_.apply(index)); }
INLINE size_t getWidth() const { return expr_.getWidth(); }
INLINE size_t getHeight() const { return expr_.getHeight(); }
INLINE bool isContiguous() const { return expr_.isContiguous(); }
INLINE bool useGpu() const { return expr_.useGpu(); }
const OP op_;
TensorApply<ArgType, T> expr_;
};
/**
* \brief The binary expression evaluator classes.
*/
template <class OP, typename LhsType, typename RhsType, class T>
class TensorApply<const TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
public:
explicit INLINE TensorApply(
const TensorBinaryOp<OP, LhsType, RhsType, T>& expr)
: op_(expr.op_), lhs_(expr.lhs_), rhs_(expr.rhs_) {
#ifndef __CUDA_ARCH__
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
#endif
}
INLINE T apply(int i, int j) const {
return op_(lhs_.apply(i, j), rhs_.apply(i, j));
}
INLINE T apply(int index) const {
return op_(lhs_.apply(index), rhs_.apply(index));
}
INLINE size_t getWidth() const { return lhs_.getWidth(); }
INLINE size_t getHeight() const { return rhs_.getHeight(); }
INLINE bool isContiguous() const {
return lhs_.isContiguous() && rhs_.isContiguous();
}
INLINE bool useGpu() const { return lhs_.useGpu(); }
const OP op_;
TensorApply<LhsType, T> lhs_;
TensorApply<RhsType, T> rhs_;
};
/**
* \brief The ternary expression evaluator classes.
*/
template <typename ArgType1, typename ArgType2, typename ArgType3, class T>
class TensorApply<const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>, T> {
public:
explicit INLINE TensorApply(
const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>& expr)
: expr1_(expr.expr1_), expr2_(expr.expr2_), expr3_(expr.expr3_) {
#ifndef __CUDA_ARCH__
CHECK_EQ(expr1_.getWidth(), expr2_.getWidth());
CHECK_EQ(expr1_.getWidth(), expr3_.getWidth());
CHECK_EQ(expr1_.getHeight(), expr2_.getHeight());
CHECK_EQ(expr1_.getHeight(), expr3_.getHeight());
CHECK_EQ(expr1_.useGpu(), expr2_.useGpu());
CHECK_EQ(expr1_.useGpu(), expr3_.useGpu());
#endif
}
INLINE T apply(int i, int j) const {
return expr1_.apply(i, j) ? expr2_.apply(i, j) : expr3_.apply(i, j);
}
INLINE T apply(int index) const {
return expr1_.apply(index) ? expr2_.apply(index) : expr3_.apply(index);
}
INLINE size_t getWidth() const { return expr1_.getWidth(); }
INLINE size_t getHeight() const { return expr1_.getHeight(); }
INLINE bool isContiguous() const {
return expr1_.isContiguous() && expr2_.isContiguous() &&
expr3_.isContiguous();
}
INLINE bool useGpu() const { return expr1_.useGpu(); }
TensorApply<ArgType1, T> expr1_;
TensorApply<ArgType2, T> expr2_;
TensorApply<ArgType3, T> expr3_;
};
/**
* \brief The const expression evaluator classes.
*/
template <class OP, typename ArgType, class T>
class TensorApply<const TensorConstant<OP, ArgType, T>, T> {
public:
explicit INLINE TensorApply(const TensorConstant<OP, ArgType, T>& expr)
: op_(expr.op_), expr_(expr.expr_) {}
INLINE T apply(int i, int j) const { return op_(i, j); }
INLINE T apply(int index) const { return op_(index); }
INLINE size_t getWidth() const { return expr_.getWidth(); }
INLINE size_t getHeight() const { return expr_.getHeight(); }
INLINE bool isContiguous() const { return true; }
INLINE bool useGpu() const { return expr_.useGpu(); }
const OP op_;
TensorApply<ArgType, T> expr_;
};
} // namespace paddle
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "paddle/utils/Logging.h"
namespace paddle {
/**
* \brief Tensor Assign Expression(return by lazyAssign,
* and evaluated by AssignEvaluate)
*/
template <typename LhsType, typename RhsType, class T>
class TensorAssignOp {
public:
explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs)
: lhs_(lhs), rhs_(rhs) {
#ifndef __CUDA_ARCH__
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
#endif
}
INLINE void apply(const int i, const int j) {
lhs_.applyRef(i, j) = rhs_.apply(i, j);
}
INLINE void apply(const int index) {
lhs_.applyRef(index) = rhs_.apply(index);
}
INLINE size_t getWidth() const { return lhs_.getWidth(); }
INLINE size_t getHeight() const { return rhs_.getHeight(); }
INLINE bool isContiguous() const {
return lhs_.isContiguous() && rhs_.isContiguous();
}
INLINE bool useGpu() const { return lhs_.useGpu(); }
private:
TensorApply<LhsType, T> lhs_;
TensorApply<const RhsType, T> rhs_;
};
template <typename Assign, typename... AssignOp>
void AssignCpuEvaluate(int height,
int width,
bool isContiguous,
Assign&& assign,
AssignOp&&... args) {
if (isContiguous) {
int size = height * width;
for (int index = 0; index < size; index++) {
assign.apply(index);
__attribute__((unused)) int dummy[] = {(((args)).apply(index), 0)...};
}
} else {
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
assign.apply(i, j);
__attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...};
}
}
}
}
#ifdef __NVCC__
template <typename Assign, typename... AssignOp>
__global__ void AssignGpuEvaluate1(const int border,
Assign assign,
AssignOp... args) {
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < border) {
assign.apply(idx);
__attribute__((unused)) int dummy[] = {(((args)).apply(idx), 0)...};
}
}
template <typename Assign, typename... AssignOp>
__global__ void AssignGpuEvaluate2(const int height,
const int width,
Assign assign,
AssignOp... args) {
const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
for (int i = rowIdx; i < height; i += gridDim.y * blockDim.y) {
for (int j = colIdx; j < width; j += gridDim.x * blockDim.x) {
assign.apply(i, j);
__attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...};
}
}
}
#endif
/**
* \brief Evaluate one or more TensorAssignOp objects.
*
* \note At least one assignment expression is required
*/
template <typename Assign, typename... AssignOp>
void AssignEvaluate(Assign&& assign, AssignOp&&... args) {
const bool useGpu_ = assign.useGpu();
bool isContiguous_ = assign.isContiguous();
const size_t height = assign.getHeight();
const size_t width = assign.getWidth();
const int packSize = sizeof...(args);
const bool packUseGpu[] = {((args)).useGpu()...};
const bool packIsContiguous[] = {((args)).isContiguous()...};
const size_t packHeight[] = {((args)).getHeight()...};
const size_t packWidth[] = {((args)).getWidth()...};
for (int i = 0; i < packSize; i++) {
CHECK_EQ(useGpu_, packUseGpu[i]);
CHECK_EQ(height, packHeight[i]);
CHECK_EQ(width, packWidth[i]);
isContiguous_ = isContiguous_ && packIsContiguous[i];
}
if (useGpu_) {
#ifdef __NVCC__
if (isContiguous_) {
int size = height * width;
int blockSize = size <= 1024 ? size : 1024;
int gridSize = (size + 1024 - 1) / 1024;
AssignGpuEvaluate1<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
size, assign, args...);
} else {
int blockSizeY = std::min(32, (int)height);
int blockSizeX = (32 / blockSizeY) * 32;
int gridSizeX = std::min(32, (int)(width + blockSizeX - 1) / blockSizeX);
int gridSizeY = std::min(32, (int)(height + blockSizeY - 1) / blockSizeY);
dim3 threads(blockSizeX, blockSizeY);
dim3 grid(gridSizeX, gridSizeY);
AssignGpuEvaluate2<<<grid, threads, 0, STREAM_DEFAULT>>>(
height, width, assign, args...);
}
CHECK_SYNC("AssignEvaluate failed");
#endif
} else {
AssignCpuEvaluate(height, width, isContiguous_, assign, args...);
}
}
} // namespace paddle
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "paddle/utils/Logging.h"
#include "hl_base.h"
namespace paddle {
/**
* \brief The tensor cpu evaluate api.
*/
template <class T, typename LeftType, typename RightType>
inline void TensorCpuApply(LeftType& lhs, const RightType& rhs) {
TensorApply<LeftType, T> lhs_(lhs);
TensorApply<const RightType, T> rhs_(rhs);
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
int height = lhs_.getHeight();
int width = lhs_.getWidth();
if (lhs_.isContiguous() && rhs_.isContiguous()) {
int size = height * width;
for (int index = 0; index < size; index++) {
lhs_.applyRef(index) = rhs_.apply(index);
}
} else {
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
lhs_.applyRef(i, j) = rhs_.apply(i, j);
}
}
}
}
#ifdef __NVCC__
template <typename LeftType, typename RightType>
__global__ void TensorElementWiseOp(LeftType lhs,
RightType rhs,
const int border) {
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < border) {
lhs.applyRef(idx) = rhs.apply(idx);
}
}
template <typename LeftType, typename RightType>
__global__ void TensorElementWiseOp(LeftType lhs, RightType rhs) {
const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
for (int i = rowIdx; i < lhs.getHeight(); i += gridDim.y * blockDim.y) {
for (int j = colIdx; j < lhs.getWidth(); j += gridDim.x * blockDim.x) {
lhs.applyRef(i, j) = rhs.apply(i, j);
}
}
}
/**
* \brief The tensor gpu evaluate api.
*/
template <class T, typename LeftType, typename RightType>
inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {
TensorApply<LeftType, T> lhs_(lhs);
TensorApply<const RightType, T> rhs_(rhs);
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
int dimM = lhs_.getHeight();
int dimN = lhs_.getWidth();
if (lhs_.isContiguous() && rhs_.isContiguous()) {
int size = dimM * dimN;
int blockSize = size <= 1024 ? size : 1024;
int gridSize = (size + 1024 - 1) / 1024;
TensorElementWiseOp<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
lhs_, rhs_, size);
} else {
int blockSizeY = std::min(32, dimM);
int blockSizeX = (32 / blockSizeY) * 32;
int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
dim3 threads(blockSizeX, blockSizeY);
dim3 grid(gridSizeX, gridSizeY);
TensorElementWiseOp<<<grid, threads, 0, STREAM_DEFAULT>>>(lhs_, rhs_);
}
CHECK_SYNC("TensorGpuApply failed");
}
#else
template <class T, typename LeftType, typename RightType>
inline void TensorGpuApply(LeftType& lhs, RightType& rhs) {}
#endif
} // namespace paddle
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstddef>
#include <stdint.h>
#include "paddle/utils/TypeDefs.h"
#include "paddle/utils/Logging.h"
#include "hl_tensor_ops.h"
namespace paddle {
template <class OP, typename ExprType, class T>
class TensorConstant;
template <class OP, typename ExprType, class T>
class TensorUnaryOp;
template <class OP, typename LhsType, typename RhsType, class T>
class TensorBinaryOp;
template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
class TensorTernaryOp;
template <typename LhsType, typename RhsType, class T>
class TensorAssignOp;
/**
* \brief Tensor base class.
*
* This is the base class of all Tensor and Expression class.
*/
template <typename Derived, class T>
class TensorExpression {
public:
/**
* Element wise unary expression.
*/
template <typename UnaryOp>
const TensorUnaryOp<UnaryOp, const Derived, T> unaryExpression(
const UnaryOp& op) const {
return TensorUnaryOp<UnaryOp, const Derived, T>(op, derived());
}
const TensorUnaryOp<hppl::unary::add_scale<T>, const Derived, T> operator+(
T p) const {
return unaryExpression(hppl::unary::add_scale<T>(p));
}
const TensorUnaryOp<hppl::unary::sub_scale<T>, const Derived, T> operator-(
T p) const {
return unaryExpression(hppl::unary::sub_scale<T>(p));
}
const TensorUnaryOp<hppl::unary::mul_scale<T>, const Derived, T> operator*(
T p) const {
return unaryExpression(hppl::unary::mul_scale<T>(p));
}
const TensorUnaryOp<hppl::unary::div_scale<T>, const Derived, T> operator/(
T p) const {
return unaryExpression(hppl::unary::div_scale<T>(p));
}
const TensorUnaryOp<hppl::unary::neg<T>, const Derived, T> operator-() const {
return unaryExpression(hppl::unary::neg<T>());
}
const TensorUnaryOp<hppl::unary::exp_op<T>, const Derived, T> exp() const {
return unaryExpression(hppl::unary::exp_op<T>());
}
const TensorUnaryOp<hppl::unary::log_op<T>, const Derived, T> log() const {
return unaryExpression(hppl::unary::log_op<T>());
}
const TensorUnaryOp<hppl::unary::sqrt_op<T>, const Derived, T> sqrt() const {
return unaryExpression(hppl::unary::sqrt_op<T>());
}
const TensorUnaryOp<hppl::unary::square<T>, const Derived, T> square() const {
return unaryExpression(hppl::unary::square<T>());
}
const TensorUnaryOp<hppl::unary::reciprocal<T>, const Derived, T> reciprocal()
const {
return unaryExpression(hppl::unary::reciprocal<T>());
}
const TensorUnaryOp<hppl::unary::abs<T>, const Derived, T> abs() const {
return unaryExpression(hppl::unary::abs<T>());
}
const TensorUnaryOp<hppl::unary::sign<T>, const Derived, T> sign() const {
return unaryExpression(hppl::unary::sign<T>());
}
const TensorUnaryOp<hppl::unary::pow_op<T>, const Derived, T> pow(T p) const {
return unaryExpression(hppl::unary::pow_op<T>(p));
}
const TensorUnaryOp<hppl::unary::min<T>, const Derived, T> min(T p) const {
return unaryExpression(hppl::unary::min<T>(p));
}
const TensorUnaryOp<hppl::unary::max<T>, const Derived, T> max(T p) const {
return unaryExpression(hppl::unary::max<T>(p));
}
const TensorUnaryOp<hppl::unary::cmp_eq<T>, const Derived, T> operator==(
T p) const {
return unaryExpression(hppl::unary::cmp_eq<T>(p));
}
const TensorUnaryOp<hppl::unary::cmp_ne<T>, const Derived, T> operator!=(
T p) const {
return unaryExpression(hppl::unary::cmp_ne<T>(p));
}
const TensorUnaryOp<hppl::unary::cmp_le<T>, const Derived, T> operator<=(
T p) const {
return unaryExpression(hppl::unary::cmp_le<T>(p));
}
const TensorUnaryOp<hppl::unary::cmp_lt<T>, const Derived, T> operator<(
T p) const {
return unaryExpression(hppl::unary::cmp_lt<T>(p));
}
const TensorUnaryOp<hppl::unary::cmp_ge<T>, const Derived, T> operator>=(
T p) const {
return unaryExpression(hppl::unary::cmp_ge<T>(p));
}
const TensorUnaryOp<hppl::unary::cmp_gt<T>, const Derived, T> operator>(
T p) const {
return unaryExpression(hppl::unary::cmp_gt<T>(p));
}
const TensorUnaryOp<hppl::unary::and_op<T>, const Derived, T> operator&&(
T p) const {
return unaryExpression(hppl::unary::and_op<T>(p));
}
const TensorUnaryOp<hppl::unary::or_op<T>, const Derived, T> operator||(
T p) const {
return unaryExpression(hppl::unary::or_op<T>(p));
}
/**
* Element wise binary expression.
*/
template <typename BinaryOp, typename ExpressionType>
const TensorBinaryOp<BinaryOp, const Derived, const ExpressionType, T>
binaryExpression(const BinaryOp& op, const ExpressionType& expr) const {
return TensorBinaryOp<BinaryOp, const Derived, const ExpressionType, T>(
op, derived(), expr);
}
template <typename ExpressionType>
const TensorBinaryOp<hppl::binary::cmp_eq<T>,
const Derived,
const ExpressionType,
T>
operator==(const ExpressionType& expr) const {
return binaryExpression(hppl::binary::cmp_eq<T>(), expr);
}
template <typename ExpressionType>
const TensorBinaryOp<hppl::binary::cmp_ne<T>,
const Derived,
const ExpressionType,
T>
operator!=(const ExpressionType& expr) const {
return binaryExpression(hppl::binary::cmp_ne<T>(), expr);
}
template <typename ExpressionType>
const TensorBinaryOp<hppl::binary::cmp_le<T>,
const Derived,
const ExpressionType,
T>
operator<=(const ExpressionType& expr) const {
return binaryExpression(hppl::binary::cmp_le<T>(), expr);
}
template <typename ExpressionType>
const TensorBinaryOp<hppl::binary::cmp_lt<T>,
const Derived,
const ExpressionType,
T>
operator<(const ExpressionType& expr) const {
return binaryExpression(hppl::binary::cmp_lt<T>(), expr);
}
template <typename ExpressionType>
const TensorBinaryOp<hppl::binary::cmp_ge<T>,
const Derived,
const ExpressionType,
T>
operator>=(const ExpressionType& expr) const {
return binaryExpression(hppl::binary::cmp_ge<T>(), expr);
}
template <typename ExpressionType>
const TensorBinaryOp<hppl::binary::cmp_gt<T>,
const Derived,
const ExpressionType,
T>
operator>(const ExpressionType& expr) const {
return binaryExpression(hppl::binary::cmp_gt<T>(), expr);
}
template <typename ExpressionType>
const TensorBinaryOp<hppl::binary::and_op<T>,
const Derived,
const ExpressionType,
T>
operator&&(const ExpressionType& expr) const {
return binaryExpression(hppl::binary::and_op<T>(), expr);
}
template <typename ExpressionType>
const TensorBinaryOp<hppl::binary::or_op<T>,
const Derived,
const ExpressionType,
T>
operator||(const ExpressionType& expr) const {
return binaryExpression(hppl::binary::or_op<T>(), expr);
}
template <typename ExpressionType>
const TensorBinaryOp<hppl::binary::add<T>,
const Derived,
const ExpressionType,
T>
operator+(const ExpressionType& expr) const {
return binaryExpression(hppl::binary::add<T>(), expr);
}
template <typename ExpressionType>
const TensorBinaryOp<hppl::binary::sub<T>,
const Derived,
const ExpressionType,
T>
operator-(const ExpressionType& expr) const {
return binaryExpression(hppl::binary::sub<T>(), expr);
}
template <typename ExpressionType>
const TensorBinaryOp<hppl::binary::mul<T>,
const Derived,
const ExpressionType,
T>
operator*(const ExpressionType& expr) const {
return binaryExpression(hppl::binary::mul<T>(), expr);
}
template <typename ExpressionType>
const TensorBinaryOp<hppl::binary::div<T>,
const Derived,
const ExpressionType,
T>
operator/(const ExpressionType& expr) const {
return binaryExpression(hppl::binary::div<T>(), expr);
}
template <typename ExpressionType>
const TensorBinaryOp<hppl::binary::min<T>,
const Derived,
const ExpressionType,
T>
min(const ExpressionType& expr) const {
return binaryExpression(hppl::binary::min<T>(), expr);
}
template <typename ExpressionType>
const TensorBinaryOp<hppl::binary::max<T>,
const Derived,
const ExpressionType,
T>
max(const ExpressionType& expr) const {
return binaryExpression(hppl::binary::max<T>(), expr);
}
/**
* Element wise ternary expression.
*
* ternary conditional operator(?: operator).
* The conditional expression returns one of two values depending on
* the result of derived expression.
* If derived expression evaluates to true, then expression1 is evaluated.
* If derived expression evaluates to false, then expression2 is evaluated.
*/
template <typename ExprType1, typename ExprType2>
const TensorTernaryOp<const Derived, const ExprType1, const ExprType2, T>
condition(const ExprType1& expr1, const ExprType2& expr2) const {
return TensorTernaryOp<const Derived, const ExprType1, const ExprType2, T>(
derived(), expr1, expr2);
}
template <typename ExprType>
const TensorTernaryOp<
const Derived,
const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
const ExprType,
T>
condition(T p, const ExprType& expr) const {
return condition(constant(p), expr);
}
template <typename ExprType>
const TensorTernaryOp<
const Derived,
const ExprType,
const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
T>
condition(const ExprType& expr, T p) const {
return condition(expr, constant(p));
}
const TensorTernaryOp<
const Derived,
const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
T>
condition(T p1, T p2) const {
return condition(constant(p1), constant(p2));
}
/**
* return a TensorConstant. A TensorConstant object hold a constant value.
*/
const TensorConstant<hppl::unary::constant<T>, const Derived, T> constant(
T p) const {
return TensorConstant<hppl::unary::constant<T>, const Derived, T>(
hppl::unary::constant<T>(p), derived());
}
/**
* return a TensorAssignOp, and use AssignEvaluate to evaluate one or more
* TensorAssignOp objects.
*/
template <typename ExpressionType>
TensorAssignOp<Derived, ExpressionType, T> lazyAssign(
const ExpressionType& expr) const {
return TensorAssignOp<Derived, ExpressionType, T>(derived(), expr);
}
protected:
const Derived& derived() const { return *static_cast<const Derived*>(this); }
};
/**
* \brief Unary Operator Expression
*/
template <class OP, typename ExprType, class T>
class TensorUnaryOp
: public TensorExpression<TensorUnaryOp<OP, ExprType, T>, T> {
public:
explicit TensorUnaryOp(const OP op, const ExprType& expr)
: op_(op), expr_(expr) {}
const OP op_;
const ExprType expr_;
};
/**
* \brief Binary Operator Expression
*/
template <class OP, typename LhsType, typename RhsType, class T>
class TensorBinaryOp
: public TensorExpression<TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
public:
explicit TensorBinaryOp(const OP op, const LhsType& lhs, const RhsType& rhs)
: op_(op), lhs_(lhs), rhs_(rhs) {}
const OP op_;
const LhsType lhs_;
const RhsType rhs_;
};
/**
* \brief Ternary Operator Expression
*/
template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
class TensorTernaryOp : public TensorExpression<
TensorTernaryOp<ExprType1, ExprType2, ExprType3, T>,
T> {
public:
explicit TensorTernaryOp(const ExprType1& expr1,
const ExprType2& expr2,
const ExprType3& expr3)
: expr1_(expr1), expr2_(expr2), expr3_(expr3) {}
const ExprType1 expr1_;
const ExprType2 expr2_;
const ExprType3 expr3_;
};
/**
* \brief Constant Expression
*/
template <class OP, typename ExprType, class T>
class TensorConstant
: public TensorExpression<TensorConstant<OP, ExprType, T>, T> {
public:
explicit TensorConstant(const OP op, const ExprType& expr)
: op_(op), expr_(expr) {}
const OP op_;
const ExprType expr_;
};
/**
* \brief operator+ overload
* \return a unary operator expression
*/
template <typename Derived, class T>
const TensorUnaryOp<hppl::unary::add_scale<T>, const Derived, T> operator+(
T p, const TensorExpression<Derived, T>& expr) {
return expr + p;
}
/**
* \brief operator* overload
* \return a unary operator expression
*/
template <typename Derived, class T>
const TensorUnaryOp<hppl::unary::mul_scale<T>, const Derived, T> operator*(
T p, const TensorExpression<Derived, T>& expr) {
return expr * p;
}
} // namespace paddle
#include "TensorApply.h"
#include "TensorEvaluate.h"
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/utils/Logging.h"
#include "BaseMatrix.h"
#include "TrainingAlgorithmOp.h"
#if __cplusplus > 199711L
#include "TensorAssign.h"
namespace paddle {
void sparseMomentumApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& momU,
BaseMatrix& momV,
real alpha,
real beta,
real gamma,
real tau,
real learningRate) {
auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad);
auto expr2 = momV.lazyAssign(
momV + (tau * alpha * gamma * learningRate) * grad);
auto expr3 = value.lazyAssign(
(tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV);
AssignEvaluate(expr1, expr2, expr3);
}
void adadeltaApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom,
BaseMatrix& accum,
BaseMatrix& accum_update,
BaseMatrix& lr,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate) {
auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square());
auto expr2 = lr.lazyAssign(
((accum_update + epsilon) / (accum + epsilon)).sqrt());
auto expr3 = accum_update.lazyAssign(
rou * accum_update + ((real)1 - rou) * (grad * lr).square());
auto expr4 = mom.lazyAssign(
mom * momentum - learningRate * lr * (grad + value * decayRate));
auto expr5 = value.lazyAssign(value + mom);
AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
}
void adagradApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom,
BaseMatrix& accum_buffer,
BaseMatrix& accum,
BaseMatrix& lr,
real epsilon,
real learningRate,
real momentum,
real decayRate) {
auto expr1 = accum.lazyAssign(accum + grad.square());
auto expr2 = lr.lazyAssign(
(accum_buffer + accum + epsilon).sqrt().reciprocal());
auto expr3 = mom.lazyAssign(
mom * momentum - learningRate * lr * (grad + value * decayRate));
auto expr4 = value.lazyAssign(value + mom);
AssignEvaluate(expr1, expr2, expr3, expr4);
}
void rmspropApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom,
BaseMatrix& g,
BaseMatrix& f,
BaseMatrix& lr,
real accumulatedRou,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate,
bool firstTime) {
auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad);
auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal());
auto expr4 = mom.lazyAssign(
mom * momentum - learningRate * lr * (grad + value * decayRate));
auto expr5 = value.lazyAssign(value + mom);
if (firstTime) {
auto expr1 = g.lazyAssign(accumulatedRou * g + grad.square());
AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
} else {
auto expr1 = g.lazyAssign(
accumulatedRou * g + ((real)1 - rou) * grad.square());
AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
}
}
void decayedAdagradApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom,
BaseMatrix& accum,
BaseMatrix& lr,
real accumulatedRou,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate,
bool firstTime) {
auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal());
auto expr3 = mom.lazyAssign(
mom * momentum - learningRate * lr * (grad + value * decayRate));
auto expr4 = value.lazyAssign(value + mom);
if (firstTime) {
auto expr1 = accum.lazyAssign(accumulatedRou * accum + grad.square());
AssignEvaluate(expr1, expr2, expr3, expr4);
} else {
auto expr1 = accum.lazyAssign(
accumulatedRou * accum + ((real)1 - rou) * grad.square());
AssignEvaluate(expr1, expr2, expr3, expr4);
}
}
void adamApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom, // firse moment
BaseMatrix& v, // second moment
real beta1,
real beta2,
real beta1_power,
real beta2_power,
real epsilon,
real learningRate) {
real alpha = learningRate *
std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square());
auto expr3 = value.lazyAssign(
value - (mom * alpha) / (v.sqrt() + epsilon));
AssignEvaluate(expr1, expr2, expr3);
}
void adamaxApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom, // firse moment
BaseMatrix& u, // weighted infinity norm
real beta1,
real beta2,
int64_t step,
real alpha) {
auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
auto expr2 = u.lazyAssign(
(beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()));
auto expr3 = value.lazyAssign(
value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u));
AssignEvaluate(expr1, expr2, expr3);
}
} // namespace paddle
#else
namespace paddle {
void sparseMomentumApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& momU,
BaseMatrix& momV,
real alpha,
real beta,
real gamma,
real tau,
real learningRate) {
/**
* \alpha_t = \alpha_{t-1} / k
* \beta_t = \beta_{t-1} / (1 + \lambda\gamma_t)
* u_t = u_{t-1} - \alpha_t \gamma_t g_t
* v_t = v_{t-1} + \tau_{t-1} \alpha_t \gamma_t g_t
* \tau_t = \tau_{t-1} + \beta_t / \alpha_t
*/
momU -= (alpha * gamma * learningRate) * grad;
momV += (tau * alpha * gamma * learningRate) * grad;
value = (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV;
}
void adadeltaApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom,
BaseMatrix& accum,
BaseMatrix& accum_update,
BaseMatrix& lr,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate) {
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
accum = rou * accum + ((real)1 - rou) * grad.square();
// learn_rate: sqrt(( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ))
lr = ((accum_update + epsilon) / (accum + epsilon)).sqrt();
// E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
accum_update = rou * accum_update + ((real)1 - rou) * (grad * lr).square();
mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
value += mom;
}
void adagradApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom,
BaseMatrix& accum_buffer,
BaseMatrix& accum,
BaseMatrix& lr,
real epsilon,
real learningRate,
real momentum,
real decayRate) {
accum += grad.square();
lr = (accum_buffer + accum + epsilon).sqrt().reciprocal();
mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
value += mom;
}
void rmspropApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom,
BaseMatrix& g,
BaseMatrix& f,
BaseMatrix& lr,
real accumulatedRou,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate,
bool firstTime) {
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
// For the first time update, make the sum be the current square
// so that the initial estimation of E(g_t^2) will not be too small.
if (firstTime) {
g = accumulatedRou * g + grad.square();
} else {
g = accumulatedRou * g + ((real)1 - rou) * grad.square();
}
// E(f_t) = \rou * E(f_{t-1}) + (1-\rou) * g
f = accumulatedRou * f + ((real)1 - rou) * grad;
// learn_rate = 1/sqrt( ( E(g_t^2) - (E(f_t))^2 + epsilon )
// Basiclly if the sign of the gradient changes more often,
// the learning rate will be decreased.
lr = (g - f.square() + epsilon).sqrt().reciprocal();
mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
value += mom;
}
void decayedAdagradApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom,
BaseMatrix& accum,
BaseMatrix& lr,
real accumulatedRou,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate,
bool firstTime) {
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
// For the first time update, make the sum be the current square
// so that the initial estimation of E(g_t^2) will not be too small.
if (firstTime) {
accum = accumulatedRou * accum + grad.square();
} else {
accum = accumulatedRou * accum + ((real)1 - rou) * grad.square();
}
// learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
// Basiclly if the bigger the magnitude gradient is,
// the smaller the learning rate will be.
lr = (accum + epsilon).sqrt().reciprocal();
mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
value += mom;
}
void adamApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom, // firse moment
BaseMatrix& v, // second moment
real beta1,
real beta2,
real beta1_power,
real beta2_power,
real epsilon,
real learningRate) {
real alpha = learningRate *
std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
// m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
mom = beta1 * mom + ((real)1 - beta1) * grad;
// v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
v = beta2 * v + ((real)1 - beta2) * grad.square();
value -= (mom * alpha) / (v.sqrt() + epsilon);
}
void adamaxApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom, // firse moment
BaseMatrix& u, // weighted infinity norm
real beta1,
real beta2,
int64_t step,
real alpha) {
// m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
mom = beta1 * mom + ((real)1 - beta1) * grad;
// u_t = max(\beta_2*u_{t-1}, abs(g_t))
u = (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs());
// \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
value -= (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u);
}
} // namespace paddle
#endif
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/utils/Logging.h"
#include "BaseMatrix.h"
namespace paddle {
/**
* \brief Sparse Momentum optimizer.
*/
extern void sparseMomentumApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& momU,
BaseMatrix& momV,
real alpha,
real beta,
real gamma,
real tau,
real learningRate);
/**
* \brief AdaDelta optimizer.
*/
extern void adadeltaApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& sum,
BaseMatrix& sum1,
BaseMatrix& mom,
BaseMatrix& lr,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate);
/**
* \brief AdaGrad optimizer.
*/
extern void adagradApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& sum,
BaseMatrix& sum1,
BaseMatrix& mom,
BaseMatrix& lr,
real epsilon,
real learningRate,
real momentum,
real decayRate);
/**
* \brief RMSProp optimizer.
*/
extern void rmspropApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& g,
BaseMatrix& f,
BaseMatrix& mom,
BaseMatrix& lr,
real accumulatedRou,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate,
bool firstTime);
/**
* \brief Decayed AdaGrad optimizer.
*/
extern void decayedAdagradApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom,
BaseMatrix& accum,
BaseMatrix& lr,
real accumulatedRou,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate,
bool firstTime);
/**
* \brief Adam optimizer.
*/
extern void adamApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom,
BaseMatrix& v,
real beta1,
real beta2,
real beta1_power,
real beta2_power,
real epsilon,
real learningRate);
/**
* \brief AdaMax optimizer.
*/
extern void adamaxApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& mom, // firse moment
BaseMatrix& u, // weighted infinity norm
real beta1,
real beta2,
int64_t step,
real alpha);
} // namespace paddle
......@@ -265,6 +265,15 @@ public:
/// print the "idx" element of the Vector
virtual void printOneElement(std::ostream& os, size_t idx) const = 0;
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
if (BaseVector<T>::useGpu_) {
TensorGpuApply<T>(*this, expr);
} else {
TensorCpuApply<T>(*this, expr);
}
}
protected:
friend class GpuVectorT<T>;
friend class CpuVectorT<T>;
......@@ -322,6 +331,11 @@ public:
virtual void print(std::ostream& os, size_t num) const;
virtual void printOneElement(std::ostream& os, size_t idx) const;
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
TensorGpuApply<T>(*this, expr);
}
protected:
virtual void copyTo(CpuVectorT<T>* dest) const;
virtual void copyTo(GpuVectorT<T>* dest) const;
......@@ -385,6 +399,11 @@ public:
virtual T get(size_t pos);
virtual void print(std::ostream& os, size_t num) const;
virtual void printOneElement(std::ostream& os, size_t idx) const;
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
TensorCpuApply<T>(*this, expr);
}
};
template <class T>
......
......@@ -2,6 +2,7 @@
add_simple_unittest(test_ExecViaCpu)
add_simple_unittest(test_SIMDFunctions)
add_simple_unittest(test_TrainingAlgorithm)
add_simple_unittest(test_SparseMatrix)
# TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
......@@ -13,6 +14,21 @@ add_simple_unittest(test_sparseMatrixCompare)
add_simple_unittest(test_perturbation)
add_simple_unittest(test_CpuGpuVector)
add_simple_unittest(test_Allocator)
if(WITH_GPU)
if(COMPILER_SUPPORT_CXX11)
CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu)
link_paddle_test(test_Tensor)
CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
link_paddle_test(test_lazyAssign)
endif()
else()
compile_cu_as_cpp(test_Tensor.cu)
add_unittest(test_Tensor test_Tensor.cu)
compile_cu_as_cpp(test_lazyAssign.cu)
add_unittest(test_lazyAssign test_lazyAssign.cu)
endif(WITH_GPU)
add_simple_unittest(test_FPException)
add_simple_unittest(test_GpuProfiler)
add_simple_unittest(test_BaseMatrix)
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/utils/GlobalConstants.h"
#include "paddle/math/Vector.h"
using namespace paddle; // NOLINT
void SparseMomentumParameterOptimizer(const VectorPtr vecs[],
real alpha,
real beta,
real gamma,
real tau,
real learningRate) {
vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
-alpha * gamma * learningRate);
vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
tau * alpha * gamma * learningRate);
vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
tau / beta + 1.0 / alpha,
*vecs[PARAMETER_MOMENTUM_VT],
1.0 / beta);
}
void AdagradParameterOptimizer(const VectorPtr vecs[],
real epsilon,
real learningRate,
real momentum,
real decayRate) {
vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT],
1.0f);
vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM],
*vecs[PARAMETER_GRADIENT_SQURESUM1]);
vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
*vecs[PARAMETER_MOMENTUM],
*vecs[PARAMETER_LEARNING_RATE],
learningRate,
momentum,
decayRate);
}
void AdaDeltaParameterOptimizer(const VectorPtr vecs[],
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate) {
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
*vecs[PARAMETER_GRADIENT], rou, 1.0f - rou);
// learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) )
vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1],
*vecs[PARAMETER_GRADIENT_SQURESUM],
epsilon,
epsilon);
vecs[PARAMETER_LEARNING_RATE]->sqrt2();
// E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul(
*vecs[PARAMETER_GRADIENT],
*vecs[PARAMETER_LEARNING_RATE],
rou,
1.0f - rou);
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
*vecs[PARAMETER_MOMENTUM],
*vecs[PARAMETER_LEARNING_RATE],
learningRate,
momentum,
decayRate);
}
void RMSPropParameterOptimizer(const VectorPtr vecs[],
real accumulatedRou,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate,
bool firstTime) {
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
// For the first time update, make the sum be the current square
// so that the initial estimation of E(g_t^2) will not be too small.
vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
*vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
// E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g
vecs[PARAMETER_GRADIENT_SQURESUM1]->add(
*vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou);
// learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon )
// Basiclly if the sign of the gradient changes more often,
// the learning rate will be decreased.
vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]);
vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1],
-1.0f);
vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
*vecs[PARAMETER_MOMENTUM],
*vecs[PARAMETER_LEARNING_RATE],
learningRate,
momentum,
decayRate);
}
void DecayedAdagradParameterOptimizer(const VectorPtr vecs[],
real accumulatedRou,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate,
bool firstTime) {
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
// For the first time update, make the sum be the current square
// so that the initial estimation of E(g_t^2) will not be too small.
vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
*vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
// learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
// Basiclly if the bigger the magnitude gradient is,
// the smaller the learning rate will be.
vecs[PARAMETER_LEARNING_RATE]->assign(epsilon);
vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]);
vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
*vecs[PARAMETER_MOMENTUM],
*vecs[PARAMETER_LEARNING_RATE],
learningRate,
momentum,
decayRate);
}
void AdamParameterOptimizer(const VectorPtr vecs[],
real beta1,
real beta2,
real beta1_power,
real beta2_power,
real epsilon,
real learningRate) {
Vector* m = vecs[PARAMETER_MOMENTUM].get();
Vector* g = vecs[PARAMETER_GRADIENT].get();
Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get();
Vector* theta = vecs[PARAMETER_VALUE].get();
// m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
m->add(*g, beta1, 1 - beta1);
// v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
g->square2();
v->add(*g, beta2, 1 - beta2);
// tmp = m_t / ( \sqrt{v_t} + \epsilon )
// \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp
g->sqrt2(*v);
g->dotDiv(*m, *g, 0., epsilon);
real alpha =
learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
theta->add(*theta, 1.0, *g, -alpha);
}
void AdamaxParameterOptimizer(
const VectorPtr vecs[], real beta1, real beta2, int64_t step, real alpha) {
Vector* m = vecs[PARAMETER_MOMENTUM].get();
Vector* g = vecs[PARAMETER_GRADIENT].get();
Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get();
Vector* theta = vecs[PARAMETER_VALUE].get();
// m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
m->add(*g, beta1, 1 - beta1);
// u_t = max(\beta_2*u_{t-1}, abs(g_t))
u->mulScalar(beta2);
g->abs2();
u->max2(*u, *g);
// \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
g->dotDiv(*m, *u);
real learningRate = alpha / (1 - std::pow(beta1, step));
theta->add(*theta, 1.0, *g, -learningRate);
}
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
// Performance Check
#ifdef PADDLE_DISABLE_TIMER
#define EXPRESSION_PERFORMANCE(expression) expression;
#else
#include "paddle/utils/Stat.h"
using namespace paddle; // NOLINT
#define EXPRESSION_PERFORMANCE(expression) \
do { \
char expr[30]; \
strncpy(expr, #expression, 30); \
if (expr[29] != '\0') { \
expr[27] = '.'; \
expr[28] = '.'; \
expr[29] = '\0'; \
} \
expression; \
for (int i = 0; i < 20; i++) { \
REGISTER_TIMER(expr); \
expression; \
} \
LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') \
<< *globalStat.getStat(expr); \
globalStat.reset(); \
} while (0)
#endif
......@@ -37,13 +37,13 @@ TEST(BaseMatrix, void) {
};
compare(&BaseMatrix::neg);
compare(&BaseMatrix::exp);
compare(&BaseMatrix::log);
compare(&BaseMatrix::sqrt);
compare(&BaseMatrix::square);
compare(&BaseMatrix::reciprocal);
compare(&BaseMatrix::abs);
compare(&BaseMatrix::sign);
compare(&BaseMatrix::exp2);
compare(&BaseMatrix::log2);
compare(&BaseMatrix::sqrt2);
compare(&BaseMatrix::square2);
compare(&BaseMatrix::reciprocal2);
compare(&BaseMatrix::abs2);
compare(&BaseMatrix::sign2);
compare(&BaseMatrix::zero);
compare(&BaseMatrix::one);
}
......@@ -59,7 +59,7 @@ TEST(BaseMatrix, real) {
test.cmpWithoutArg<0>(f, height, width);
};
compare(&BaseMatrix::pow);
compare(&BaseMatrix::pow2);
compare(&BaseMatrix::subScalar);
compare(&BaseMatrix::mulScalar);
compare(&BaseMatrix::divScalar);
......@@ -88,21 +88,21 @@ TEST(BaseMatrix, BaseMatrix) {
compare(&BaseMatrix::softreluDerivative);
compare(&BaseMatrix::brelu);
compare(&BaseMatrix::breluDerivative);
compare(&BaseMatrix::square);
compare(&BaseMatrix::square2);
compare(&BaseMatrix::squareDerivative);
compare(&BaseMatrix::tanh);
compare(&BaseMatrix::tanhDerivative);
compare(&BaseMatrix::reciprocal);
compare(&BaseMatrix::reciprocal2);
compare(&BaseMatrix::reciprocalDerivative);
compare(&BaseMatrix::abs);
compare(&BaseMatrix::abs2);
compare(&BaseMatrix::absDerivative);
compare(&BaseMatrix::sigmoid);
compare(&BaseMatrix::sigmoidDerivative);
compare(&BaseMatrix::expDerivative);
compare(&BaseMatrix::sign);
compare(&BaseMatrix::exp);
compare(&BaseMatrix::log);
compare(&BaseMatrix::sqrt);
compare(&BaseMatrix::sign2);
compare(&BaseMatrix::exp2);
compare(&BaseMatrix::log2);
compare(&BaseMatrix::sqrt2);
compare(&BaseMatrix::dotMul);
compare(&BaseMatrix::dotMulSquare);
compare(&BaseMatrix::dotSquareMul);
......@@ -143,7 +143,7 @@ TEST(BaseMatrix, BaseMatrix_real) {
compare(&BaseMatrix::addBias);
compare(&BaseMatrix::add);
compare(&BaseMatrix::sub);
compare(&BaseMatrix::pow);
compare(&BaseMatrix::pow2);
compare(&BaseMatrix::addScalar);
compare(&BaseMatrix::subScalar);
compare(&BaseMatrix::mulScalar);
......@@ -176,7 +176,7 @@ TEST(BaseMatrix, BaseMatrix_BaseMatrix) {
compare(&BaseMatrix::logisticRegressionLoss);
compare(&BaseMatrix::logisticRegressionLossBp);
compare(&BaseMatrix::biggerThan);
compare(&BaseMatrix::max);
compare(&BaseMatrix::max2);
compare(&BaseMatrix::dotMulSquare);
compare(&BaseMatrix::dotSquareSquare);
}
......
此差异已折叠。
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/utils/Util.h"
#include "paddle/math/TrainingAlgorithmOp.h"
#include "OriginalOptimizerApi.h"
#include "TensorCheck.h"
#include "PerfUtils.h"
using namespace paddle; // NOLINT
#ifndef PADDLE_TYPE_DOUBLE
P_DEFINE_double(max_diff, 1e-5, "max diff allowed");
#else
P_DEFINE_double(max_diff, 1e-13, "max diff allowed");
#endif
class SetMaxDiff {
public:
explicit SetMaxDiff(double max_diff) {
max_diff_ = FLAGS_max_diff;
FLAGS_max_diff = max_diff;
}
~SetMaxDiff() { FLAGS_max_diff = max_diff_; }
private:
double max_diff_;
};
#define COPY_VECTOR_TO_CPU(cpuVec, vector) \
do { \
if (vector->useGpu()) { \
cpuVec = Vector::create(vector->getSize(), false); \
cpuVec->copyFrom(*vector); \
} else { \
cpuVec = vector; \
} \
} while (0)
int VectorCheckErr(const Vector& vector1, const Vector& vector2) {
CHECK(vector1.getSize() == vector2.getSize());
const real* data1 = vector1.getData();
const real* data2 = vector2.getData();
size_t size = vector1.getSize();
int count = 0;
for (size_t i = 0; i < size; i++) {
real a = data1[i];
real b = data2[i];
if (fabs(a - b) > FLAGS_max_diff) {
if ((fabsf(a - b) / fabsf(a)) > (FLAGS_max_diff / 10.0f)) {
count++;
}
}
}
return count;
}
int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) {
VectorPtr tmp1;
VectorPtr tmp2;
COPY_VECTOR_TO_CPU(tmp1, vector1);
COPY_VECTOR_TO_CPU(tmp2, vector2);
return VectorCheckErr(*tmp1, *tmp2);
}
#ifdef PADDLE_DISABLE_TIMER
#define CHECK_VECTORPTR(vector1, vector2) \
EXPECT_EQ(VectorCheckErr(vector1, vector2), 0)
#else
#define CHECK_VECTORPTR(vector1, vector2)
#endif
typedef std::function<void(size_t size, bool useGpu)> testMatrixFunc;
void testCase(testMatrixFunc matrixFunc) {
#ifndef PADDLE_ONLY_CPU
for (auto useGpu : {false, true}) {
#else
for (auto useGpu : {false}) {
#endif
for (auto size : {1,
32,
64,
128,
512,
1024,
4096,
32768,
65536,
131072,
262144,
524288,
1048576,
2097152}) {
LOG(INFO) << " size=" << size << " useGpu=" << useGpu;
matrixFunc(size, useGpu);
}
}
}
#define INIT_VECTOR(vec1, vec2, type, size, useGpu) \
vec1[type] = Vector::create(size, useGpu); \
vec2[type] = Vector::create(size, useGpu); \
vec1[type]->rand(); \
vec2[type]->copyFrom(*vec1[type]);
void testAdagrad(size_t size, bool useGpu) {
VectorPtr bufs1[NUM_PARAMETER_TYPES];
VectorPtr bufs2[NUM_PARAMETER_TYPES];
INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT
real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT
real momentum = (real)rand() / (real)RAND_MAX; // NOLINT
real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT
EXPRESSION_PERFORMANCE(AdagradParameterOptimizer(
bufs1, epsilon, learningRate, momentum, decayRate));
BaseMatrix& value = *bufs2[PARAMETER_VALUE];
BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
BaseMatrix& accum_buffer = *bufs2[PARAMETER_GRADIENT_SQURESUM];
BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
EXPRESSION_PERFORMANCE(adagradApply(value,
grad,
mom,
accum_buffer,
accum,
lr,
epsilon,
learningRate,
momentum,
decayRate));
CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
bufs2[PARAMETER_GRADIENT_SQURESUM1]);
CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
bufs2[PARAMETER_LEARNING_RATE]);
}
TEST(Training, Adagrad) { testCase(testAdagrad); }
void testAdaDelta(size_t size, bool useGpu) {
VectorPtr bufs1[NUM_PARAMETER_TYPES];
VectorPtr bufs2[NUM_PARAMETER_TYPES];
INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
real rou = (real)rand() / (real)RAND_MAX; // NOLINT
real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT
real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT
real momentum = (real)rand() / (real)RAND_MAX; // NOLINT
real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT
EXPRESSION_PERFORMANCE(AdaDeltaParameterOptimizer(
bufs1, rou, epsilon, learningRate, momentum, decayRate));
BaseMatrix& value = *bufs2[PARAMETER_VALUE];
BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
BaseMatrix& accum_update = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
EXPRESSION_PERFORMANCE(adadeltaApply(value,
grad,
mom,
accum,
accum_update,
lr,
rou,
epsilon,
learningRate,
momentum,
decayRate));
CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
bufs2[PARAMETER_GRADIENT_SQURESUM]);
CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
bufs2[PARAMETER_GRADIENT_SQURESUM1]);
CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
bufs2[PARAMETER_LEARNING_RATE]);
}
TEST(Training, AdaDelta) { testCase(testAdaDelta); }
template <bool isFirstTime>
void testRMSProp(size_t size, bool useGpu) {
VectorPtr bufs1[NUM_PARAMETER_TYPES];
VectorPtr bufs2[NUM_PARAMETER_TYPES];
INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
/* make sure 'g - f.square()' greater than 0 */
bufs1[PARAMETER_GRADIENT_SQURESUM]->add(1.0);
bufs2[PARAMETER_GRADIENT_SQURESUM]->copyFrom(
*bufs1[PARAMETER_GRADIENT_SQURESUM]);
real rou = (real)rand() / (real)RAND_MAX; // NOLINT
real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT
real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT
real momentum = (real)rand() / (real)RAND_MAX; // NOLINT
real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT
real accumulatedRou = rou;
EXPRESSION_PERFORMANCE(RMSPropParameterOptimizer(bufs1,
accumulatedRou,
rou,
epsilon,
learningRate,
momentum,
decayRate,
isFirstTime));
BaseMatrix& value = *bufs2[PARAMETER_VALUE];
BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
BaseMatrix& sum1 = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
EXPRESSION_PERFORMANCE(rmspropApply(value,
grad,
mom,
sum,
sum1,
lr,
accumulatedRou,
rou,
epsilon,
learningRate,
momentum,
decayRate,
isFirstTime));
CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
bufs2[PARAMETER_GRADIENT_SQURESUM]);
CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
bufs2[PARAMETER_GRADIENT_SQURESUM1]);
CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
bufs2[PARAMETER_LEARNING_RATE]);
}
TEST(Training, RMSProp) {
testCase(testRMSProp<true>);
testCase(testRMSProp<false>);
}
template <bool isFirstTime>
void testDecayedAdagrad(size_t size, bool useGpu) {
VectorPtr bufs1[NUM_PARAMETER_TYPES];
VectorPtr bufs2[NUM_PARAMETER_TYPES];
INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
real rou = (real)rand() / (real)RAND_MAX; // NOLINT
real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT
real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT
real momentum = (real)rand() / (real)RAND_MAX; // NOLINT
real decayRate = (real)rand() / (real)RAND_MAX; // NOLINT
real accumulatedRou = rou;
if (isFirstTime) {
bufs1[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
bufs2[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
}
EXPRESSION_PERFORMANCE(DecayedAdagradParameterOptimizer(bufs1,
accumulatedRou,
rou,
epsilon,
learningRate,
momentum,
decayRate,
isFirstTime));
BaseMatrix& value = *bufs2[PARAMETER_VALUE];
BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
EXPRESSION_PERFORMANCE(decayedAdagradApply(value,
grad,
mom,
sum,
lr,
accumulatedRou,
rou,
epsilon,
learningRate,
momentum,
decayRate,
isFirstTime));
CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
bufs2[PARAMETER_GRADIENT_SQURESUM]);
CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
bufs2[PARAMETER_LEARNING_RATE]);
}
TEST(Training, DecayedAdagrad) {
testCase(testDecayedAdagrad<false>);
testCase(testDecayedAdagrad<true>);
}
void testAdam(size_t size, bool useGpu) {
VectorPtr bufs1[NUM_PARAMETER_TYPES];
VectorPtr bufs2[NUM_PARAMETER_TYPES];
INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_SECOND_MOMENTUM, size, useGpu);
real beta1 = (real)rand() / (real)RAND_MAX; // NOLINT
real beta2 = (real)rand() / (real)RAND_MAX; // NOLINT
real beta1_power = (real)rand() / (real)RAND_MAX; // NOLINT
real beta2_power = (real)rand() / (real)RAND_MAX; // NOLINT
real epsilon = (real)rand() / (real)RAND_MAX; // NOLINT
real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT
EXPRESSION_PERFORMANCE(AdamParameterOptimizer(
bufs1, beta1, beta2, beta1_power, beta2_power, epsilon, learningRate));
BaseMatrix& value = *bufs2[PARAMETER_VALUE];
BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
BaseMatrix& v = *bufs2[PARAMETER_SECOND_MOMENTUM];
EXPRESSION_PERFORMANCE(adamApply(value,
grad,
mom,
v,
beta1,
beta2,
beta1_power,
beta2_power,
epsilon,
learningRate));
CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
CHECK_VECTORPTR(bufs1[PARAMETER_SECOND_MOMENTUM],
bufs2[PARAMETER_SECOND_MOMENTUM]);
}
TEST(Training, Adam) { testCase(testAdam); }
void testAdamax(size_t size, bool useGpu) {
VectorPtr bufs1[NUM_PARAMETER_TYPES];
VectorPtr bufs2[NUM_PARAMETER_TYPES];
INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_WEIGHTED_INFINITY_NORM, size, useGpu);
real beta1 = (real)rand() / (real)RAND_MAX; // NOLINT
real beta2 = (real)rand() / (real)RAND_MAX; // NOLINT
real alpha = (real)rand() / (real)RAND_MAX; // NOLINT
int64_t step = 2;
EXPRESSION_PERFORMANCE(
AdamaxParameterOptimizer(bufs1, beta1, beta2, step, alpha));
BaseMatrix& value = *bufs2[PARAMETER_VALUE];
BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
BaseMatrix& u = *bufs2[PARAMETER_WEIGHTED_INFINITY_NORM];
EXPRESSION_PERFORMANCE(
adamaxApply(value, grad, mom, u, beta1, beta2, step, alpha));
CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
CHECK_VECTORPTR(bufs1[PARAMETER_WEIGHTED_INFINITY_NORM],
bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]);
}
TEST(Training, Adamax) {
#ifndef PADDLE_TYPE_DOUBLE
SetMaxDiff diff(1e-4);
#endif
testCase(testAdamax);
}
void testSparseMomentum(size_t size, bool useGpu) {
VectorPtr bufs1[NUM_PARAMETER_TYPES];
VectorPtr bufs2[NUM_PARAMETER_TYPES];
INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_UT, size, useGpu);
INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_VT, size, useGpu);
real alpha = (real)rand() / (real)RAND_MAX; // NOLINT
real beta = (real)rand() / (real)RAND_MAX; // NOLINT
real gamma = (real)rand() / (real)RAND_MAX; // NOLINT
real tau = (real)rand() / (real)RAND_MAX; // NOLINT
real learningRate = (real)rand() / (real)RAND_MAX; // NOLINT
EXPRESSION_PERFORMANCE(SparseMomentumParameterOptimizer(
bufs1, alpha, beta, gamma, tau, learningRate));
BaseMatrix& value = *bufs2[PARAMETER_VALUE];
BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
BaseMatrix& momU = *bufs2[PARAMETER_MOMENTUM_UT];
BaseMatrix& momV = *bufs2[PARAMETER_MOMENTUM_VT];
EXPRESSION_PERFORMANCE(sparseMomentumApply(
value, grad, momU, momV, alpha, beta, gamma, tau, learningRate));
CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_UT], bufs2[PARAMETER_MOMENTUM_UT]);
CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_VT], bufs2[PARAMETER_MOMENTUM_VT]);
}
TEST(Training, SparseMomentum) { testCase(testSparseMomentum); }
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
hl_start();
hl_init(FLAGS_gpu_id);
return RUN_ALL_TESTS();
}
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/math/Matrix.h"
#include "paddle/math/TensorAssign.h"
#include "TensorCheck.h"
#include "PerfUtils.h"
using paddle::BaseMatrix;
using paddle::CpuMatrix;
using paddle::GpuMatrix;
using autotest::TensorCheckEqual;
using autotest::TensorCheckErr;
typedef std::function<void(int height, int width)> testMatrixFunc;
void testMatrixCase(testMatrixFunc matrixFunc) {
for (auto height : {1}) {
for (auto width : {1, 32, 64, 128, 512, 1024, 4096, 32768, 65536, 131072,
262144, 524288, 1048576, 2097152, 4194304, 8388608}) {
matrixFunc(height, width);
}
}
}
template<typename Tensor>
void testLazyAssign(int height, int width) {
Tensor A1(height, width);
Tensor A2(height, width);
Tensor B(height, width);
Tensor C(height, width);
Tensor D(height, width);
A1.randomizeUniform();
B.randomizeUniform();
C.randomizeUniform();
D.randomizeUniform();
A2.copyFrom(A1);
EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;);
EXPRESSION_PERFORMANCE(
auto expr1 = A2.lazyAssign(B + C);
auto expr2 = A2.lazyAssign(A2 * D);
AssignEvaluate(expr1, expr2););
TensorCheckErr(A1, A2);
}
TEST(lazyAssign, CPU) {
testMatrixCase(testLazyAssign<CpuMatrix>);
}
#ifndef PADDLE_ONLY_CPU
TEST(lazyAssign, GPU) {
testMatrixCase(testLazyAssign<GpuMatrix>);
}
#endif
template<typename Tensor>
void sgdUpdateTensor(Tensor& A, Tensor& B, Tensor& C, Tensor& D,
real p1, real p2, real p3) {
C = C * p2 - D * (B + A * p3) * p1;
A += C;
}
void sgdUpdateLazyAssign(BaseMatrix& A, BaseMatrix& B,
BaseMatrix& C, BaseMatrix& D,
real p1, real p2, real p3) {
auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1);
auto expr2 = A.lazyAssign(A + C);
AssignEvaluate(expr1, expr2);
}
template<typename Tensor>
void testSgdUpdate(int height, int width) {
Tensor A1(height, width);
Tensor A2(height, width);
Tensor A3(height, width);
A1.randomizeUniform();
A2.copyFrom(A1);
A3.copyFrom(A1);
Tensor B(height, width);
B.randomizeUniform();
Tensor C1(height, width);
Tensor C2(height, width);
Tensor C3(height, width);
C1.randomizeUniform();
C2.copyFrom(C1);
C3.copyFrom(C1);
Tensor D(height, width);
D.randomizeUniform();
real p1 = 0.2;
real p2 = 0.3;
real p3 = 0.5;
/**
* c = p2 * c - p1 * (b + p3 * a);
* a = a + c;
*/
// BaseMatrix API
EXPRESSION_PERFORMANCE(
A1.sgdUpdate(B, C1, D, p1, p2, p3););
// Tensor expression
EXPRESSION_PERFORMANCE(
sgdUpdateTensor(A2, B, C2, D, p1, p2, p3));
// lazyAssign
EXPRESSION_PERFORMANCE(
sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3));
TensorCheckErr(A1, A2);
TensorCheckErr(A1, A3);
TensorCheckErr(C1, C2);
TensorCheckErr(C1, C3);
}
TEST(sgdUpdate, CPU) {
testMatrixCase(testSgdUpdate<CpuMatrix>);
}
#ifndef PADDLE_ONLY_CPU
TEST(sgdUpdate, GPU) {
testMatrixCase(testSgdUpdate<GpuMatrix>);
}
#endif
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
hl_start();
hl_init(0);
return RUN_ALL_TESTS();
}
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/utils/Util.h"
#include "paddle/utils/Flags.h"
#include "paddle/math/TrainingAlgorithmOp.h"
#include "FirstOrderOptimizer.h"
#include <cmath>
......@@ -115,19 +115,28 @@ void SparseMomentumParameterOptimizer::finishBatch() {
void AdagradParameterOptimizer::update(const VectorPtr vecs[],
const ParameterConfig& config,
size_t sparseId) const {
vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT],
1.0f);
vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM],
*vecs[PARAMETER_GRADIENT_SQURESUM1]);
vecs[PARAMETER_LEARNING_RATE]->add(optConfig_.ada_epsilon());
vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
*vecs[PARAMETER_MOMENTUM],
*vecs[PARAMETER_LEARNING_RATE],
learningRate_ * config.learning_rate(),
config.momentum(),
applyDecay_ ? config.decay_rate() : 0);
BaseMatrix& value = *vecs[PARAMETER_VALUE];
BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
BaseMatrix& accum_buffer = *vecs[PARAMETER_GRADIENT_SQURESUM];
BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM1];
BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
real epsilon = optConfig_.ada_epsilon();
real learningRate = learningRate_ * config.learning_rate();
real momentum = config.momentum();
real decayRate = applyDecay_ ? config.decay_rate() : 0;
adagradApply(value,
grad,
mom,
accum_buffer,
accum,
lr,
epsilon,
learningRate,
momentum,
decayRate);
}
ParameterOptimizer::TraverseCallback
......@@ -152,37 +161,41 @@ void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[],
const ParameterConfig& config,
size_t sparseId) const {
CHECK(sparseId == -1LU) << "Sparse update is not supported";
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
*vecs[PARAMETER_GRADIENT], rou_, 1.0f - rou_);
// learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) )
vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1],
*vecs[PARAMETER_GRADIENT_SQURESUM],
epsilon_,
epsilon_);
vecs[PARAMETER_LEARNING_RATE]->sqrt();
// E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul(
*vecs[PARAMETER_GRADIENT],
*vecs[PARAMETER_LEARNING_RATE],
rou_,
1.0f - rou_);
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
*vecs[PARAMETER_MOMENTUM],
*vecs[PARAMETER_LEARNING_RATE],
learningRate_ * config.learning_rate(),
config.momentum(),
applyDecay_ ? config.decay_rate() : 0);
BaseMatrix& value = *vecs[PARAMETER_VALUE];
BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM];
BaseMatrix& accum_update = *vecs[PARAMETER_GRADIENT_SQURESUM1];
BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
real learningRate = learningRate_ * config.learning_rate();
real momentum = config.momentum();
real decayRate = applyDecay_ ? config.decay_rate() : 0;
adadeltaApply(value,
grad,
mom,
accum,
accum_update,
lr,
rou_,
epsilon_,
learningRate,
momentum,
decayRate);
}
void RMSPropParameterOptimizer::update(const VectorPtr vecs[],
const ParameterConfig& config,
size_t sparseId) const {
real accumulatedRou = rou_;
BaseMatrix& value = *vecs[PARAMETER_VALUE];
BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
BaseMatrix& sum1 = *vecs[PARAMETER_GRADIENT_SQURESUM1];
BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
real accumulatedRou = rou_;
bool firstTime = timer_ == 0;
if (sparseId != -1LU) {
CHECK_LT(sparseId, t0Vec_.size());
......@@ -191,40 +204,36 @@ void RMSPropParameterOptimizer::update(const VectorPtr vecs[],
t0Vec_[sparseId] = timer_ + 1;
}
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
// For the first time update, make the sum be the current square
// so that the initial estimation of E(g_t^2) will not be too small.
vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
*vecs[PARAMETER_GRADIENT],
accumulatedRou,
firstTime ? 1.0f : 1.0f - rou_);
// E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g
vecs[PARAMETER_GRADIENT_SQURESUM1]->add(
*vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou_);
// learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon )
// Basiclly if the sign of the gradient changes more often,
// the learning rate will be decreased.
vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]);
vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1],
-1.0f);
vecs[PARAMETER_LEARNING_RATE]->add(optConfig_.ada_epsilon());
vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
*vecs[PARAMETER_MOMENTUM],
*vecs[PARAMETER_LEARNING_RATE],
learningRate_ * config.learning_rate(),
config.momentum(),
applyDecay_ ? config.decay_rate() : 0);
real epsilon = optConfig_.ada_epsilon();
real learningRate = learningRate_ * config.learning_rate();
real momentum = config.momentum();
real decayRate = applyDecay_ ? config.decay_rate() : 0;
rmspropApply(value,
grad,
mom,
sum,
sum1,
lr,
accumulatedRou,
rou_,
epsilon,
learningRate,
momentum,
decayRate,
firstTime);
}
void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[],
const ParameterConfig& config,
size_t sparseId) const {
real accumulatedRou = rou_;
BaseMatrix& value = *vecs[PARAMETER_VALUE];
BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
real accumulatedRou = rou_;
bool firstTime = timer_ == 0;
if (sparseId != -1LU) {
CHECK_LT(sparseId, t0Vec_.size());
......@@ -233,77 +242,62 @@ void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[],
t0Vec_[sparseId] = timer_ + 1;
}
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
// For the first time update, make the sum be the current square
// so that the initial estimation of E(g_t^2) will not be too small.
vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
*vecs[PARAMETER_GRADIENT],
accumulatedRou,
firstTime ? 1.0f : 1.0f - rou_);
// learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
// Basiclly if the bigger the magnitude gradient is,
// the smaller the learning rate will be.
vecs[PARAMETER_LEARNING_RATE]->assign(optConfig_.ada_epsilon());
vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]);
vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
*vecs[PARAMETER_MOMENTUM],
*vecs[PARAMETER_LEARNING_RATE],
learningRate_ * config.learning_rate(),
config.momentum(),
applyDecay_ ? config.decay_rate() : 0);
real epsilon = optConfig_.ada_epsilon();
real learningRate = learningRate_ * config.learning_rate();
real momentum = config.momentum();
real decayRate = applyDecay_ ? config.decay_rate() : 0;
decayedAdagradApply(value,
grad,
mom,
sum,
lr,
accumulatedRou,
rou_,
epsilon,
learningRate,
momentum,
decayRate,
firstTime);
}
void AdamParameterOptimizer::update(const VectorPtr vecs[],
const ParameterConfig& config,
size_t sparseId) const {
CHECK(sparseId == -1UL) << "Sparse update is not supported";
Vector* m = vecs[PARAMETER_MOMENTUM].get();
Vector* g = vecs[PARAMETER_GRADIENT].get();
Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get();
Vector* theta = vecs[PARAMETER_VALUE].get();
// m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
m->add(*g, beta1_, 1 - beta1_);
// v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
g->square();
v->add(*g, beta2_, 1 - beta2_);
// tmp = m_t / ( \sqrt{v_t} + \epsilon )
// \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp
g->sqrt(*v);
g->dotDiv(*m, *g, 0., epsilon_);
real alpha = config.learning_rate() * learningRate_;
alpha = alpha * std::sqrt(1 - std::pow(beta2_, step_)) /
(1 - std::pow(beta1_, step_));
theta->add(*theta, 1.0, *g, -alpha);
real beta1_power = std::pow(beta1_, step_);
real beta2_power = std::pow(beta2_, step_);
real learningRate = config.learning_rate() * learningRate_;
BaseMatrix& value = *vecs[PARAMETER_VALUE];
BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
BaseMatrix& v = *vecs[PARAMETER_SECOND_MOMENTUM];
adamApply(value,
grad,
mom,
v,
beta1_,
beta2_,
beta1_power,
beta2_power,
epsilon_,
learningRate);
}
void AdamaxParameterOptimizer::update(const VectorPtr vecs[],
const ParameterConfig& config,
size_t sparseId) const {
CHECK(sparseId == -1UL) << "Sparse update is not supported";
Vector* m = vecs[PARAMETER_MOMENTUM].get();
Vector* g = vecs[PARAMETER_GRADIENT].get();
Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get();
Vector* theta = vecs[PARAMETER_VALUE].get();
// m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
m->add(*g, beta1_, 1 - beta1_);
real learningRate = config.learning_rate() * learningRate_;
// u_t = max(\beta_2*u_{t-1}, abs(g_t))
u->mulScalar(beta2_);
g->abs();
u->max(*u, *g);
BaseMatrix& value = *vecs[PARAMETER_VALUE];
BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
BaseMatrix& u = *vecs[PARAMETER_WEIGHTED_INFINITY_NORM];
// \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
g->dotDiv(*m, *u);
real learningRate = config.learning_rate() * learningRate_;
learningRate /= (1 - std::pow(beta1_, step_));
theta->add(*theta, 1.0, *g, -learningRate);
adamaxApply(value, grad, mom, u, beta1_, beta2_, step_, learningRate);
}
void OptimizerWithGradientClipping::update(const VectorPtr vecs[],
......
FROM ubuntu:14.04
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
RUN apt-get update && \
apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
RUN apt-get update \
&& apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip \
python-protobuf python-numpy python-dev swig openssh-server \
wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
sed grep graphviz libjpeg-dev zlib1g-dev doxygen && \
apt-get clean -y
sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
clang-3.8 llvm-3.8 libclang-3.8-dev \
&& apt-get clean -y
RUN pip install -U BeautifulSoup docopt PyYAML pillow \
sphinx sphinx_rtd_theme breathe recommonmark
......@@ -25,6 +26,7 @@ ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
RUN mkdir /paddle
COPY . /paddle/
RUN /paddle/paddle/scripts/docker/build.sh
VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
RUN pip install /usr/local/opt/paddle/share/wheels/*.whl
......
FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
RUN apt-get update && \
apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
RUN apt-get update \
&& apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip \
python-protobuf python-numpy python-dev swig openssh-server \
wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
sed grep graphviz libjpeg-dev zlib1g-dev doxygen && \
apt-get clean -y
sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
clang-3.8 llvm-3.8 libclang-3.8-dev \
&& apt-get clean -y
RUN pip install -U BeautifulSoup docopt PyYAML pillow \
sphinx sphinx_rtd_theme breathe recommonmark
......@@ -25,6 +26,7 @@ ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
RUN mkdir /paddle
COPY . /paddle/
RUN /paddle/paddle/scripts/docker/build.sh
VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
RUN pip install /usr/local/opt/paddle/share/wheels/*.whl
......
......@@ -20,8 +20,28 @@ cmake .. \
-DWITH_AVX=${WITH_AVX} \
-DWITH_SWIG_PY=ON \
-DCUDNN_ROOT=/usr/ \
-DWITH_STYLE_CHECK=OFF
-DWITH_STYLE_CHECK=OFF \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
make -j `nproc`
make install
# Install woboq_codebrowser.
git clone https://github.com/woboq/woboq_codebrowser /woboq
cd /woboq
cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-DCMAKE_BUILD_TYPE=Release \
.
make
export WOBOQ_OUT=/usr/share/nginx/html/paddle
export BUILD_DIR=/paddle/build
mkdir -p $WOBOQ_OUT
cp -rv /woboq/data $WOBOQ_OUT/../data
/woboq/generator/codebrowser_generator \
-b /paddle/build \
-a \
-o $WOBOQ_OUT \
-p paddle:/paddle
/woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
trap : 0
......@@ -17,22 +17,22 @@ limitations under the License. */
#include <fenv.h>
#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <iostream>
#include <limits>
#include <sstream>
#include <google/protobuf/text_format.h>
#include "paddle/utils/GlobalConstants.h"
#include "paddle/utils/PythonUtil.h"
#include "paddle/utils/Stat.h"
#include "paddle/utils/Util.h"
#include "paddle/utils/GlobalConstants.h"
#include "TesterConfig.h"
#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
#include "paddle/gserver/layers/ValidationLayer.h"
#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
#include "TesterConfig.h"
namespace paddle {
......@@ -66,6 +66,9 @@ Tester::Tester(const std::shared_ptr<TrainerConfigHelper>& config,
}
void Tester::startTestPeriod() {
if (testDataProvider_) {
testDataProvider_->reset();
}
testEvaluator_->start();
testContext_.cost = 0;
testContext_.numSamples = 0;
......@@ -87,27 +90,18 @@ void Tester::testOneDataBatch(const DataBatch& dataBatch,
void Tester::testOnePeriod() {
DataBatch dataBatch;
int64_t batchSize = config_->getOptConfig().batch_size();
int batches = std::numeric_limits<int>::max();
std::vector<Argument> outArgs;
startTestPeriod();
for (int i = 0; i < batches; ++i) {
int num = testDataProvider_->getNextBatch(batchSize, &dataBatch);
if (num == 0) {
testDataProvider_->reset();
if (intconfig_->prevBatchState) {
gradientMachine_->resetState();
}
break;
}
while (testDataProvider_->getNextBatch(batchSize, &dataBatch) != 0) {
testOneDataBatch(dataBatch, &outArgs);
}
finishTestPeriod();
}
void Tester::finishTestPeriod() {
if (intconfig_->prevBatchState) {
gradientMachine_->resetState();
}
testEvaluator_->finish();
CHECK_GT(testContext_.numSamples, 0)
<< "There is no samples in your test batch. Possibly "
......
......@@ -17,36 +17,38 @@ limitations under the License. */
#include <fenv.h>
#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <iostream>
#include <limits>
#include <sstream>
#include <google/protobuf/text_format.h>
#include "paddle/utils/Excepts.h"
#include "paddle/utils/GlobalConstants.h"
#include "paddle/utils/PythonUtil.h"
#include "paddle/utils/Stat.h"
#include "paddle/utils/Util.h"
#include "paddle/utils/Excepts.h"
#include "paddle/utils/GlobalConstants.h"
#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
#include "paddle/gserver/layers/ValidationLayer.h"
#include "RemoteParameterUpdater.h"
#include "TesterConfig.h"
#include "ThreadParameterUpdater.h"
#include "RemoteParameterUpdater.h"
#include "TrainerConfigHelper.h"
#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
#include "paddle/gserver/layers/ValidationLayer.h"
P_DEFINE_string(config, "", "Trainer config file");
P_DEFINE_int32(test_period, 0,
P_DEFINE_int32(test_period,
0,
"if equal 0, do test on all test data at the end of "
"each pass. While if equal non-zero, do test on all test "
"data every test_period batches");
P_DEFINE_bool(test_all_data_in_one_period, false,
"This option was deprecated, since we will always do "
"test on all test set ");
P_DEFINE_bool(test_all_data_in_one_period,
false,
"This option was deprecated, since we will always do "
"test on all test set ");
P_DEFINE_bool(local, true, "Train in local mode or not");
......@@ -392,10 +394,6 @@ void Trainer::startTrain() {
dataProvider_->reset();
}
if (this->testDataProvider_) {
this->testDataProvider_->reset();
}
trainerInternal_.getGradientMachine()->start(*config_, dataProvider_);
}
......@@ -630,16 +628,14 @@ void Trainer::test() { tester_->test(); }
std::unique_ptr<TesterConfig> Trainer::createTesterConfig() {
TesterConfig* conf = new TesterConfig;
if (FLAGS_test_period) {
LOG(WARNING)
<< "The meaning of --test_period is changed: "
<< "if equal 0, do test on all test data at the end of "
<< "each pass. While if equal non-zero, do test on all test "
<< "data every test_period batches ";
LOG(WARNING) << "The meaning of --test_period is changed: "
<< "if equal 0, do test on all test data at the end of "
<< "each pass. While if equal non-zero, do test on all test "
<< "data every test_period batches ";
}
if (FLAGS_test_all_data_in_one_period) {
LOG(WARNING)
<< "--test_all_data_in_one_period was deprecated, since "
<< "we will always do test on all test set ";
LOG(WARNING) << "--test_all_data_in_one_period was deprecated, since "
<< "we will always do test on all test set ";
}
conf->testPeriod = FLAGS_test_period;
conf->prevBatchState = FLAGS_prev_batch_state;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册