/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

// NOTE(review): the three bare #include directives below carry no header
// name, and every `template` keyword in this file is missing its parameter
// list. It looks like all angle-bracket text was stripped from this copy;
// restore it from the upstream file before building.
#include
#include
#include
#include "BaseMatrix.h"
#include "hl_matrix_ops.cuh"
#include "hl_matrix_base.cuh"
#include "hl_matrix_apply.cuh"
#include "SIMDFunctions.h"
#include "MathFunctions.h"

namespace paddle {

// Message streamed into every CHECK(!isSparse()) failure in this file.
const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported.";

/**
 * @brief Apply a unary functor to the whole matrix.
 *
 * Forwards to the sub-matrix overload with a zero offset and the full
 * height_ x width_ extent.
 *
 * @param op  unary functor handed to the apply kernel.
 * @return always 0.
 */
template
template
int BaseMatrixT::applyUnary(Op op) {
  MatrixOffset offset(0, 0);
  applyUnary(op, height_, width_, offset);
  return 0;
}

/**
 * @brief Apply a unary functor to a numRows x numCols window of this matrix.
 *
 * @param op       unary functor handed to hl_{gpu,cpu}_apply_unary_op.
 * @param numRows  number of rows of the window.
 * @param numCols  number of columns of the window.
 * @param offset   window origin; only aRow_/aCol_ are read here.
 * @return always 0. CHECK-fails if this matrix is sparse or if the window
 *         exceeds height_/width_.
 */
template
template
int BaseMatrixT::applyUnary(Op op, int numRows, int numCols,
                            MatrixOffset& offset) {
  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
  int dimM = numRows;
  int dimN = numCols;
  int lda = stride_;

  T* A = data_;
  // Macro is defined outside this file; presumably advances A to the
  // (aRow_, aCol_) element -- confirm against its definition.
  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_,
                           offset.aRow_);

  // The window must fit inside this matrix.
  CHECK_LE(dimM + offset.aRow_, this->height_);
  CHECK_LE(dimN + offset.aCol_, this->width_);
  // Dispatch on where this matrix lives.
  if (true == useGpu_) {
    hl_gpu_apply_unary_op(op, A, dimM, dimN, lda);
  } else {
    hl_cpu_apply_unary_op(op, A, dimM, dimN, lda);
  }
  return 0;
}

/**
 * @brief Apply a binary functor to this matrix and b over their full extent.
 *
 * @param op  binary functor handed to the apply kernel.
 * @param b   second operand; must have the same height_ and width_ as this.
 * @return always 0. CHECK-fails when the dimensions differ.
 */
template
template
int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b) {
  CHECK(height_ == b.height_ && width_ == b.width_)
      << "Matrix dimensions are not equal";

  MatrixOffset offset(0, 0, 0, 0);
  applyBinary(op, b, height_, width_, offset);
  return 0;
}

/**
 * @brief Apply a binary functor to a window of this matrix and of b.
 *
 * Forwards to the seven-argument overload with both vector flags set to
 * false_type(), i.e. b is bounds-checked as a full matrix.
 *
 * @param op       binary functor.
 * @param b        second operand.
 * @param numRows  number of rows of the window.
 * @param numCols  number of columns of the window.
 * @param offset   window origins for this matrix and for b.
 * @return always 0.
 */
template
template
int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols,
                             MatrixOffset& offset) {
  applyBinary(op, b, numRows, numCols, offset, false_type(), false_type());
  return 0;
}
// NOTE(review): every `template` keyword in this section is missing its
// parameter list, and explicit template arguments on calls (e.g. on the
// hl_*_apply_* kernels) are absent. Angle-bracket text appears to have been
// stripped from this copy; restore it from the upstream file before building.

/**
 * @brief Core binary apply: runs op over a numRows x numCols window of this
 *        matrix and of b.
 *
 * The two trailing unnamed tag parameters select how b is bounds-checked:
 *  - neither set: b is checked in both rows and columns;
 *  - bAsRowVector only: only b's column extent is checked;
 *  - bAsColVector only: only b's row extent is checked;
 *  - both set: no extent check is performed on b.
 *
 * @param op       binary functor handed to hl_{gpu,cpu}_apply_binary_op.
 * @param b        second operand; must live on the same device as this.
 * @param numRows  number of rows of the window.
 * @param numCols  number of columns of the window.
 * @param offset   window origins (aRow_/aCol_ for this, bRow_/bCol_ for b).
 * @return always 0. CHECK-fails on sparse operands, device mismatch or an
 *         out-of-range window.
 */
template
template
int BaseMatrixT::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols,
                             MatrixOffset& offset, bAsRowVector, bAsColVector) {
  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch";

  int dimM = numRows;
  int dimN = numCols;
  int lda = stride_;
  int ldb = b.stride_;

  T* A = data_;
  T* B = b.data_;
  // Macro defined outside this file; presumably moves each pointer to its
  // window origin -- confirm against its definition.
  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_,
                           offset.aRow_);
  CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_,
                           offset.bRow_);

  CHECK_LE(dimM + offset.aRow_, this->height_);
  CHECK_LE(dimN + offset.aCol_, this->width_);
  if (!bAsRowVector::value && !bAsColVector::value) {
    CHECK_LE(dimM + offset.bRow_, b.height_);
    CHECK_LE(dimN + offset.bCol_, b.width_);
  } else if (bAsRowVector::value && !bAsColVector::value) {
    CHECK_LE(dimN + offset.bCol_, b.width_);
  } else if (!bAsRowVector::value && bAsColVector::value) {
    CHECK_LE(dimM + offset.bRow_, b.height_);
  } else {
    // Both flags set: b's extent is intentionally left unchecked here.
  }
  if (true == useGpu_) {
    hl_gpu_apply_binary_op(
        op, A, B, dimM, dimN, lda, ldb);
  } else {
    hl_cpu_apply_binary_op(
        op, A, B, dimM, dimN, lda, ldb);
  }

  return 0;
}

/**
 * @brief Apply a ternary functor to this, b and c over their full extent.
 *
 * @param op  ternary functor.
 * @param b   second operand; same height_/width_ as this.
 * @param c   third operand; same height_/width_ as this.
 * @return always 0. CHECK-fails when any dimension differs.
 */
template
template
int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) {
  CHECK_EQ(height_, b.height_);
  CHECK_EQ(width_, b.width_);
  CHECK_EQ(height_, c.height_);
  CHECK_EQ(width_, c.width_);

  MatrixOffset offset(0, 0, 0, 0, 0, 0);
  applyTernary(op, b, c, height_, width_, offset);

  return 0;
}

/**
 * @brief Apply a ternary functor to a window of this, b and c.
 *
 * Forwards to the nine-argument overload with both vector flags set to
 * false_type(), i.e. c is bounds-checked as a full matrix.
 *
 * @return always 0.
 */
template
template
int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c,
                              int numRows, int numCols,
                              MatrixOffset& offset) {
  applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type());

  return 0;
}

/**
 * @brief Core ternary apply: runs op over a numRows x numCols window of this,
 *        b and c.
 *
 * this and b are always bounds-checked as full matrices. The two trailing
 * unnamed tag parameters select how c is bounds-checked (columns only when
 * cAsRowVector, rows only when cAsColVector, both when neither is set,
 * nothing when both are set).
 *
 * @param op       ternary functor handed to hl_{gpu,cpu}_apply_ternary_op.
 * @param b        second operand; same device as this.
 * @param c        third operand; same device as this.
 * @param numRows  number of rows of the window.
 * @param numCols  number of columns of the window.
 * @param offset   window origins for this (a*), b (b*) and c (c*).
 * @return always 0. CHECK-fails on sparse operands, device mismatch or an
 *         out-of-range window.
 */
template
template
int BaseMatrixT::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c,
                              int numRows, int numCols, MatrixOffset& offset,
                              cAsRowVector, cAsColVector) {
  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK_EQ(useGpu_, b.useGpu_);
  CHECK_EQ(useGpu_, c.useGpu_);

  int dimM = numRows;
  int dimN = numCols;
  int lda = stride_;
  int ldb = b.stride_;
  int ldc = c.stride_;

  T* A = data_;
  T* B = b.data_;
  T* C = c.data_;
  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_,
                           offset.aRow_);
  CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_,
                           offset.bRow_);
  CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_,
                           offset.cRow_);

  CHECK_LE(dimM + offset.aRow_, this->height_);
  CHECK_LE(dimN + offset.aCol_, this->width_);
  CHECK_LE(dimM + offset.bRow_, b.height_);
  CHECK_LE(dimN + offset.bCol_, b.width_);
  if (!cAsRowVector::value && !cAsColVector::value) {
    CHECK_LE(dimM + offset.cRow_, c.height_);
    CHECK_LE(dimN + offset.cCol_, c.width_);
  } else if (cAsRowVector::value && !cAsColVector::value) {
    CHECK_LE(dimN + offset.cCol_, c.width_);
  } else if (!cAsRowVector::value && cAsColVector::value) {
    CHECK_LE(dimM + offset.cRow_, c.height_);
  } else {
    // Both flags set: c's extent is intentionally left unchecked here.
  }

  if (true == useGpu_) {
    hl_gpu_apply_ternary_op
      (
        op, A, B, C, dimM, dimN, lda, ldb, ldc);
  } else {
    hl_cpu_apply_ternary_op
      (
        op, A, B, C, dimM, dimN, lda, ldb, ldc);
  }

  return 0;
}

/**
 * @brief Apply a quaternary functor to this, b, c and d over their full
 *        extent.
 *
 * @return always 0. CHECK-fails when any operand's height_/width_ differs
 *         from this matrix's.
 */
template
template
int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
                                 BaseMatrixT& d) {
  CHECK_EQ(height_, b.height_);
  CHECK_EQ(width_, b.width_);
  CHECK_EQ(height_, c.height_);
  CHECK_EQ(width_, c.width_);
  CHECK_EQ(height_, d.height_);
  CHECK_EQ(width_, d.width_);

  MatrixOffset offset(0, 0, 0, 0, 0, 0, 0, 0);
  applyQuaternary(op, b, c, d, height_, width_, offset);

  return 0;
}

/**
 * @brief Core quaternary apply: runs op over a numRows x numCols window of
 *        this, b, c and d. All four operands are bounds-checked as full
 *        matrices.
 *
 * @param op       functor handed to hl_{gpu,cpu}_apply_quaternary_op.
 * @param b,c,d    further operands; same device as this.
 * @param numRows  number of rows of the window.
 * @param numCols  number of columns of the window.
 * @param offset   window origins for the four operands.
 * @return always 0. CHECK-fails on sparse operands, device mismatch or an
 *         out-of-range window.
 */
template
template
int BaseMatrixT::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
                                 BaseMatrixT& d, int numRows, int numCols,
                                 MatrixOffset& offset) {
  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK(!d.isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK_EQ(useGpu_, b.useGpu_);
  CHECK_EQ(useGpu_, c.useGpu_);
  CHECK_EQ(useGpu_, d.useGpu_);

  int dimM = numRows;
  int dimN = numCols;
  int lda = stride_;
  int ldb = b.stride_;
  int ldc = c.stride_;
  int ldd = d.stride_;

  T* A = data_;
  T* B = b.data_;
  T* C = c.data_;
  T* D = d.data_;
  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_,
                           offset.aRow_);
  CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_,
                           offset.bRow_);
  CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_,
                           offset.cRow_);
  CAL_MATRIX_START_ADDRESS(D, d.height_, d.width_, ldd, offset.dCol_,
                           offset.dRow_);

  CHECK_LE(dimM + offset.aRow_, this->height_);
  CHECK_LE(dimN + offset.aCol_, this->width_);
  CHECK_LE(dimM + offset.bRow_, b.height_);
  CHECK_LE(dimN + offset.bCol_, b.width_);
  CHECK_LE(dimM + offset.cRow_, c.height_);
  CHECK_LE(dimN + offset.cCol_, c.width_);
  CHECK_LE(dimM + offset.dRow_, d.height_);
  CHECK_LE(dimN + offset.dCol_, d.width_);
  if (true == useGpu_) {
    hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
  } else {
    hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
  }

  return 0;
}

/**
 * @brief Aggregate a numRows x numCols window of b into this matrix.
 *
 * With aAsRowVector only, the column kernel (hl_*_matrix_column_op) is used;
 * with aAsColVector only, the row kernel (hl_*_matrix_row_op) is used. Any
 * other flag combination is a fatal error.
 *
 * @param agg      aggregation functor passed through to the kernel.
 * @param op       element functor passed through to the kernel.
 * @param sv       saver functor passed through to the kernel.
 * @param b        source matrix; same device as this.
 * @param numRows  number of rows of the window of b.
 * @param numCols  number of columns of the window of b.
 * @param offset   origins for the destination (a*) and for b (b*).
 * @return always 0.
 *
 * NOTE(review): unlike the apply* functions there are no sparse or window
 * bounds checks here -- confirm callers guarantee them.
 */
template
template
int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
                           int numRows, int numCols, MatrixOffset& offset,
                           aAsRowVector, aAsColVector) {
  CHECK_EQ(useGpu_, b.useGpu_);

  int ld = stride_;
  int ldb = b.stride_;

  T* dst = data_;
  T* B = b.data_;
  CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_,
                           offset.aRow_);
  CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_,
                           offset.bRow_);

  if (aAsRowVector::value && !aAsColVector::value) {
    if (useGpu_) {
      hl_gpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, ldb);
    } else {
      hl_cpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, ldb);
    }
  } else if (!aAsRowVector::value && aAsColVector::value) {
    if (useGpu_) {
      hl_gpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, ldb);
    } else {
      hl_cpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, ldb);
    }
  } else {
    LOG(FATAL) << "not supported";
  }

  return 0;
}

/**
 * @brief Two-source variant of aggregate(): combines windows of b and c and
 *        aggregates the result into this matrix.
 *
 * Kernel selection by aAsRowVector / aAsColVector is the same as in the
 * single-source overload; c and its stride are passed as extra arguments.
 *
 * @return always 0.
 */
template
template
int BaseMatrixT::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
                           BaseMatrixT& c, int numRows, int numCols,
                           MatrixOffset& offset, aAsRowVector,
                           aAsColVector) {
  CHECK_EQ(useGpu_, b.useGpu_);
  CHECK_EQ(useGpu_, c.useGpu_);

  int ld = stride_;
  int ldb = b.stride_;
  int ldc = c.stride_;

  T* dst = data_;
  T* B = b.data_;
  T* C = c.data_;
  CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_,
                           offset.aRow_);
  CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_,
                           offset.bRow_);
  CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_,
                           offset.cRow_);

  if (aAsRowVector::value && !aAsColVector::value) {
    if (useGpu_) {
      hl_gpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B,
                              ldb, C, ldc);
    } else {
      hl_cpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B,
                              ldb, C, ldc);
    }
  } else if (!aAsRowVector::value && aAsColVector::value) {
    if (useGpu_) {
      hl_gpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B,
                           ldb, C, ldc);
    } else {
      hl_cpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B,
                           ldb, C, ldc);
    }
  } else {
    LOG(FATAL) << "not supported";
  }

  return 0;
}

/**
 * @brief unary operator.
 *
 * Each DEFINE_MATRIX_UNARY[_PARAMETER]_OP(Name, ..., expr) is followed by a
 * member that passes unary::Name to applyUnary. The macros are defined
 * outside this file; presumably `a` is the element of this matrix and
 * p / p1 / p2 are the functor's constructor arguments -- confirm there.
 */

/** a = -a */
DEFINE_MATRIX_UNARY_OP(Neg, a = -a);
template
void BaseMatrixT::neg() { applyUnary(unary::Neg()); }

/** a = exp(a) */
DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a));
template<>
void BaseMatrixT::exp2() { applyUnary(unary::Exp()); }

/**
 * a = log(a). The CPU path calls vLog on height_ * width_ elements starting
 * at data_. NOTE(review): that ignores stride_; presumably it assumes
 * stride_ == width_ -- confirm.
 */
DEFINE_MATRIX_UNARY_OP(Log, a = log(a));
template<>
void BaseMatrixT::log2() {
  if (useGpu_) {
    applyUnary(unary::Log());
  } else {
    vLog(height_ * width_, data_, data_);
  }
}

/** a = sqrt(a) */
DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a));
template<>
void BaseMatrixT::sqrt2() { applyUnary(unary::Sqrt()); }

/** a = a * a */
DEFINE_MATRIX_UNARY_OP(Square, a = a * a);
template
void BaseMatrixT::square2() { applyUnary(unary::Square()); }

/** a = 1 / a */
DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a);
template
void BaseMatrixT::reciprocal2() { applyUnary(unary::Reciprocal()); }

/** a = |a| */
DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a);
template
void BaseMatrixT::abs2() { applyUnary(unary::Abs()); }

/** a = 1 if a > 0, -1 if a < 0, otherwise 0 */
DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0));
template
void BaseMatrixT::sign2() { applyUnary(unary::Sign()); }

/** a = 0 */
DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
template
void BaseMatrixT::zero() { applyUnary(unary::Zero()); }

/**
 * @brief Zero numColumns columns, over all rows, starting at columnOffset.
 *
 * @param columnOffset  first column to clear (passed as the offset's first
 *                      argument).
 * @param numColumns    number of columns to clear; stored in an int, so
 *                      values beyond int range would be truncated.
 */
template
void BaseMatrixT::zeroAtOffset(int64_t columnOffset, int64_t numColumns) {
  int numRows = height_;
  int numCols = numColumns;
  MatrixOffset offset(columnOffset, 0);
  applyUnary(unary::Zero(), numRows, numCols, offset);
}

/** a = 1 */
DEFINE_MATRIX_UNARY_OP(One, a = 1);
template
void BaseMatrixT::one() { applyUnary(unary::One()); }

/**
 * a = pow(a, p). The CPU path calls vPow on height_ * width_ elements
 * starting at data_. NOTE(review): that ignores stride_ -- confirm.
 */
DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p));
template<>
void BaseMatrixT::pow2(real p) {
  if (useGpu_) {
    applyUnary(unary::Pow(p));
  } else {
    vPow(height_ * width_, data_, p, data_);
  }
}

/** a -= p */
DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p);
template
void BaseMatrixT::subScalar(T p) { applyUnary(unary::SubScalar(p)); }

/** a *= p */
DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p);
template
void BaseMatrixT::mulScalar(T p) { applyUnary(unary::MulScalar(p)); }

/** a /= p */
DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p);
template
void BaseMatrixT::divScalar(T p) { applyUnary(unary::DivScalar(p)); }

/** a = p */
DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p);
template
void BaseMatrixT::assign(T p) { applyUnary(unary::Assign(p)); }

/** a += p */
DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p);
template
void BaseMatrixT::add(T p) { applyUnary(unary::Add(p)); }

/** a = a * p1 + p2 */
DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2);
template
void BaseMatrixT::add(T p1, T p2) { applyUnary(unary::Add2(p1, p2)); }

/** Clamp a to [p1, p2]: values below p1 become p1, values above p2 become p2. */
DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, TWO_PARAMETER,
                                 a = a < p1 ? p1 : (a > p2 ? p2 : a));
template
void BaseMatrixT::clip(T p1, T p2) { applyUnary(unary::Clip(p1, p2)); }

/** a = 1 if a > p, otherwise 0 */
DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, ONE_PARAMETER,
                                 a = a > p ? 1.0f : 0.0f);
template
void BaseMatrixT::biggerThanScalar(T p) {
  applyUnary(unary::BiggerThanScalar(p));
}

/** a = max(a, p) */
DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER,
                                 a = a > p ? a : p);
template
void BaseMatrixT::downClip(T p) { applyUnary(unary::DownClip(p)); }

/**
 * @brief binary operator.
 *
 * Each DEFINE_MATRIX_BINARY[_PARAMETER]_OP(Name, ..., expr) is followed by a
 * member that passes binary::Name to applyBinary. Presumably `a` is the
 * element of this matrix and `b` the matching element of the argument --
 * confirm against the macro definition.
 */

/** a += b */
DEFINE_MATRIX_BINARY_OP(Add, a += b);
template
void BaseMatrixT::add(BaseMatrixT& b) {
  applyBinary(binary::Add(), b);
}

/**
 * Specialization of add(b): the GPU path goes through applyBinary; the CPU
 * path checks that the dimensions match and calls vAdd on height_ * width_
 * elements of data_ and b.data_.
 */
template<>
void BaseMatrixT::add(BaseMatrixT& b) {
  if (useGpu_) {
    applyBinary(binary::Add(), b);
  } else {  // cpu branch
    CHECK_EQ(height_, b.height_);
    CHECK_EQ(width_, b.width_);
    vAdd(height_ * width_, data_, b.data_, data_);
  }
}

/**
 * @brief Add b into this matrix with a column shift.
 *
 * If b fits inside this matrix starting at columnOffset, all of b's columns
 * are added into this[:, columnOffset...]. Otherwise, if this matrix fits
 * inside b starting at columnOffset, b[:, columnOffset...] is added into the
 * whole of this matrix. Any other combination is a fatal error.
 */
template
void BaseMatrixT::addAtOffset(BaseMatrixT& b, int64_t columnOffset) {
  if (columnOffset + b.width_ <= width_) {
    int numRows = height_;
    int numCols = b.width_;
    MatrixOffset offset(columnOffset, 0, 0, 0);
    applyBinary(binary::Add(), b, numRows, numCols, offset);
  } else if (columnOffset + width_ <= b.width_) {
    int numRows = height_;
    int numCols = width_;
    MatrixOffset offset(0, 0, columnOffset, 0);
    applyBinary(binary::Add(), b, numRows, numCols, offset);
  } else {
    LOG(FATAL) << "Wrong argument "
               << " a.width=" << width_ << " b.width=" << b.width_
               << " columnOffset=" << columnOffset;
  }
}

/**
 * @brief a += b by calling the GPU binary kernel directly.
 *
 * width_ is passed as the leading dimension of both operands, and no
 * useGpu_, sparsity or dimension checks are made here.
 * NOTE(review): presumably intended for peer-to-peer GPU buffers -- confirm.
 */
template
void BaseMatrixT::addP2P(BaseMatrixT& b) {
  T* A = data_;
  T* B = b.data_;
  int dimM = height_;
  int dimN = width_;

  hl_gpu_apply_binary_op, 0, 0>
    (binary::Add(), A, B, dimM, dimN, dimN, dimN);
}

/** a += b with b bounds-checked as a column vector (rows only). */
template
void BaseMatrixT::addColVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::Add(), b, numRows, numCols, offset, false_type(),
              true_type() /* bAsColVector */);
}

/** a += b with b bounds-checked as a row vector (columns only). */
template
void BaseMatrixT::addRowVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::Add(), b, numRows, numCols, offset,
              true_type() /* bAsRowVector */, false_type());
}

/** a += b * p (functor used by the members defined right after this). */
DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p);
template void BaseMatrixT::add(BaseMatrixT& b, T p) { applyBinary(binary::Add1(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p)); template<> void BaseMatrixT::pow2(BaseMatrixT& b, real p) { if (useGpu_) { applyBinary(binary::Pow(p), b); } else { vPow(height_ * width_, b.data_, p, data_); } } DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b); template void BaseMatrixT::add(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::Add2(p1, p2), b); } template void BaseMatrixT::addBias(BaseMatrixT& b, T scale) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; applyBinary(binary::Add1(scale), b, numRows, numCols, offset, true_type() /* bAsRowVector */, false_type()); } DEFINE_MATRIX_BINARY_OP(Sub, a -= b); template void BaseMatrixT::sub(BaseMatrixT& b) { applyBinary(binary::Sub(), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p); template void BaseMatrixT::sub(BaseMatrixT& b, T p) { applyBinary(binary::Sub1(p), b); } DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? a : 0.0f); template void BaseMatrixT::relu(BaseMatrixT& b) { applyBinary(binary::Relu(), b); } DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f)); template void BaseMatrixT::reluDerivative(BaseMatrixT& b) { applyBinary(binary::ReluDerivative(), b); } DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0; b = log(1.0 + exp((a > THRESHOLD) ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a)))); template<> void BaseMatrixT::softrelu(BaseMatrixT& b) { applyBinary(binary::Softrelu(), b); } DEFINE_MATRIX_BINARY_OP( SoftreluDerivative, const T THRESHOLD = 40.0; a *= (1.0 - exp(-1.0 * ((b > THRESHOLD) ? THRESHOLD : ((b < -THRESHOLD) ? (-THRESHOLD) : b))))); template<> void BaseMatrixT::softreluDerivative(BaseMatrixT& b) { applyBinary(binary::SoftreluDerivative(), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1; b = b < p2 ? 
b : p2); template void BaseMatrixT::brelu(BaseMatrixT& b) { int p1 = 0, p2 = 24; //! TODO(yuyang18): Make p1,p2 configuable. applyBinary(binary::Brelu(p1, p2), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, TWO_PARAMETER, a *= (b > p1 && b < p2) ? 1.0 : 0.0); template void BaseMatrixT::breluDerivative(BaseMatrixT& b) { int p1 = 0, p2 = 24; applyBinary(binary::BreluDerivative(p1, p2), b); } DEFINE_MATRIX_BINARY_OP(Square, b = a * a); template void BaseMatrixT::square2(BaseMatrixT& b) { applyBinary(binary::Square(), b); } DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b); template void BaseMatrixT::squareDerivative(BaseMatrixT& b) { applyBinary(binary::SquareDerivative(), b); } DEFINE_MATRIX_BINARY_OP(Tanh, b = 2.0 / (1.0 + exp(-2 * a)) - 1.0); template<> void BaseMatrixT::tanh(BaseMatrixT& b) { applyBinary(binary::Tanh(), b); } DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b); template void BaseMatrixT::tanhDerivative(BaseMatrixT& b) { applyBinary(binary::TanhDerivative(), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanh, TWO_PARAMETER, b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)); template<> void BaseMatrixT::scaledTanh(BaseMatrixT& b, real p1, real p2) { applyBinary(binary::ScaledTanh(p1, p2), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, TWO_PARAMETER, a *= p2 * (p1 - b * b)); template void BaseMatrixT::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::ScaledTanhDerivative(p1 * p1, p2 / p1), b); } DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a); template void BaseMatrixT::reciprocal2(BaseMatrixT& b) { applyBinary(binary::Reciprocal(), b); } DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b); template void BaseMatrixT::reciprocalDerivative(BaseMatrixT& b) { applyBinary(binary::ReciprocalDerivative(), b); } DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? 
a : -a); template void BaseMatrixT::abs2(BaseMatrixT& b) { applyBinary(binary::Abs(), b); } DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0); template void BaseMatrixT::absDerivative(BaseMatrixT& b) { applyBinary(binary::AbsDerivative(), b); } DEFINE_MATRIX_BINARY_OP( Sigmoid, const T THRESHOLD_MIN = -40.0; const T THRESHOLD_MAX = 13.0; T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a); b = 1.0f / (1.0f + exp(-tmp))); template<> void BaseMatrixT::sigmoid(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Sigmoid(), b); } else { // cpu versioni size_t numSamples = this->height_; size_t dim = this->width_; CHECK_EQ(b.height_, numSamples); CHECK_EQ(b.width_, dim); const real* in = this->data_; real* out = b.data_; // out = - in const float THRESHOLD_MIN = -40.0; // make sure sigmoid(x) > 0 const float THRESHOLD_MAX = 13.0; // make sure sigmoid(x) < 1 for (size_t i = 0; i < numSamples * dim; ++i) { real tmp = in[i]; tmp = (tmp < THRESHOLD_MIN) ? THRESHOLD_MIN : ((tmp > THRESHOLD_MAX) ? THRESHOLD_MAX : tmp); out[i] = -tmp; } // out = exp(out) vExp(numSamples * dim, out, out); // out = 1 / (1 + out) for (size_t i = 0; i < numSamples * dim; ++i) { out[i] = 1 / (1 + out[i]); } } } DEFINE_MATRIX_BINARY_OP(SigmoidDerivative, a *= b * (1 - b)); template void BaseMatrixT::sigmoidDerivative(BaseMatrixT& b) { applyBinary(binary::SigmoidDerivative(), b); } DEFINE_MATRIX_BINARY_OP(ExpDerivative, a *= b); template void BaseMatrixT::expDerivative(BaseMatrixT& b) { applyBinary(binary::ExpDerivative(), b); } DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 
1.0f : -1.0f); template void BaseMatrixT::sign2(BaseMatrixT& b) { applyBinary(binary::Sign(), b); } DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b)); template<> void BaseMatrixT::exp2(BaseMatrixT& b) { applyBinary(binary::Exp(), b); } DEFINE_MATRIX_BINARY_OP(Log, a = log(b)); template<> void BaseMatrixT::log2(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Log(), b); } else { vLog(height_ * width_, b.data_, data_); } } DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b)); template<> void BaseMatrixT::sqrt2(BaseMatrixT& b) { applyBinary(binary::Sqrt(), b); } DEFINE_MATRIX_BINARY_OP(InvSqrt, a = 1.0f / sqrt(b)); template<> void BaseMatrixT::invSqrt(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::InvSqrt(), b); } else { // cpu branch CHECK_EQ(height_, b.height_); CHECK_EQ(width_, b.width_); vInvSqrt(height_ * width_, b.data_, data_); } } DEFINE_MATRIX_BINARY_PARAMETER_OP(IsEqual, ONE_PARAMETER, a = (b == p)); template void BaseMatrixT::isEqualTo(BaseMatrixT& b, T value) { applyBinary(binary::IsEqual(value), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(AddScalar, ONE_PARAMETER, a = b + p); template void BaseMatrixT::addScalar(BaseMatrixT& b, T p) { applyBinary(binary::AddScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a = b - p); template void BaseMatrixT::subScalar(BaseMatrixT& b, T p) { applyBinary(binary::SubScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a = b * p); template void BaseMatrixT::mulScalar(BaseMatrixT& b, T p) { applyBinary(binary::MulScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a = b / p); template void BaseMatrixT::divScalar(BaseMatrixT& b, T p) { applyBinary(binary::DivScalar(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(ScalarDiv, ONE_PARAMETER, a = p / b); template void BaseMatrixT::scalarDiv(BaseMatrixT& b, T p) { applyBinary(binary::ScalarDiv(p), b); } /** * @brief ternary operator. 
* */ DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy, a = -c * log(b) - (1 - c) * log(1 - b)); template<> void BaseMatrixT::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::SoftCrossEntropy(), b, c); } DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b))); template void BaseMatrixT::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::SoftCrossEntropyBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy, a = c > 0.5 ? -log(b) : -log(1.0 - b)); template<> void BaseMatrixT::binaryLabelCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) { if (useGpu_) { applyTernary(ternary::BinaryCrossEntropy(), b, c); } else { CHECK_EQ(height_, b.height_); CHECK_EQ(height_, c.height_); CHECK_EQ(width_, b.width_); CHECK_EQ(width_, c.width_); size_t size = height_ * width_; real* out = b.data_; real* label = c.data_; real* cost = data_; for (size_t i = 0; i < size; ++i) { cost[i] = label[i] > 0.5 ? out[i] : 1.0 - out[i]; } vLog(size, cost, cost); for (size_t i = 0; i < size; ++i) { cost[i] *= -1.0; } } } DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp, a += c > 0.5 ? 
-1.0 / b : 1.0 / (1.0 - b)); template void BaseMatrixT::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::BinaryCrossEntropyBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(Add, a = b + c); template void BaseMatrixT::add(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Add(), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c); template void BaseMatrixT::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { applyTernary(ternary::Add1(p1, p2), b, c); } DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c); template void BaseMatrixT::sub(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Sub(), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c); template void BaseMatrixT::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) { applyTernary(ternary::Sub1(p1, p2), b, c); } DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c); template void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Add2(), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, THREE_PARAMETER, a = p1 * a + p2 * b + p3 * c); template void BaseMatrixT::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { applyTernary(ternary::Add3(p1, p2, p3), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER, c = p2 * c - p1 * (b + p3 * a); a = a + c); template void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad BaseMatrixT& c, // mom T p1, // learningRate, T p2, // momentum, T p3) { // decayRate applyTernary(ternary::SgdUpdate(p1, p2, p3), b, c); } DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER, c = p2 * c - p1 * d * (b + p3 * a); a += c); template void BaseMatrixT::sgdUpdate(BaseMatrixT& b, // grad, BaseMatrixT& c, // mom, BaseMatrixT& d, // lr, T p1, // learningRate, T p2, // momentum, T p3) { // decayRate applyQuaternary(quaternary::SgdUpdate(p1, p2, p3), b, c, d); } DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b; a = (a > lambda) ? 
(a - lambda) : (a < -lambda) ? (a + lambda) : 0); template void BaseMatrixT::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) { applyBinary(binary::ApplyL1(learningRate * decayRate), lr); } template<> void BaseMatrixT::applyL1(BaseMatrixT& lr, real learningRate, real decayRate) { if (useGpu_) { applyBinary(binary::ApplyL1(learningRate * decayRate), lr); } else { simd::decayL1(this->data_, this->data_, lr.data_, learningRate * decayRate, height_ * width_); } } DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p; a = (a > lambda) ? (a - lambda) : (a < -lambda) ? (a + lambda) : 0); template void BaseMatrixT::applyL1(T learningRate, T decayRate) { applyUnary(unary::ApplyL1(learningRate * decayRate)); } template<> void BaseMatrixT::applyL1(real learningRate, real decayRate) { if (useGpu_) { applyUnary(unary::ApplyL1(learningRate * decayRate)); } else { simd::decayL1(this->data_, this->data_, learningRate * decayRate, height_ * width_); } } DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, ONE_PARAMETER, a *= (1.0f / (1.0f + p * b))); template void BaseMatrixT::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) { if (useGpu_) { applyBinary(binary::ApplyL2(learningRate * decayRate), lr); } else { size_t size = this->height_ * this->width_; T decay = learningRate * decayRate; for (size_t j = 0; j < size; ++j) { this->data_[j] *= 1.0f / (1.0f + decay * lr.data_[j]); } } } template void BaseMatrixT::applyL2(T learningRate, T decayRate) { BaseMatrixT::mulScalar(1.0f / (1.0f + learningRate * decayRate)); } DEFINE_MATRIX_BINARY_OP(DotMul, a *= b); template void BaseMatrixT::dotMul(BaseMatrixT& b) { applyBinary(binary::DotMul(), b); } DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c); template void BaseMatrixT::dotMul(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotMul(), b, c); } DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 
0.0 : b / c); template void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotDiv(), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, TWO_PARAMETER, a = (b + p1) / (c + p2)); template void BaseMatrixT::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotDiv2P(p1, p2), b, c); } DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c; a = (a > THRESHOLD) ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a); a = log(1 + exp(a)) - a * d); template<> void BaseMatrixT::rankLoss(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { applyQuaternary(quaternary::RankLoss(), b, c, d); } DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c; a = (a > THRESHOLD) ? THRESHOLD : ((a < -THRESHOLD) ? (-THRESHOLD) : a); a = exp(a); a = (a / (1 + a) - d)); template<> void BaseMatrixT::rankLossBp(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { applyQuaternary(quaternary::RankLossBp(), b, c, d); } /* this = log(1 + exp(b)) - c * b */ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0; T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) ? -THRESHOLD : b; a = log(1 + exp(x)) - c * x); template<> void BaseMatrixT::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::LogisticRegressionLoss(), b, c); } /* this = exp(b)/(1+exp(b)) - c */ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0; T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD) ? -THRESHOLD : b; x = exp(x); a = x / (1 + x) - c); template<> void BaseMatrixT::logisticRegressionLossBp(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::LogisticRegressionLossBp(), b, c); } DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f); template void BaseMatrixT::biggerThan(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::BiggerThan(), b, c); } DEFINE_MATRIX_QUATERNARY_OP( BiggerThan, a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 
1.0f : 0.0f); template void BaseMatrixT::biggerThan(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d) { applyQuaternary(quaternary::BiggerThan(), b, c, d); } DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c); template void BaseMatrixT::max2(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::Max(), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, ONE_PARAMETER, c += ((a > p) == (b > p)) ? 0.0f : 1.0f); template void BaseMatrixT::binaryClassificationError2(size_t destCol, BaseMatrixT& b, BaseMatrixT& c, T p) { CHECK(!useGpu_) << "do not support gpu"; MatrixOffset offset(0, 0, 0, 0, destCol, 0); int numRows = b.height_; int numCols = b.width_; b.applyTernary(ternary::BinaryClassificationError(p), c, *this, numRows, numCols, offset, false_type(), true_type() /*cAsColVector*/); } template<> void BaseMatrixT::binaryClassificationError(size_t destCol, BaseMatrixT& b, BaseMatrixT& c, real p) { MatrixOffset offset(destCol, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; aggregate(aggregate::sum(), base::binary::classificationError(p), base::binary::add(), b, c, numRows, numCols, offset, false_type(), true_type() /*aAsColVector*/); } DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, THREE_PARAMETER, a = p1 * b + p2 * c + p3 * d); template void BaseMatrixT::add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3) { applyQuaternary(quaternary::Add3(p1, p2, p3), b, c, d); } DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c); template void BaseMatrixT::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotMulSquare(), b, c); } DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c); template void BaseMatrixT::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) { applyTernary(ternary::DotSquareSquare(), b, c); } DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b); template void BaseMatrixT::dotMulSquare(BaseMatrixT& b) { applyBinary(binary::DotMulSquare(), b); } DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a 
* b); template void BaseMatrixT::dotSquareMul(BaseMatrixT& b) { applyBinary(binary::DotSquareMul(), b); } DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, THREE_PARAMETER, T tmp = p1 * b + p2 * c + p3 * d; a += tmp * tmp); template void BaseMatrixT::addSquareSum(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) { applyQuaternary(quaternary::AddSquareSum(p1, p2, p3), b, c, d); } DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b); template void BaseMatrixT::addSquare(BaseMatrixT& b, T p) { applyBinary(binary::AddSquare(p), b); } DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, TWO_PARAMETER, a = p1 * a + p2 * b * b); template void BaseMatrixT::decayAddSquare(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::DecayAddSquare(p1, p2), b); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, TWO_PARAMETER, a = p1 * a + p2 * b * b * c * c); template void BaseMatrixT::decayAddSquareMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DecayAddSquareMul(p1, p2), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, THREE_PARAMETER, a = 1 / (p1 * b + p2 * c + p3)); template void BaseMatrixT::reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) { applyTernary(ternary::ReciprocalSum(p1, p2, p3), b, c); } DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER, a = 1 / (p1 * b + p2)); template void BaseMatrixT::reciprocal2(BaseMatrixT& b, T p1, T p2) { applyBinary(binary::Reciprocal2(p1, p2), b); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, TWO_PARAMETER, T tmp = p1 * b + p2 * c; a *= tmp * tmp); template void BaseMatrixT::dotMulSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotMulSquareSum(p1, p2), b, c); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, TWO_PARAMETER, T tmp = p1 * b + p2 * c; a = tmp * tmp); template void BaseMatrixT::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotSquareSum(p1, p2), b, c); } 
DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, TWO_PARAMETER, a *= p1 * b + p2 * c); template void BaseMatrixT::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::DotMulSum(p1, p2), b, c); } DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0); template void BaseMatrixT::copyAndClear(BaseMatrixT& b) { applyBinary(binary::CopyAndClear(), b); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, TWO_PARAMETER, a = p1 * a + p2 * b * c); template void BaseMatrixT::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) { applyTernary(ternary::AddDotMul(p1, p2), b, c); } DEFINE_MATRIX_BINARY_OP(Assign, a = b;); template void BaseMatrixT::assign(BaseMatrixT& b) { if (useGpu_) { applyBinary(binary::Assign(), b); } else { // cpu version CHECK_EQ(this->height_, b.height_); CHECK_EQ(this->width_, b.width_); memcpy(data_, b.data_, sizeof(T) * height_ * width_); } } template void BaseMatrixT::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) { if (columnOffset + b.width_ <= width_) { int numRows = height_; int numCols = b.width_; MatrixOffset offset(columnOffset, 0, 0, 0); applyBinary(binary::Assign(), b, numRows, numCols, offset); } else if (columnOffset + width_ <= b.width_) { int numRows = height_; int numCols = width_; MatrixOffset offset(0, 0, columnOffset, 0); applyBinary(binary::Assign(), b, numRows, numCols, offset); } else { LOG(FATAL) << "Wrong argument " << " a.width=" << width_ << " b.width=" << b.width_ << " columnOffset=" << columnOffset; } } template<> void BaseMatrixT::rowDotMul(size_t destCol, BaseMatrixT& b, BaseMatrixT& c) { int numRows = b.height_; int numCols = b.width_; MatrixOffset offset(destCol, 0, 0, 0, 0, 0); aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c, numRows, numCols, offset, false_type(), true_type() /*aAsColVector*/); } template void BaseMatrixT::rowDotMul2(size_t destCol, BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; size_t height = this->height_; CHECK_LT(destCol, 
this->width_); CHECK_EQ(height, b.height_); CHECK_EQ(height, c.height_); CHECK_EQ(b.width_, c.width_); size_t width = b.width_; T* A = this->data_; const T* B = b.data_; const T* C = c.data_; for (size_t i = 0; i < height; ++i, A += this->width_, B += width, C += width) { for (size_t j = 0; j < width; ++j) { A[destCol] += B[j] * C[j]; } } } template<> void BaseMatrixT::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c, numRows, numCols, offset, true_type() /*aAsRowVector*/, false_type()); } template void BaseMatrixT::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; CHECK_EQ(height_, 1LU); CHECK_EQ(b.height_, c.height_); CHECK_EQ(width_, b.width_); CHECK_EQ(width_, c.width_); size_t height = b.height_; size_t width = b.width_; T* A = this->data_; const T* B = b.data_; const T* C = c.data_; for (size_t i = 0; i < height; ++i, B += width, C += width) { for (size_t j = 0; j < width; ++j) { A[j] += B[j] * C[j]; } } } DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c); template void BaseMatrixT::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = height_; int numCols = width_; applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, true_type() /*cAsRowVector*/, false_type()); } template void BaseMatrixT::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; CHECK_EQ(c.height_, 1LU); CHECK_EQ(height_, b.height_); CHECK_EQ(width_, b.width_); CHECK_EQ(width_, c.width_); size_t height = height_; size_t width = width_; T* A = this->data_; const T* B = b.data_; const T* C = c.data_; for (size_t i = 0; i < height; ++i, A += width, B += width) { for (size_t j = 0; j < width; ++j) { A[j] += B[j] * C[j]; } } } template void BaseMatrixT::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) 
{ MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; applyTernary(ternary::DotMul(), b, c, numRows, numCols, offset, false_type(), true_type() /*cAsColVector*/); } template void BaseMatrixT::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { CHECK(!useGpu_) << "do not support gpu"; size_t height = this->height_; size_t width = this->width_; CHECK_EQ(height, b.height_); CHECK_EQ(width, b.width_); CHECK_LT(cCol, c.width_); CHECK_EQ(height, c.height_); T* A = this->data_; const T* B = b.data_; const T* C = c.data_; for (size_t i = 0; i < height; ++i, A += width, B += width, C += c.width_) { for (size_t j = 0; j < width; ++j) { A[j] = B[j] * C[cCol]; } } } template void BaseMatrixT::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, cRow); int numRows = height_; int numCols = width_; applyTernary(ternary::DotMul(), b, c, numRows, numCols, offset, true_type() /* cAsRowVector */, false_type() /* cAsColVector */); } template void BaseMatrixT::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, 0, cRow); int numRows = height_; int numCols = width_; applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, true_type() /* cAsRowVector */, false_type() /* cAsColVector */); } template void BaseMatrixT::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; applyTernary(ternary::addDotMulMMV(), b, c, numRows, numCols, offset, false_type(), true_type() /*cAsColVector*/); } DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c); template void BaseMatrixT::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; applyTernary(ternary::RowAdd(p), b, c, numRows, numCols, offset, false_type(), true_type() /*cAsColVector*/); } DEFINE_MATRIX_TERNARY_OP(RowPow, a 
= pow(b, c)); template<> void BaseMatrixT::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) { if (useGpu_) { MatrixOffset offset(0, 0, 0, 0, cCol, 0); int numRows = height_; int numCols = width_; applyTernary(ternary::RowPow(), b, c, numRows, numCols, offset, false_type(), true_type() /*cAsColVector*/); } else { size_t height = this->height_; size_t width = this->width_; CHECK_EQ(height, b.height_); CHECK_EQ(width, b.width_); CHECK_LT(cCol, c.width_); CHECK_EQ(height, c.height_); real* A = this->data_; const real* B = b.data_; const real* C = c.data_; for (size_t i = 0; i < height; ++i, A += width, B += width, C += c.width_) { vPow(width, B, C[cCol], A); } } } template void BaseMatrixT::mulRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; applyBinary(binary::DotMul(), b, numRows, numCols, offset, true_type() /* bAsRowVector */, false_type()); } DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b); template void BaseMatrixT::divRowVector(BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0); int numRows = height_; int numCols = width_; applyBinary(binary::DotDiv(), b, numRows, numCols, offset, true_type() /* bAsRowVector */, false_type()); } template<> template int BaseMatrixT::applyRow(Agg agg, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, numCols, offset, false_type(), true_type() /*aAsColVector*/); return 0; } template<> template int BaseMatrixT::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, false_type(), true_type() /*aAsColVector*/); return 0; } template<> template int BaseMatrixT::applyCol(Agg agg, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; aggregate(agg, 
base::unary::identity(), base::binary::second(), b, numRows, numCols, offset, true_type() /*aAsRowVector*/, false_type()); return 0; } template<> template int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); int numRows = b.height_; int numCols = b.width_; aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, true_type() /*aAsRowVector*/, false_type()); return 0; } template<> void BaseMatrixT::sumRows(BaseMatrixT& b) { applyRow(aggregate::sum(), b); } template<> void BaseMatrixT::maxRows(BaseMatrixT& b) { applyRow(aggregate::max(), b); } template<> void BaseMatrixT::minRows(BaseMatrixT& b) { applyRow(aggregate::min(), b); } template<> void BaseMatrixT::sumCols(BaseMatrixT& b) { applyCol(aggregate::sum(), b); } template<> void BaseMatrixT::maxCols(BaseMatrixT& b) { applyCol(aggregate::max(), b); } template<> void BaseMatrixT::minCols(BaseMatrixT& b) { applyCol(aggregate::min(), b); } template<> void BaseMatrixT::sumCols(BaseMatrixT& b, real scale) { applyCol(aggregate::sum(), base::binary::add2(1.0, scale), b); } template<> void BaseMatrixT::sumOfSquares(BaseMatrixT& b, BaseMatrixT& c) { int numRows = b.height_; int numCols = b.width_; MatrixOffset offset(0, 0, 0, 0, 0, 0); aggregate(aggregate::sum(), base::binary::squaredDiff(), base::binary::add(), b, c, numRows, numCols, offset, false_type(), true_type() /*aAsColVector*/); } template class BaseMatrixT; template class BaseMatrixT; } // namespace paddle