Matrix

Base

Defines

CAL_MATRIX_START_ADDRESS(address, height, width, ld, col, row)

Calculate matrix element address.

For instance, address of A[i][j] = i * ld + j.

namespace paddle

Typedefs

typedef bool_constant<bool, false> false_type
typedef bool_constant<bool, true> true_type
typedef BaseMatrixT<real> BaseMatrix
typedef BaseMatrixT<int> IBaseMatrix
template <class T, T v>
struct bool_constant

Public Static Attributes

const T value
class MatrixOffset

Public Functions

MatrixOffset(size_t aCol = 0, size_t aRow = 0, size_t bCol = 0, size_t bRow = 0, size_t cCol = 0, size_t cRow = 0, size_t dCol = 0, size_t dRow = 0)

Public Members

size_t aCol_
size_t aRow_
size_t bCol_
size_t bRow_
size_t cCol_
size_t cRow_
size_t dCol_
size_t dRow_
template <class T>
class BaseMatrixT

Subclassed by paddle::BaseVector< T >, paddle::Matrix

Public Functions

virtual ~BaseMatrixT()
BaseMatrixT(size_t height, size_t width, T *data, bool trans, bool useGpu)
BaseMatrixT(BaseMatrixT &mat, bool useGpu)

Note
This constructor is for temporarily making a matrix with a different useGpu flag from the original matrix, so that mixed GPU/CPU operations can be performed successfully.

BaseMatrixT(size_t height, size_t width, size_t stride, T *data, bool trans, bool use_gpu)
void setData(T *data)

caller should make sure that the size of data is at least height*width

template <class Op>
int applyUnary(Op op)

unary operator: element wise op(a).

for 0 <= i < this->height_ & for 0 <= j < this->width_.

template <class Op>
int applyUnary(Op op, int numRows, int numCols, MatrixOffset &offset)

unary operator: element wise op(a).

for 0 <= i < numRows & for 0 <= j < numCols.
While matrix start address is:
 A = this->data_ + offset.aRow_*ld + offset.aCol_;

template <class Op>
int applyBinary(Op op, BaseMatrixT &b)

binary operator: element wise op(a, b).

for 0 <= i < this->height_ & for 0 <= j < this->width_.
While this->height_ == b.height_ && this->width_ == b.width_.

template <class Op, class bAsRowVector, class bAsColVector>
int applyBinary(Op op, BaseMatrixT &b, int numRows, int numCols, MatrixOffset &offset, bAsRowVector, bAsColVector)

binary operator: element wise op(a, b)

for 0 <= i < numRows & for 0 <= j < numCols.
While matrix start address is:
  A = this->data_ + offset.aRow_*lda + offset.aCol_;
  B = b->data_ + offset.bRow_*ldb + offset.bCol_;

if (bAsRowVector == false_type && bAsColVector == false_type)
  op(A[i * lda + j], B[i * ldb + j])

if (bAsRowVector == true_type && bAsColVector == false_type)
  op(A[i * lda + j], B[j])

if (bAsRowVector == false_type && bAsColVector == true_type)
  op(A[i * lda + j], B[i * ldb])

if (bAsRowVector == true_type && bAsColVector == true_type)
  op(A[i * lda + j], B[0])

template <class Op>
int applyBinary(Op op, BaseMatrixT &b, int numRows, int numCols, MatrixOffset &offset)
template <class Op>
int applyTernary(Op op, BaseMatrixT &b, BaseMatrixT &c)

ternary operator: element wise op(a, b, c).

for 0 <= i < this->height_ & for 0 <= j < this->width_.

While this->height_ == b.height_ && this->width_ == b.width_
   && this->height_ == c.height_ && this->width_ == c.width_

template <class Op, class cAsRowVector, class cAsColVector>
int applyTernary(Op op, BaseMatrixT &b, BaseMatrixT &c, int numRows, int numCols, MatrixOffset &offset, cAsRowVector, cAsColVector)

ternary operator: element wise op(a, b, c).

for 0 <= i < numRows & for 0 <= j < numCols.
While matrix start address is:

  A = this->data_ + offset.aRow_*lda + offset.aCol_;
  B = b->data_ + offset.bRow_*ldb + offset.bCol_;
  C = c->data_ + offset.cRow_*ldc + offset.cCol_;

  if (cAsRowVector == false_type && cAsColVector == false_type)
    op(A[i*lda + j], B[i*ldb + j], C[i*ldc + j])

  if (cAsRowVector == true_type && cAsColVector == false_type)
    op(A[i*lda + j], B[i*ldb + j], C[j])

  if (cAsRowVector == false_type && cAsColVector == true_type)
    op(A[i*lda + j], B[i*ldb + j], C[i*ldc])

  if (cAsRowVector == true_type && cAsColVector == true_type)
    op(A[i*lda + j], B[i*ldb + j], C[0])

template <class Op>
int applyTernary(Op op, BaseMatrixT &b, BaseMatrixT &c, int numRows, int numCols, MatrixOffset &offset)
template <class Op>
int applyQuaternary(Op op, BaseMatrixT &b, BaseMatrixT &c, BaseMatrixT &d)

quaternary operator: element wise op(a, b, c, d).

for 0 <= i < this->height_ & for 0 <= j < this->width_.

While this->height_ == b.height_ && this->width_ == b.width_
   && this->height_ == c.height_ && this->width_ == c.width_
   && this->height_ == d.height_ && this->width_ == d.width_

template <class Op>
int applyQuaternary(Op op, BaseMatrixT &b, BaseMatrixT &c, BaseMatrixT &d, int numRows, int numCols, MatrixOffset &offset)

quaternary operator: element wise op(a, b, c, d).

for 0 <= i < numRows & for 0 <= j < numCols.
While matrix start address is:
   A = this->data_ + offset.aRow_*lda + offset.aCol_;
   B = b->data_ + offset.bRow_*ldb + offset.bCol_;
   C = c->data_ + offset.cRow_*ldc + offset.cCol_;
   D = d->data_ + offset.dRow_*ldd + offset.dCol_;

template <class Agg, class Op, class Saver, class aAsRowVector, class aAsColVector>
int aggregate(Agg agg, Op op, Saver sv, BaseMatrixT &b, int numRows, int numCols, MatrixOffset &offset, aAsRowVector, aAsColVector)

An aggregate expression applied to each row (or column) of matrix b. op and sv are element-wise operators.

if (aAsRowVector == true_type && aAsColVector == false_type)
 for each column j & 0 <= i < numRows, do:
   dst = agg(op(b[i*ldb + j]))
   a[j] = sv(a[j], dst)

if (aAsRowVector == false_type && aAsColVector == true_type)
 for each row i & 0 <= j < numCols, do:
   dst = agg(op(b[i*ldb + j]))
   a[i] = sv(a[i], dst)

template <class Agg, class Op, class Saver, class aAsRowVector, class aAsColVector>
int aggregate(Agg agg, Op op, Saver sv, BaseMatrixT &b, BaseMatrixT &c, int numRows, int numCols, MatrixOffset &offset, aAsRowVector, aAsColVector)

An aggregate expression applied to each row (or column) of matrices b and c.

op and sv are element-wise operators.

if (aAsRowVector == true_type && aAsColVector == false_type)
  for each column j & 0 <= i < numRows, do:
    dst = agg(op(b[i*ldb + j], c[i*ldc + j]))
    a[j] = sv(a[j], dst)

if (aAsRowVector == false_type && aAsColVector == true_type)
  for each row i & 0 <= j < numCols, do:
    dst = agg(op(b[i*ldb + j], c[i*ldc + j]))
    a[i] = sv(a[i], dst)

template <class Agg>
int applyRow(Agg agg, BaseMatrixT &b)

An aggregate expression applied to each row of matrix b.

for each row i & 0 <= j < b.width_, do:
  this[i] = agg(b[i*ldb + j])

template <class Agg, class Saver>
int applyRow(Agg agg, Saver sv, BaseMatrixT &b)

An aggregate expression applied to each row of matrix b.

for each row i & 0 <= j < b.width_, do:
  dst = agg(b[i*ldb + j])
  this[i] = sv(this[i], dst)

template <class Agg>
int applyCol(Agg agg, BaseMatrixT &b)

An aggregate expression applied to each column of matrix b.

for each column j & 0 <= i < b.height_, do:
  this[j] = agg(b[i*ldb + j])

template <class Agg, class Saver>
int applyCol(Agg agg, Saver sv, BaseMatrixT &b)

An aggregate expression applied to each column of matrix b.

for each column j & 0 <= i < b.height_, do:
  dst = agg(b[i*ldb + j])
  this[j] = sv(this[j], dst)

bool useGpu() const
const T *rowBuf(size_t row) const
T *rowBuf(size_t row)
void neg()

unary operator.

void exp()
void pow(T p)
void log()
void sqrt()
void square()
void reciprocal()
void abs()
void sign()
void zero()
void zeroAtOffset(int64_t columnOffset, int64_t numColumns)

this(row, col + columnOffset) = 0 for 0 <= col < numColumns

void one()
void subScalar(T p)
void mulScalar(T p)
void divScalar(T p)
void assign(T p)

this = p

void add(T p)

this = this + p

void add(T p1, T p2)

this = this*p1 + p2

void clip(T p1, T p2)

this = this < low ? low : this

this = this > high ? high : this

void biggerThanScalar(T p)

a = a > p ? 1.0f : 0.0f

void downClip(T p)

a = a > p ? a : p

void assign(BaseMatrixT &b)

this = b

void assignAtOffset(BaseMatrixT &b, int64_t columnOffset)

If b.width + columnOffset <= this.width
 this(row, col + columnOffset) = b(row, col) for 0 <= col < b.width

If this.width + columnOffset <= b.width
 this(row, col) = b(row, col + columnOffset) for 0 <= col < this.width

Otherwise, FATAL

void add(BaseMatrixT &b)

this = this + b

void addAtOffset(BaseMatrixT &b, int64_t columnOffset)

If b.width + columnOffset <= this.width
 this(row, col + columnOffset) += b(row, col) for 0 <= col < b.width

If this.width + columnOffset <= b.width
 this(row, col) += b(row, col + columnOffset) for 0 <= col < this.width

Otherwise, FATAL

void addColVector(BaseMatrixT &b)
void addRowVector(BaseMatrixT &b)
void addBias(BaseMatrixT &b, T scale)
void mulRowVector(BaseMatrixT &b)
void divRowVector(BaseMatrixT &b)
void addP2P(BaseMatrixT &b)
void add(BaseMatrixT &b, T p)

this = this + b*p

void add(BaseMatrixT &b, T p1, T p2)

this = p1*this + p2*b

void sub(BaseMatrixT &b)

this = this - b

void sub(BaseMatrixT &b, T p)

this = this - b*p

void relu(BaseMatrixT &b)

b = max(0, this)

void reluDerivative(BaseMatrixT &b)
void softrelu(BaseMatrixT &b)

b = log(1.0 + exp(this))

void softreluDerivative(BaseMatrixT &b)
void brelu(BaseMatrixT &b)

b = min(max(this, p1), p2)

void breluDerivative(BaseMatrixT &b)
void square(BaseMatrixT &b)

b = this * this

void squareDerivative(BaseMatrixT &b)
void tanh(BaseMatrixT &b)

b = tanh(this)

void tanhDerivative(BaseMatrixT &b)
void scaledTanh(BaseMatrixT &b, T p1, T p2)

b = p1 * tanh(p2 * this)

void scaledTanhDerivative(BaseMatrixT &b, T p1, T p2)
void reciprocal(BaseMatrixT &b)

b = 1.0f / this

void reciprocalDerivative(BaseMatrixT &b)
void abs(BaseMatrixT &b)

b = this > 0.0f ? this : -this

void absDerivative(BaseMatrixT &b)
void sigmoid(BaseMatrixT &b)

b = 1.0f / (1.0f + exp(-this))

void sigmoidDerivative(BaseMatrixT &b)
void expDerivative(BaseMatrixT &b)

b = a

void sign(BaseMatrixT &b)
void exp(BaseMatrixT &b)
void pow(BaseMatrixT &b, T p)
void log(BaseMatrixT &b)
void sqrt(BaseMatrixT &b)
void addScalar(BaseMatrixT &b, T p)
void subScalar(BaseMatrixT &b, T p)
void mulScalar(BaseMatrixT &b, T p)
void divScalar(BaseMatrixT &b, T p)
void scalarDiv(BaseMatrixT &b, T p)
void invSqrt(BaseMatrixT &b)

this = 1.0f / sqrt(b)

void isEqualTo(BaseMatrixT &b, T value)

this = (b == value)

void softCrossEntropy(BaseMatrixT &b, BaseMatrixT &c)

ternary operator.

void softCrossEntropyBp(BaseMatrixT &b, BaseMatrixT &c)
void binaryLabelCrossEntropy(BaseMatrixT &b, BaseMatrixT &c)
void binaryLabelCrossEntropyBp(BaseMatrixT &b, BaseMatrixT &c)
void add(BaseMatrixT &b, BaseMatrixT &c)

this = b + c

void add(BaseMatrixT &b, T p1, BaseMatrixT &c, T p2)

this = b*p1 + c*p2

void sub(BaseMatrixT &b, BaseMatrixT &c)

this = b - c

void sub(BaseMatrixT &b, T p1, BaseMatrixT &c, T p2)

this = b*p1 - c*p2

void add2(BaseMatrixT &b, BaseMatrixT &c)

this = this + b + c

void add2(BaseMatrixT &b, BaseMatrixT &c, T p1, T p2, T p3)

this = this*p1 + b*p2 + c*p3

void add3(BaseMatrixT &b, BaseMatrixT &c, BaseMatrixT &d, T p1, T p2, T p3)

this = a*p1 + b*p2 + c*p3

void sgdUpdate(BaseMatrixT &b, BaseMatrixT &c, T p1, T p2, T p3)

c = p2 * c - p1 *  (b + p3 * this)
this += mom

void sgdUpdate(BaseMatrixT &b, BaseMatrixT &c, BaseMatrixT &d, T p1, T p2, T p3)

c = p2 * c - p1 * d * (b + p3 * this)
this += mom

void applyL1(T learningRate, T decayRate)

apply L1/L2 to this

void applyL1(BaseMatrixT &lr, T learningRate, T decayRate)
void applyL2(T learningRate, T decayRate)
void applyL2(BaseMatrixT &lr, T learningRate, T decayRate)
void dotMul(BaseMatrixT &b)

this *= b

void dotMul(BaseMatrixT &b, BaseMatrixT &c)

this = b * c

void dotDiv(BaseMatrixT &b, BaseMatrixT &c)

this = b / c

void dotDiv(BaseMatrixT &b, BaseMatrixT &c, T p1, T p2)

this = (b + p1) / (c + p2)

void rankLoss(BaseMatrixT &b, BaseMatrixT &c, BaseMatrixT &d)

this = log(1 + exp(b - c)) - d * (b - c)

void rankLossBp(BaseMatrixT &b, BaseMatrixT &c, BaseMatrixT &d)
void logisticRegressionLoss(BaseMatrixT &b, BaseMatrixT &c)

this = log(1 + exp(b)) - c * b

void logisticRegressionLossBp(BaseMatrixT &b, BaseMatrixT &c)

this += exp(b)/(1+exp(b)) - c

void biggerThan(BaseMatrixT &b, BaseMatrixT &c)

this = b > c ? 1.0 : 0.0

void biggerThan(BaseMatrixT &b, BaseMatrixT &c, BaseMatrixT &d)

this = ((b>c && d>0.5) || (b<c && d<0.5)) ? 1 : 0

void max(BaseMatrixT &b, BaseMatrixT &c)

this = b>c ? b : c

void binaryClassificationError(size_t destCol, BaseMatrixT &b, BaseMatrixT &c, T p)

this[destCol] += ((b>p) == (c>p)) ? 0 : 1

void binaryClassificationError2(size_t destCol, BaseMatrixT &b, BaseMatrixT &c, T p)
void dotMulSquare(BaseMatrixT &b)

this = this * b * b

void dotSquareMul(BaseMatrixT &b)

this = this * this * b

void dotMulSquare(BaseMatrixT &b, BaseMatrixT &c)

this = b * c * c

void dotSquareSquare(BaseMatrixT &b, BaseMatrixT &c)

this = b * b * c * c

void dotMulSquareSum(BaseMatrixT &b, BaseMatrixT &c, T p1, T p2)

this = this * (p1*b + p2*c)^2

void dotSquareSum(BaseMatrixT &b, BaseMatrixT &c, T p1, T p2)

this = (p1*b + p2*c)^2

void dotMulSum(BaseMatrixT &b, BaseMatrixT &c, T p1, T p2)

this = this * (p1*b + p2*c)

void addSquareSum(BaseMatrixT &b, BaseMatrixT &c, BaseMatrixT d, T p1, T p2, T p3)

this += sqr(p1*b + p2*c + p3*d)

void addSquare(BaseMatrixT &b, T p)

this += p * sqr(b)

void decayAddSquare(BaseMatrixT &b, T p1, T p2)

this = p1 * this + p2 * sqr(b)

void decayAddSquareMul(BaseMatrixT &b, BaseMatrixT &c, T p1, T p2)

this = p1 * this + p2 * sqr(b * c)

void reciprocal(BaseMatrixT &b, T p1, T p2)

this = 1 / (p1 * b + p2)

void reciprocalSum(BaseMatrixT &b, BaseMatrixT &c, T p1, T p2, T p3)

this = 1 / (p1 * b + p2 * c + p3)

void copyAndClear(BaseMatrixT &b)

b = this; this = 0

void rowDotMul(size_t destCol, BaseMatrixT &b, BaseMatrixT &c)

this_row[destCol] += dotprod(b_row, c_row)

void rowDotMul2(size_t destCol, BaseMatrixT &b, BaseMatrixT &c)
void addDotMulVMM(BaseMatrixT &b, BaseMatrixT &c)

this is vector (one row matrix)

for each row i, do:
   this_row += dotmul(b_row_i, c_row_i)

void addDotMulVMM2(BaseMatrixT &b, BaseMatrixT &c)
void addDotMulMMV(BaseMatrixT &b, BaseMatrixT &c)

c is vector (one row matrix)

for each row i, do:
   this_row_i += dotmul(b_row_i, c_row)

void addDotMulMMV2(BaseMatrixT &b, BaseMatrixT &c)
void addDotMul(BaseMatrixT &b, BaseMatrixT &c, T p1, T p2)

this = p1 * this + p2 * b * c

void rowScale(size_t cCol, BaseMatrixT &b, BaseMatrixT &c)

this_row = b_row * c_row[cCol]

void rowScale2(size_t cCol, BaseMatrixT &b, BaseMatrixT &c)
void colScale(size_t cRow, BaseMatrixT &b, BaseMatrixT &c)

this_col = b_col * c_col[cRow]

void addColScale(size_t cRow, BaseMatrixT &b, BaseMatrixT &c)

this_col += b_col * c_col[cRow]

void addRowScale(size_t cCol, BaseMatrixT &b, BaseMatrixT &c)

this_row += b_row * c_row[cCol]

void sumRows(BaseMatrixT &b)

calculate the sum of each row of the matrix b.

void maxRows(BaseMatrixT &b)

calculate the maximum value of each row of the matrix b.

void minRows(BaseMatrixT &b)

calculate the minimum value of each row of the matrix b.

void sumCols(BaseMatrixT &b)

calculate the sum of each column of the matrix b.

void maxCols(BaseMatrixT &b)

calculate the maximum value of each column of the matrix b.

void minCols(BaseMatrixT &b)

calculate the minimum value of each column of the matrix b.

void sumCols(BaseMatrixT &b, T scale)
void sumOfSquares(BaseMatrixT &b, BaseMatrixT &c)

calculate the sum of each row of (b - c)^2.

void rowAdd(size_t cCol, BaseMatrixT &b, BaseMatrixT &c, T p)

this_row = b_row + p * ones * c_row[cCol]

void rowPow(size_t cCol, BaseMatrixT &b, BaseMatrixT &c)

this_row = pow(b_row, c_row[cCol])

virtual bool isSparse() const

Public Members

size_t height_
size_t width_
size_t stride_
T *data_
bool trans_
bool useGpu_

Sparse Matrix

namespace paddle

Typedefs

typedef std::shared_ptr<Matrix> MatrixPtr
typedef std::shared_ptr<GpuMatrix> GpuMatrixPtr
typedef std::shared_ptr<CpuMatrix> CpuMatrixPtr
typedef std::shared_ptr<GpuSparseMatrix> GpuSparseMatrixPtr
typedef std::shared_ptr<CpuSparseMatrix> CpuSparseMatrixPtr

Enums

enum SparseValueType

Values:

NO_VALUE = 0
FLOAT_VALUE = 1
enum SparseFormat

Matrix sparse format.

nnz represents nonzero number in sparse matrix.

SPARSE_CSR: row-major matrix. The length of row is height_ + 1; each element is the start index of a row in the matrix. The lengths of col and value are nnz.

SPARSE_CSC: column-major matrix. The length of col is width_ + 1; each element is the start index of a column in the matrix. The lengths of row and value are nnz.

for example: [0, 1, 0, 2, 0;
              1, 0, 0, 0, 0;
              0, 0, 0, 2, 5];
SPARSE_CSR row   [0, 2, 3, 5];
           col   [1, 3, 0, 3, 4];
           value [1, 2, 1, 2, 5]
SPARSE_CSC col   [0, 1, 2, 2, 4, 5];
           row   [1, 0, 0, 2, 2];
           value [1, 1, 2, 2, 5]

Values:

SPARSE_CSR = 0
SPARSE_CSC = 1

Functions

std::ostream &operator<<(std::ostream &os, const Matrix &mat)
class Matrix
#include <Matrix.h>

The copy constructor and assignment operator share the data rather than making a copy of the original data. To make a copy of the original data, use copyFrom() instead.

Inherits from paddle::BaseMatrixT< real >

Subclassed by paddle::CpuMatrix, paddle::CpuSparseMatrix, paddle::GpuMatrix, paddle::GpuSparseMatrix

Public Functions

virtual ~Matrix()
void setData(real *data)

set the data buffer used to hold the matrix data.

caller should make sure that the size of data is at least sizeof(real)*height*width.

void setData(real *data, size_t newHeight, size_t newWidth)

the data should be contiguous

size_t getWidth() const
size_t getHeight() const
size_t getStride() const
size_t getElementCnt() const
virtual real *getData()
virtual const real *getData() const
bool isTransposed() const
bool isContiguous() const
virtual int *getRows() const
virtual int *getCols() const
virtual SparseFormat getFormat() const
virtual SparseValueType getValueType() const
virtual void add3(MatrixPtr b)

Matrix element-wise add.

Named add3 just because add/add2 has been used in BaseMatrix.cu and they are not virtual function.

MemoryHandlePtr getMemoryHandle() const
virtual void zeroMem()
virtual void resetOne()
virtual void copyFrom(const Matrix &src)
virtual void trimFrom(const CpuSparseMatrix &src)
virtual void copyFrom(const Matrix &src, hl_stream_t stream)
MatrixPtr subMatrix(size_t startRow, size_t endRow, size_t startCol, size_t endCol)
MatrixPtr subRowMatrix(size_t startRow, size_t endRow)
MatrixPtr subColMatrix(size_t startCol, size_t endCol)
virtual MatrixPtr subMatrix(size_t startRow, size_t numRows)
virtual MatrixPtr subMatrix(size_t startRow, size_t numRows, MatrixPtr dest)
virtual void copyFrom(const real *src, size_t size)

If this is GpuMatrix, src is assumed to be CPU memory

If this is CpuMatrix, src is assumed to be CPU memory

virtual void copyFrom(const real *src, const int64_t *seq)
virtual void copyFrom(const IVector &src)

Convert an int vector to a real matrix.

(1) The source and destination must both be in CPU memory.

(2) The sizes must match exactly.

virtual void copyByRowIndex(Matrix &b, IVector &rowIndex)
virtual MatrixPtr clone(size_t height = 0, size_t width = 0, bool useGpu = false)

Create a matrix with the same type (GpuMatrix, CpuMatrix, NonValueSparseMatrix, etc.) as this.

If height and width are zero, the new matrix will have the same size as this; otherwise the new matrix will have the specified size.

virtual real *getRowBuf(size_t row)
virtual real getElement(size_t x, size_t y) const
virtual real getSum()
virtual void accumulateColSum(Matrix &src)
virtual real getAbsSum()
virtual void resize(size_t newHeight, size_t newWidth) = 0

Note
Original data may not be preserved after resize().

virtual void resize(size_t newHeight, size_t newWidth, size_t newNnz, SparseValueType valueType, SparseFormat format) = 0

Note
This should only be used for sparse matrix.

virtual void setRow(size_t row, size_t colNum, const unsigned int *cols, const real *values) = 0

This should only be used for sparse matrix.

Currently must be called for each row in order. The matrix is not valid until setRow is called for the last row.

virtual MatrixPtr getTranspose() = 0
virtual void transpose(MatrixPtr matTrans, bool memAlloc)

hard transpose.

If the memory of matTrans is allocated outside this call, set memAlloc to false; otherwise set it to true.

virtual void clear()

Only set all variables to 0 or NULL but not free them.

void reshape(size_t height, size_t width)
virtual void addBias(Matrix &b, real scale)

add b to each sample of this.

virtual void collectBias(Matrix &a, real scale)

add each sample from a to this.

virtual void sequenceAvgForward(Matrix &a, const IVector &startsPos, int mode)
virtual void mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, real scaleT)

this = scaleAB*(a*b) + scaleT*this

virtual void addColumnVector(const Matrix &b)

Add a vector (column) b to matrix a, column by column.

virtual void addByBitCode(size_t numClasses, const IVector &codes, const Matrix &vec)

For j < codeLength:
  this(i, j) += vec(index(i, j), 0)
where index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1

virtual void addByBitCodeBackward(size_t numClasses, const IVector &codes, Matrix &vec)

For j < codeLength:
  vec(index(i, j), 0) += this(i, j)
where index is same as the index for addByBitCode

virtual void mulByBitCode(size_t numClasses, const IVector &codes, const Matrix &mat, const Matrix &input)

For j < codeLength:
  this(i, j) += <mat.row(index(i, j)), input.row(i)>
where index is same as the index for addByBitCode

virtual void mulByBitCodeBackwardWeight(size_t numClasses, const IVector &codes, Matrix &mat, const Matrix &input)

For j < codeLength:
  mat.row(index(i, j)) += this(i, j) * input.row(i)
where index is same as the index for addByBitCode

virtual void mulByBitCodeBackwardError(size_t numClasses, const IVector &codes, const Matrix &mat, Matrix &input)

For j < codeLength:
  input.row(i) += this(i, j) * mat.row(index(i, j))
where index is same as the index for addByBitCode

virtual void sumByBitCode(size_t numClasses, IVector &codes, Matrix &sum, real scaleSum)

For j < codeLength
  sum(i, 0) = scaleSum * \sum_j  bit(i, j) * this(i, j)
where bit(i, j) = ((codes(i) + numClasses) & 2^j) ? 1 : 0

virtual void subByBitCode(size_t numClasses_, IVector &codes)

For j < codeLength
 this(i, j) -= bit(i, j)
where bit(i, j) is same as that for sumByBitCode

virtual void rowSum(Matrix &sum)

Add the sum of each row of this to sum.

virtual void rowMax(Matrix &max)

Set the max of each row of this to max.

virtual void colMax(Matrix &max)
virtual void rowMaxId(IVector &maxIds)
virtual void rowMax(IVector &maxIds, Matrix &max)

Get the top k elements of each row of this matrix.

The column ids and values of these elements are stored in maxIds and max respectively. Note that the top k elements are not sorted.

virtual void rowNormalizeL1(Matrix &out)

normalize each row so that the sum of each row is 1.

virtual void mul(const MatrixPtr a, const MatrixPtr b)

this = a*b

virtual void rightMul(Matrix &b, real scaleAB, real scaleT)

this = scaleAB*(this*b) +  scaleT*this

virtual void rightMul(Matrix &b)

this = this*b

virtual void leftMul(Matrix &a, real scaleAB, real scaleT)

this = scaleAB*(a*this) +  scaleT*this

virtual void leftMul(Matrix &a)

this = a*this

virtual void colMerge(Matrix &src)

merge the element for each col.

virtual void oneHotCrossEntropy(Matrix &output, IVector &label)

copy -log(output[label]) to this->data[i].

virtual void oneHotCrossEntropyBp(Matrix &outputV, IVector &label)

calculate the error of outputV according to label.

virtual void oneHotCrossEntropyWithSelfNorm(Matrix &output, IVector &label, real alpha)

copy -log(output[label]) to this->data[i].

virtual void oneHotCrossEntropyWithSelfNormBp(Matrix &outputV, IVector &label, real alpha)

calculate the error of outputV according to label.

virtual void circularConv(Matrix &b, Matrix &c)

\[ a[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} b_{i+j} * c_{j} \]

b contains M elements, c contains N elements (N is odd), b’s index arithmetic is computed modulo M, c’s index arithmetic is computed modulo N.

virtual void circularConvDerivative(Matrix &output, Matrix &prevOut1, Matrix &prevOut2, Matrix &prevGrad1, Matrix &prevGrad2)
virtual void softmax(Matrix &output)
virtual void sequenceSoftmax(Matrix &output, const IVector &index)
virtual void softmaxBackward(Matrix &outputV)
virtual void softmaxDerivative(Matrix &output, Matrix &sftmaxSum)
virtual void sumOfSquares(Matrix &output, Matrix &label)

calculate the sum of squares diff cost.

virtual void sumOfSquaresBp(Matrix &outputV, Matrix &label)

gradient of sumOfSquares.

virtual void tanh(Matrix &output)
virtual void tanhDerivative(Matrix &output)
virtual void softrelu(Matrix &output)
virtual void softreluDerivative(Matrix &output)
virtual void scaledTanh(Matrix &output, real p1, real p2)
virtual void cosSim(Matrix &output1, Matrix &output2, real scale = 1.0f)

cosine similarity, for each row i, this[i] = cos(output1[i], output2[i])

output2 can only have one row, then for each row i, this[i] = cos(output1[i], output2[0])

virtual void cosSimDerivative(Matrix &output, Matrix &prevOut1, Matrix &prevOut2, Matrix &prevGrad1, Matrix &prevGrad2, real scale = 1.0f)
virtual void print(std::ostream &os) const

print out the values of elements to os

virtual void print(std::ostream &os, size_t height, size_t width) const

print a part of the matrix from the (top,left) value to the (height, width) value (not included)

virtual void printOneRow(std::ostream &os, size_t idx) const

print one row to os

virtual void check(std::ostream &os, Matrix &refMat, bool printDiff = true)
virtual real getMin()
virtual real getMax()
virtual void randomizeUniform()
virtual void classificationError(MatrixPtr output, IVectorPtr label)

Calculate the classification error.

output[i] = 1 if row i is an error.

output[i] = 0 if row i is correct.

virtual void convExpand(Matrix &feature, int feaImgHeight, int feaImgWidth, int channels, int blockH, int blockW, int strideH, int strideW, int paddingH, int paddingW, int outputH, int outputW)

This function is used to calculate the convolution:

It will expand a feature matrix according to the convolution filters

virtual void convShrink(Matrix &expandColMat, int thisImgHeight, int thisImgWidth, int channels, int blockH, int blockW, int strideH, int strideW, int paddingH, int paddingW, int outputH, int outputW, real alpha = 1.0f, real beta = 0.0f)

This function is the reverse implementation of convExpand:

Its function is to restore an expanded matrix into a feature matrix.

virtual void maxPoolForward(Matrix &inputMat, size_t imgSizeH, size_t imgSizeW, size_t channels, size_t sizeX, int start_, size_t stride, size_t outputH, size_t outputW)

Pooling forward operation: pick out the largest element among sizeX elements.

virtual void maxPoolBackward(Matrix &image, size_t imgSizeH, size_t imgSizeW, Matrix &outGrad, Matrix &outV, size_t sizeX, int start, size_t stride, size_t outputH, size_t outputW, real scaleTargets, real scaleOutput)

Pooling backward operation.

virtual void avgPoolForward(Matrix &input, size_t imgSizeH, size_t imgSizeW, size_t channels, size_t sizeX, int start, size_t stride, size_t outputH, size_t outputW)

Pooling forward operation: calculate the average of sizeX elements.

virtual void avgPoolBackward(Matrix &input, size_t imgSizeH, size_t imgSizeW, size_t sizeX, int start, size_t stride, size_t outputH, size_t outputW, real scaleTargets, real scaleOutput)
virtual void crossMapNormalFwd(Matrix &input, size_t imgSizeH, size_t imgSizeW, Matrix &denoms, size_t channels, size_t sizeX, float scale, float pow)

normalize-operation.

virtual void crossMapNormalBwd(Matrix &localGrad, Matrix &denoms, Matrix &preOutV, Matrix &localOutV, size_t channels, size_t imgSizeH, size_t imgSizeW, size_t size, float scale, float pow)
virtual void maxSequenceForward(Matrix &input, const IVector &sequence, IVector &index)

Input: one or more sequences. Each sequence contains some instances.

Output: output size is the number of input sequences (NOT input instances).

output[i] is set to max_input[i].

virtual void maxSequenceBackward(Matrix &outputGrad, const IVector &sequence, IVector &index)
virtual void contextProjectionForward(MatrixPtr input, MatrixPtr weight, const IVector &sequence, int contextLength, int contextStart, size_t beginPad, bool isPadding)
virtual void contextProjectionBackward(MatrixPtr inputGrad, MatrixPtr weightGrad, const IVector &sequence, int contextLength, int contextStart, size_t beginPad, bool isPadding)
virtual void contextProjectionBackwardData(MatrixPtr inputGrad, const IVector &sequence, int contextLength, int contextStart)
virtual void contextProjectionBackwardWeight(MatrixPtr weightGrad, const IVector &sequence, int contextLength, int contextStart, int totalPad, size_t beginPad)
virtual void selectRows(Matrix &table, IVector &ids)

this.row[i] += table.row[ids[i]]
if ids[i] == -1, it will be ignored

virtual void selectElements(Matrix &table, IVector &ids)

this[i] = table[i, ids[i]]

virtual void addToRows(Matrix &table, IVector &ids)

table.row[ids[i]] += this.row[i]
if ids[i] == -1, it will be ignored

virtual void addElements(Matrix &table, IVector &ids)

table[i, ids[i]] += this[i]

virtual void multiBinaryLabelCrossEntropy(Matrix &output, Matrix &label)

cross entropy for multi binary labels

this[i] = -sum(label[i][j]*log(output[i][j])
          + (1-label[i][j])*log(1-output[i][j]))

virtual void multiBinaryLabelCrossEntropyBp(Matrix &output, Matrix &label)

The gradient of cross entropy for multi binary labels on output.

this[i][j] = -label[i][j]/output[i][j]
             + (1-label[i][j])/(1-output[i][j])

virtual void classificationErrorMulti(Matrix &output, Matrix &label, real threshold)

Calculate the classification error for multi binary labels.

this[i] = sum((output[i][j] >= threshold && label[i][j] == 0)
           || (output[i][j] < threshold && label[i][j] == 1))
           / output->getWidth()

virtual void paramReluForward(Matrix &data, Matrix &W)
virtual void paramReluBackwardW(Matrix &oGrad, Matrix &data)
virtual void paramReluBackwardDiff(Matrix &oGrad, Matrix &data, Matrix &W)

Public Members

size_t elementCnt_
MemoryHandlePtr memoryHandle_

Public Static Functions

MatrixPtr create(MemoryHandlePtr memHandle, size_t height, size_t width, bool trans = false)
MatrixPtr create(size_t height, size_t width, bool trans = false, bool useGpu = false)
MatrixPtr create(real *data, size_t height, size_t width, bool trans = false, bool useGpu = false)
MatrixPtr create(real *data, size_t height, size_t width, size_t stride, bool trans = false, bool useGpu = false)
MatrixPtr createSparseMatrix(size_t height, size_t width, size_t nnz, SparseValueType valueType = FLOAT_VALUE, bool trans = false, bool useGpu = false)
MatrixPtr createSparseMatrix(size_t height, size_t width, size_t nnz, SparseValueType valueType = FLOAT_VALUE, SparseFormat foramt = SPARSE_CSR, bool trans = false, bool useGpu = false)
MatrixPtr createSparseMatrix(real *data, int *row, int *col, size_t height, size_t width, size_t nnz, SparseValueType valueType, SparseFormat format, bool trans, bool useGpu)
void resizeOrCreateSparseMatrix(MatrixPtr &matrix, size_t height, size_t width, size_t nnz, SparseValueType valueType = FLOAT_VALUE, SparseFormat foramt = SPARSE_CSR, bool trans = false, bool useGpu = false)
void resizeOrCreate(MatrixPtr &a, size_t height, size_t width, bool trans = false, bool useGpu = false)

Protected Functions

Matrix(MemoryHandlePtr memHandle, size_t height, size_t width, bool trans, bool use_gpu)
Matrix(real *data, size_t height, size_t width, bool trans, bool use_gpu)
Matrix(real *data, size_t height, size_t width, size_t stride, bool trans, bool use_gpu)

Protected Static Attributes

ThreadLocal<MatrixPtr> tmpMat_
class GpuMatrix

Inherits from paddle::Matrix

Public Functions

GpuMatrix()
GpuMatrix(size_t height, size_t width, bool trans = false)
GpuMatrix(real *data, size_t height, size_t width, bool trans = false)
GpuMatrix(real *data, size_t height, size_t width, size_t stride, bool trans = false)
GpuMatrix(GpuMemHandlePtr dataHandle, size_t height, size_t width, bool trans = false)
~GpuMatrix()
virtual void zeroMem()
virtual void resetOne()
virtual void resize(size_t newHeight, size_t newWidth)

Note
Original data may not be preserved after resize().

virtual void resize(size_t newHeight, size_t newWidth, size_t newNnz, SparseValueType valueType, SparseFormat format)

Note
This should only be used for sparse matrix.

virtual void setRow(size_t row, size_t colNum, const unsigned int *cols, const real *values)

This should only be used for sparse matrix.

Currently must be called for each row in order. The matrix is not valid until setRow is called for the last row.

virtual void copyFrom(const real *hostSrc, size_t size)

Copy the data from cpu_memory buffer

virtual void copyFrom(const real *hostSrc, const int64_t *seq)
virtual void copyFrom(const Matrix &src, hl_stream_t stream)
virtual void copyFrom(const Matrix &src)
virtual void copyFrom(const IVector &src)

convert an int vector to a real matrix.

(1) source and dest are both in CPU.

(2) sizes match exactly.

virtual void copyByRowIndex(Matrix &b, IVector &rowIndex)
virtual MatrixPtr clone(size_t height, size_t width, bool useGpu = false)

Create a matrix with the same type (GpuMatrix, CpuMatrix, NonValueSparseMatrix, etc.) as this.

If height and width are zero, the new matrix will have the same size as this; otherwise the new matrix will have the specified size.

virtual real getElement(size_t x, size_t y) const
real *getRow(size_t row)
virtual real *getRowBuf(size_t row)
virtual real getSum()
virtual void accumulateColSum(Matrix &src)
virtual real getAbsSum()
virtual MatrixPtr getTranspose()
virtual void transpose(MatrixPtr matTrans, bool memAlloc)

hard transpose.

If matTrans’ memory is allocated outside this call, set memAlloc to false; otherwise set it to true.

virtual void addBias(Matrix &b, real scale)

add b to each sample of this.

virtual void collectBias(Matrix &a, real scale)

add each sample from a to this.

virtual void sequenceAvgForward(Matrix &a, const IVector &startsPos, int mode)
virtual void selectRows(Matrix &table, IVector &ids)

this.row[i] += table.row[ids[i]]

virtual void selectElements(Matrix &table, IVector &ids)

this[i] = table[i, ids[i]]

virtual void addToRows(Matrix &table, IVector &ids)

table.row[ids[i]] += this.row[i]

virtual void addColumnVector(const Matrix &b)

Add a vector (column) b to matrix a, column by column.

virtual void mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, real scaleT)

this = scaleAB*(a*b) + scaleT*this

virtual void mul(const MatrixPtr a, const MatrixPtr b)

this = a*b

void mul(const GpuMatrix &a, const GpuMatrix &b, real scaleAB, real scaleT)
void mul(const GpuSparseMatrix &a, const GpuMatrix &b, real scaleAB, real scaleT)
void mul(const GpuMatrix &a, const GpuSparseMatrix &b, real scaleAB, real scaleT)
virtual void rightMul(Matrix &b, real scaleAB, real scaleT)

this = scaleAB*(this*b) +  scaleT*this

virtual void rightMul(Matrix &b)

this = this* b

virtual void leftMul(Matrix &a, real scaleAB, real scaleT)

this = scaleAB*(a*this) +  scaleT*this

virtual void leftMul(Matrix &a)

this = a*this

virtual void colMerge(Matrix &src)

merge the element for each col.

virtual void rowSum(Matrix &sum)

add the sum of each row of this to sum

virtual void rowMax(Matrix &max)

set max to the max of each row of this

virtual void rowMax(IVector &maxIds, Matrix &max)

Get the top k elements of each row of this matrix.

The column ids and values of these elements are stored in maxIds and max respectively. Note that the top k elements are not sorted.

virtual void colMax(Matrix &max)
virtual void oneHotCrossEntropy(Matrix &output, IVector &label)

copy -log(output[label]) to this->data[i].

virtual void oneHotCrossEntropyBp(Matrix &outputV, IVector &label)

calculate the error of outputV according to label.

virtual void oneHotCrossEntropyWithSelfNorm(Matrix &output, IVector &label, real alpha)

copy -log(output[label]) to this->data[i].

virtual void oneHotCrossEntropyWithSelfNormBp(Matrix &outputV, IVector &label, real alpha)

calculate the error of outputV according to label.

virtual void softmax(Matrix &output)
virtual void sequenceSoftmax(Matrix &output, const IVector &index)
virtual void softmaxBackward(Matrix &outputV)
virtual void softmaxDerivative(Matrix &output, Matrix &sftmaxSum)
virtual void sumOfSquares(Matrix &output, Matrix &label)

calculate the sum of squares diff cost.

virtual void sumOfSquaresBp(Matrix &outputV, Matrix &label)

gradient of sumOfSquares.

virtual void tanh(Matrix &output)
virtual void tanhDerivative(Matrix &output)
virtual void softrelu(Matrix &output)
virtual void softreluDerivative(Matrix &output)
virtual void scaledTanh(Matrix &output, real p1, real p2)
virtual void cosSim(Matrix &output1, Matrix &output2, real scale)

cosine similarity, for each row i, this[i] = cos(output1[i], output2[i])

output2 can only have one row, then for each row i, this[i] = cos(output1[i], output2[0])

virtual void cosSimDerivative(Matrix &output, Matrix &prevOut1, Matrix &prevOut2, Matrix &prevGrad1, Matrix &prevGrad2, real scale)
virtual void print(std::ostream &os) const

print out the values of elements to os

virtual void print(std::ostream &os, size_t height, size_t width) const

print a part of the matrix from the (top,left) value to the (height, width) value (not included)

virtual void paramReluForward(Matrix &data, Matrix &W)
virtual void paramReluBackwardW(Matrix &oGrad, Matrix &data)
virtual void paramReluBackwardDiff(Matrix &oGrad, Matrix &data, Matrix &W)
virtual void check(std::ostream &os, Matrix &refMat, bool printDiff = true)
virtual void randomizeUniform()
virtual void classificationError(MatrixPtr output, IVectorPtr label)

calculate the error of classification

output[i] = 1 if row i is an error.

output[i] = 0 if row i is correct.

virtual void convExpand(Matrix &feature, int feaImgHeight, int feaImgWidth, int channels, int blockH, int blockW, int strideH, int strideW, int paddingH, int paddingW, int outputH, int outputW)

This function is used to calculate the convolution:

It will expand a feature matrix according to the convolution filters

virtual void convShrink(Matrix &expandColMat, int thisImgHeight, int thisImgWidth, int channels, int blockH, int blockW, int strideH, int strideW, int paddingH, int paddingW, int outputH, int outputW, real alpha = 1.0f, real beta = 0.0f)

This function is the reverse implementation of convExpand:

Its function is to restore an expanded matrix into a feature matrix

virtual void maxPoolForward(Matrix &inputMat, size_t imgSizeH, size_t imgSizeW, size_t channels, size_t sizeX, int start_, size_t stride, size_t outputH, size_t outputW)

Pooling forward operation: pick out the largest value among the sizeX elements.

virtual void maxPoolBackward(Matrix &image, size_t imgSizeH, size_t imgSizeW, Matrix &outGrad, Matrix &outV, size_t sizeX, int start, size_t stride, size_t outputH, size_t outputW, real scaleTargets, real scaleOutput)

Pooling backward operation.

virtual void avgPoolForward(Matrix &input, size_t imgSizeH, size_t imgSizeW, size_t channels, size_t sizeX, int start, size_t stride, size_t outputH, size_t outputW)

Pooling forward operation, calculate the average of sizeX elements.

virtual void avgPoolBackward(Matrix &input, size_t imgSizeH, size_t imgSizeW, size_t sizeX, int start, size_t stride, size_t outputH, size_t outputW, real scaleTargets, real scaleOutput)
virtual void crossMapNormalFwd(Matrix &input, size_t imgSizeH, size_t imgSizeW, Matrix &denoms, size_t channels, size_t sizeX, float scale, float pow)

normalize-operation.

virtual void crossMapNormalBwd(Matrix &localGrad, Matrix &denoms, Matrix &preOutV, Matrix &localOutV, size_t channels, size_t imgSizeH, size_t imgSizeW, size_t sizeX, float scale, float pow)
virtual void maxSequenceForward(Matrix &input, const IVector &sequence, IVector &index)

Input: one or more sequences. Each sequence contains some instances.

Output: output size is the number of input sequences (NOT input instances).

output[i] is set to max_{for each instance in this sequence}{input[i]}.

virtual void maxSequenceBackward(Matrix &outputGrad, const IVector &sequence, IVector &index)
virtual void contextProjectionForward(MatrixPtr input, MatrixPtr weight, const IVector &sequence, int contextLength, int contextStart, size_t beginPad, bool isPadding)
virtual void contextProjectionBackwardData(MatrixPtr inputGrad, const IVector &sequence, int contextLength, int contextStart)
virtual void contextProjectionBackwardWeight(MatrixPtr weightGrad, const IVector &sequence, int contextLength, int contextStart, int totalPad, size_t beginPad)
class CpuMatrix

Inherits from paddle::Matrix

Subclassed by paddle::SharedCpuMatrix, paddle::SparseRowCpuMatrix, paddle::SparseRowIdsCpuMatrix

Public Functions

CpuMatrix(size_t height, size_t width, bool trans = false)

CpuMatrix

CpuMatrix(real *data, size_t height, size_t width, bool trans = false)
CpuMatrix(real *data, size_t height, size_t width, size_t stride, bool trans = false)
CpuMatrix(CpuMemHandlePtr dataHandle, size_t height, size_t width, bool trans = false)
~CpuMatrix()
virtual void zeroMem()
virtual void resetOne()
virtual void resize(size_t newHeight, size_t newWidth)

Note
Original data may not be preserved after resize().

virtual void resize(size_t newHeight, size_t newWidth, size_t newNnz, SparseValueType valueType, SparseFormat format)

Note
This should only be used for sparse matrix.

virtual void setRow(size_t row, size_t colNum, const unsigned int *cols, const real *values)

This should only be used for sparse matrix.

Currently must be called for each row in order. The matrix is not valid until setRow is called for the last row.

virtual real getElement(size_t x, size_t y) const
virtual real getSum()
virtual void accumulateColSum(Matrix &src)
virtual real getAbsSum()
virtual MatrixPtr getTranspose()
virtual void transpose(MatrixPtr matTrans, bool memAlloc)

hard transpose.

If matTrans’ memory is allocated outside this call, set memAlloc to false; otherwise set it to true.

virtual void copyFrom(const Matrix &src)
virtual void copyFrom(const Matrix &src, hl_stream_t stream)
virtual void copyFrom(const real *src, size_t size)

If this is GpuMatrix, src is assumed to be CPU memory

If this is CpuMatrix, src is assumed to be CPU memory

virtual void copyFrom(const real *cpuSrc, const int64_t *seq)
virtual void copyFrom(const IVector &src)

convert an int vector to a real matrix.

(1) source and dest are both in CPU.

(2) sizes match exactly.

void copyFrom(CpuSparseMatrix &src)
virtual void copyByRowIndex(Matrix &b, IVector &rowIndex)
virtual MatrixPtr clone(size_t height, size_t width, bool useGpu = false)

Create a matrix with the same type (GpuMatrix, CpuMatrix, NonValueSparseMatrix, etc.) as this.

If height and width are zero, the new matrix will have the same size as this; otherwise the new matrix will have the specified size.

virtual void convExpand(Matrix &feature, int feaImgHeight, int feaImgWidth, int channels, int blockH, int blockW, int strideH, int strideW, int paddingH, int paddingW, int outputH, int outputW)

This function is used to calculate the convolution:

It will expand a feature matrix according to the convolution filters

virtual void convShrink(Matrix &expandColMat, int thisImgHeight, int thisImgWidth, int channels, int blockH, int blockW, int strideH, int strideW, int paddingH, int paddingW, int outputH, int outputW, real alpha = 1.0f, real beta = 0.0f)

This function is the reverse implementation of convExpand:

Its function is to restore an expanded matrix into a feature matrix

virtual void maxPoolForward(Matrix &inputMat, size_t imgSizeH, size_t imgSizeW, size_t channels, size_t sizeX, int start_, size_t stride, size_t outputH, size_t outputW)

Pooling forward operation: pick out the largest value among the sizeX elements.

virtual void maxPoolBackward(Matrix &image, size_t imgSizeH, size_t imgSizeW, Matrix &outGrad, Matrix &outV, size_t sizeX, int start, size_t stride, size_t outputH, size_t outputW, real scaleTargets, real scaleOutput)

Pooling backward operation.

virtual void avgPoolForward(Matrix &input, size_t imgSizeH, size_t imgSizeW, size_t channels, size_t sizeX, int start, size_t stride, size_t outputH, size_t outputW)

Pooling forward operation, calculate the average of sizeX elements.

virtual void avgPoolBackward(Matrix &input, size_t imgSizeH, size_t imgSizeW, size_t sizeX, int start, size_t stride, size_t outputH, size_t outputW, real scaleTargets, real scaleOutput)
virtual void crossMapNormalFwd(Matrix &input, size_t imgSizeH, size_t imgSizeW, Matrix &denoms, size_t channels, size_t sizeX, float scale, float pow)

normalize-operation.

virtual void crossMapNormalBwd(Matrix &localGrad, Matrix &denoms, Matrix &preOutV, Matrix &localOutV, size_t channels, size_t imgSizeH, size_t imgSizeW, size_t sizeX, float scale, float pow)
virtual void maxSequenceForward(Matrix &input, const IVector &sequence, IVector &index)

Input: one or more sequences. Each sequence contains some instances. Output: output size is the number of input sequences (NOT input instances). output[i] is set to max_{for each instance in this sequence}{input[i]}

virtual void maxSequenceBackward(Matrix &outputGrad, const IVector &sequence, IVector &index)
virtual void contextProjectionForward(MatrixPtr input, MatrixPtr weight, const IVector &sequence, int contextLength, int contextStart, size_t beginPad, bool isPadding)
virtual void contextProjectionBackward(MatrixPtr inputGrad, MatrixPtr weightGrad, const IVector &sequence, int contextLength, int contextStart, size_t beginPad, bool isPadding)
real *getRow(size_t row)
virtual real *getRowBuf(size_t row)
virtual void addBias(Matrix &b, real scale)

add b to each sample of this.

virtual void collectBias(Matrix &a, real scale)

add each sample of a to this.

virtual void sequenceAvgForward(Matrix &a, const IVector &startsPos, int mode)
virtual void selectRows(Matrix &table, IVector &ids)

this.row[i] += table.row[ids[i]]

virtual void addToRows(Matrix &table, IVector &ids)

table.row[ids[i]] += this.row[i]

virtual void selectElements(Matrix &table, IVector &ids)

this[i] = table[i, ids[i]]

virtual void addElements(Matrix &table, IVector &ids)

table[i, ids[i]] += this[i]

template <typename TableMatType>
void selectRowsImp(TableMatType &table, IVector &ids)

use abstract getRow() to get row from table.

The table is a template parameter rather than a virtual class for the sake of performance. Used internally by the above two virtual functions.

template <typename TableMatType>
void addToRowsImp(TableMatType &table, IVector &ids)
virtual void addColumnVector(const Matrix &b)

Add a vector (column) b to matrix a, column by column.

virtual void mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, real scaleT)

this = scaleAB*(a*b) + scaleT*this

void mul(CpuMatrix *a, CpuMatrix *b, real scaleAB, real scaleT)
void mul(CpuMatrix *a, CpuSparseMatrix *b, real scaleAB, real scaleT)
virtual void mul(CpuSparseMatrix *a, CpuMatrix *b, real scaleAB, real scaleT)
virtual void mul(const MatrixPtr a, const MatrixPtr b)

this = a*b

virtual void rightMul(Matrix &b, real scaleAB, real scaleT)

this = scaleAB*(this*b) +  scaleT*this

virtual void rightMul(Matrix &b)

this = this* b

virtual void leftMul(Matrix &a, real scaleAB, real scaleT)

this = scaleAB*(a*this) +  scaleT*this

virtual void leftMul(Matrix &a)

this = a*this

virtual void colMerge(Matrix &src)

merge the element for each col.

virtual void rowSum(Matrix &sum)

add the sum of each row of this to sum

virtual void rowMaxId(IVector &maxIds)
virtual void rowMax(Matrix &max)

set max to the max of each row of this

virtual void rowMax(IVector &maxIds, Matrix &max)

Get the top k elements of each row of this matrix.

The column ids and values of these elements are stored in maxIds and max respectively. Note that the top k elements are not sorted.

virtual void colMax(Matrix &max)
virtual void rowNormalizeL1(Matrix &out)

normalize each row so that the sum of each row is 1.

virtual void oneHotCrossEntropy(Matrix &output, IVector &label)

copy -log(output[label]) to this->data[i].

virtual void oneHotCrossEntropyBp(Matrix &outputV, IVector &label)

calculate the error of outputV according to label.

virtual void oneHotCrossEntropyWithSelfNorm(Matrix &output, IVector &label, real alpha)

copy -log(output[label]) to this->data[i].

virtual void oneHotCrossEntropyWithSelfNormBp(Matrix &outputV, IVector &label, real alpha)

calculate the error of outputV according to label.

virtual void circularConv(Matrix &b, Matrix &c)

\[ a[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} b_{i+j} * c_{j} \]

b contains M elements, c contains N elements (N is odd), b’s index arithmetic is computed modulo M, c’s index arithmetic is computed modulo N.

virtual void circularConvDerivative(Matrix &output, Matrix &prevOut1, Matrix &prevOut2, Matrix &prevGrad1, Matrix &prevGrad2)
virtual void softmax(Matrix &output)
virtual void sequenceSoftmax(Matrix &output, const IVector &index)
virtual void softmaxDerivative(Matrix &output, Matrix &sftmaxSum)
virtual void sumOfSquares(Matrix &output, Matrix &label)

calculate the sum of squares diff cost.

virtual void sumOfSquaresBp(Matrix &outputV, Matrix &label)

gradient of sumOfSquares.

virtual void tanh(Matrix &output)
virtual void tanhDerivative(Matrix &output)
virtual void softrelu(Matrix &output)
virtual void softreluDerivative(Matrix &output)
virtual void scaledTanh(Matrix &output, real p1, real p2)
virtual void cosSim(Matrix &output1, Matrix &output2, real scale)

cosine similarity, for each row i, this[i] = cos(output1[i], output2[i])

output2 can only have one row, then for each row i, this[i] = cos(output1[i], output2[0])

virtual void cosSimDerivative(Matrix &output, Matrix &prevOut1, Matrix &prevOut2, Matrix &prevGrad1, Matrix &prevGrad2, real scale)
virtual void print(std::ostream &os) const

print out the values of elements to os

virtual void print(std::ostream &os, size_t height, size_t width) const

print a part of the matrix from the (top,left) value to the (height, width) value (not included)

virtual void printOneRow(std::ostream &os, size_t idx) const

print one row to os

virtual void paramReluForward(Matrix &data, Matrix &W)
virtual void paramReluBackwardW(Matrix &oGrad, Matrix &data)
virtual void paramReluBackwardDiff(Matrix &oGrad, Matrix &data, Matrix &W)
virtual void check(std::ostream &os, Matrix &refMat, bool printDiff = true)
virtual real getMin()
virtual real getMax()
virtual void randomizeUniform()
virtual void classificationError(MatrixPtr output, IVectorPtr label)

calculate the error of classification

output[i] = 1 if row i is an error.

output[i] = 0 if row i is correct.

virtual void addByBitCode(size_t numClasses, const IVector &codes, const Matrix &vec)

For j < codeLength:
  this(i, j) += vec(index(i, j), 0)
where index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1

virtual void addByBitCodeBackward(size_t numClasses, const IVector &codes, Matrix &vec)

For j < codeLength:
  vec(index(i, j), 0) += this(i, j)
where index is same as the index for addByBitCode

virtual void mulByBitCode(size_t numClasses, const IVector &codes, const Matrix &mat, const Matrix &input)

For j < codeLength:
  this(i, j) += <mat.row(index(i, j)), input.row(i)>
where index is same as the index for addByBitCode

virtual void mulByBitCodeBackwardWeight(size_t numClasses, const IVector &codes, Matrix &mat, const Matrix &input)

For j < codeLength:
  mat.row(index(i, j)) += this(i, j) * input.row(i)
where index is same as the index for addByBitCode

virtual void mulByBitCodeBackwardError(size_t numClasses, const IVector &codes, const Matrix &mat, Matrix &input)

For j < codeLength:
  input.row(i) += this(i, j) * mat.row(index(i, j))
where index is same as the index for addByBitCode

virtual void sumByBitCode(size_t numClasses, IVector &codes, Matrix &sum, real scaleSum)

For j < codeLength
  sum(i, 0) = scaleSum * \sum_j  bit(i, j) * this(i, j)
where bit(i, j) = ((codes(i) + numClasses) & 2^j) ? 1 : 0

virtual void subByBitCode(size_t numClasses_, IVector &codes)

For j < codeLength
 this(i, j) -= bit(i, j)
where bit(i, j) is same as that for sumByBitCode

virtual void multiBinaryLabelCrossEntropy(Matrix &output, Matrix &label)

cross entropy for multi binary labels

this[i] = -sum(label[i][j]*log(output[i][j])
          + (1-label[i][j])*log(1-output[i][j]))

virtual void multiBinaryLabelCrossEntropyBp(Matrix &output, Matrix &label)

The gradient of cross entropy for multi binary labels on output.

this[i][j] = -label[i][j]/output[i][j]
             + (1-label[i][j])/(1-output[i][j])

virtual void classificationErrorMulti(Matrix &output, Matrix &label, real threshold)

Calculate the classification error for multi binary labels.

this[i] = sum((output[i][j] >= threshold && label[i][j] == 0)
           || (output[i][j] < threshold && label[i][j] == 1))
           / output->getWidth()

Public Static Functions

void mul(CpuMatrix *a, CpuMatrix *b, CpuSparseMatrix *c, real scaleAB, real scaleT)
template <typename MatBType, typename MatCType>
void mul(CpuSparseMatrix *a, MatBType *b, MatCType *c, real scaleAB, real scaleT)

c = a * b

use abstract getRow() to get row from B,C. Define B,C as template instead of virtual class for performance sake.

class SharedCpuMatrix

Inherits from paddle::CpuMatrix

Public Functions

SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false)
SharedCpuMatrix(int blockNum, real *data, size_t height, size_t width, bool trans = false)
SharedCpuMatrix(int blockNum, CpuMemHandlePtr dataHandle, size_t height, size_t width, bool trans = false)
SharedCpuMatrix(CpuMemHandlePtr dataHandle, size_t height, size_t width, bool trans = false)
~SharedCpuMatrix()
virtual void mul(CpuSparseMatrix *a, CpuMatrix *b, real scaleAB, real scaleT)
void add(Matrix &b, real p1, real p2)
void add(real p1, real p2)

Private Functions

void initShared(int blockNum)
void initBlock(int blockNum)

Private Members

int blockNum_
std::vector<std::unique_ptr<std::mutex>> blockLocks_
ThreadLocal<CpuMatrixPtr> localBuf_
ThreadLocal<std::vector<int>> localBufRows_
ThreadLocal<std::vector<int>> blockSeq_
struct sparse_non_value_t

Public Members

unsigned int col
struct sparse_float_value_t

Public Members

unsigned int col
float value
namespace paddle

Typedefs

typedef VectorT<real> Vector
typedef CpuVectorT<real> CpuVector
typedef GpuVectorT<real> GpuVector
typedef VectorT<int> IVector
typedef CpuVectorT<int> CpuIVector
typedef GpuVectorT<int> GpuIVector
typedef std::shared_ptr<Vector> VectorPtr
typedef std::shared_ptr<CpuVector> CpuVectorPtr
typedef std::shared_ptr<GpuVector> GpuVectorPtr
typedef std::shared_ptr<IVector> IVectorPtr
typedef std::shared_ptr<CpuIVector> CpuIVectorPtr
typedef std::shared_ptr<GpuIVector> GpuIVectorPtr
typedef CpuGpuVectorT<real> CpuGpuVector
typedef CpuGpuVectorT<int> ICpuGpuVector
typedef std::shared_ptr<CpuGpuVector> CpuGpuVectorPtr
typedef std::shared_ptr<ICpuGpuVector> ICpuGpuVectorPtr

Functions

template <class T>
std::ostream &operator<<(std::ostream &os, const VectorT<T> &vec)
template <class T>
class GpuVectorT

Inherits from paddle::VectorT< T >

Public Functions

GpuVectorT(size_t size)
GpuVectorT(size_t size, GpuMemHandlePtr memHandle, size_t offset)
GpuVectorT(size_t size, T *data)
virtual MemoryHandlePtr newMemory(size_t size)
virtual void zeroMem()
virtual void reset(const T &value)
virtual void fillSequence()
virtual void copyFrom(const T *src, size_t size)

copy size elements from src

If this is GpuVector, src can be cpu or gpu memory

If this is CpuVector, src is assumed to be cpu memory

virtual void copyFrom(const T *src, size_t size, hl_stream_t stream)

copy size elements from src

If this is GpuVector, src can be cpu or gpu memory

If this is CpuVector, src is assumed to be cpu memory

virtual void copyFrom(const VectorT<T> &src)

This function will crash if the size of src and dest is different.

virtual void copyFrom(const VectorT<T> &src, hl_stream_t stream)

If use_gpu, this function will push the copy task to the specified stream and return immediately.

If the GPU is not used, this function is the same as copyFrom(const VectorT<T>& src), which uses stream HPPL_STREAM_DEFAULT.

virtual T getElement(size_t i) const

Get the value for the i’th element.

virtual void setElement(size_t i, const T &value)
virtual T *getPoint(const uint64_t beginPos)

Get the buffer point with beginPos.

virtual T getAbsSum()
virtual T getSum()
virtual T getMax()
virtual T getAbsMax()
virtual T getMin()
virtual void isEqualTo(const VectorT<T> &b, const T &value)

element-wise calc: this = (b == value)

virtual void selectFrom(const VectorT<T> &src, const VectorT<int> &ids)

select elements indexed by ids from vector src

virtual void histogram(std::ostream &os, int type)

print histogram of vector values

Note
only exponent histogram supported currently

virtual void rand()

generate uniform random value for each element

virtual void rand(size_t classes)

generate uniform random value for each element, data range is from 0 to (classes - 1).

virtual void randnorm(real mean, real standardDeviation)

generate univariate Gaussian distributed random numbers with given mean and standardDeviation.

virtual void uniform(real left, real right)

generate uniform distributed random numbers with given range.

virtual T get(size_t pos)

Debug use only. Very inefficient for GPU vector. get the value at pos.

virtual void print(std::ostream &os, size_t num) const

print the first “num” elements of the Vector

virtual void printOneElement(std::ostream &os, size_t idx) const

print the “idx” element of the Vector

Protected Functions

virtual void copyTo(CpuVectorT<T> *dest) const
virtual void copyTo(GpuVectorT<T> *dest) const
template <class T>
class CpuVectorT

Inherits from paddle::VectorT< T >

Subclassed by paddle::ParallelCpuVectorT< T >

Public Functions

CpuVectorT(size_t size)
CpuVectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset)
CpuVectorT(size_t size, T *data)
CpuVectorT(const VectorT<T> &src)

If src is a CpuVector, the new CpuVector will share the data with src

If src is a GpuVector, the new CpuVector will copy data from src

virtual MemoryHandlePtr newMemory(size_t size)
virtual void zeroMem()
virtual void reset(const T &value)
virtual void fillSequence()
virtual void copyFrom(const T *src, size_t size)

copy size elements from src

If this is GpuVector, src can be cpu or gpu memory

If this is CpuVector, src is assumed to be cpu memory

virtual void copyFrom(const T *src, size_t size, hl_stream_t stream)

copy size elements from src

If this is GpuVector, src can be cpu or gpu memory

If this is CpuVector, src is assumed to be cpu memory

virtual void copyFrom(const VectorT<T> &src)

This function will crash if the size of src and dest is different.

virtual void copyFrom(const VectorT<T> &src, hl_stream_t stream)

If use_gpu, this function will push the copy task to the specified stream and return immediately.

If the GPU is not used, this function is the same as copyFrom(const VectorT<T>& src), which uses stream HPPL_STREAM_DEFAULT.

virtual void copyTo(CpuVectorT<T> *dest) const
virtual void copyTo(GpuVectorT<T> *dest) const
virtual T *getPoint(const uint64_t beginPos)

Get the buffer point with beginPos.

virtual T getElement(size_t i) const

Get the value for the i’th element.

virtual void setElement(size_t i, const T &value)
virtual T getAbsSum()
virtual T getSum()
virtual T getMax()
virtual T getAbsMax()
virtual T getMin()
virtual void isEqualTo(const VectorT<T> &b, const T &value)

element-wise calc: this = (b == value)

virtual void selectFrom(const VectorT<T> &src, const VectorT<int> &ids)

select elements indexed by ids from vector src

virtual void histogram(std::ostream &os, int type)

print histogram of vector values

Note
only exponent histogram supported currently

virtual void rand()

generate uniform random value for each element

virtual void rand(size_t classes)

generate uniform random value for each element, data range is from 0 to (classes - 1).

virtual void randnorm(real mean, real standardDeviation)

generate univariate Gaussian distributed random numbers with given mean and standardDeviation.

virtual void uniform(real left, real right)

generate uniform distributed random numbers with given range.

virtual T get(size_t pos)

Debug use only. Very inefficient for GPU vector. get the value at pos.

virtual void print(std::ostream &os, size_t num) const

print the first “num” elements of the Vector

virtual void printOneElement(std::ostream &os, size_t idx) const

print the “idx” element of the Vector

template <class T>
class BaseVector

Inherits from paddle::BaseMatrixT< T >

Subclassed by paddle::VectorT< T >

Public Functions

BaseVector(size_t size, T *data, bool useGpu)
~BaseVector()

Protected Attributes

size_t &size_
template <class T>
class VectorT
#include <Vector.h>

Copy or assignment constructor will share the data as opposed to making a copy of the original data. To make a copy of the original data, use copyFrom() instead.

Inherits from paddle::BaseVector< T >

Subclassed by paddle::CpuVectorT< T >, paddle::GpuVectorT< T >

Public Types

enum HistogramType

Values:

HISTOGRAM_EXPONENT = 0

Public Functions

virtual ~VectorT()
size_t getSize() const
const T *getData() const
T *getData()
virtual void zeroMem() = 0
virtual void reset(const T &value) = 0
virtual void fillSequence() = 0
MemoryHandlePtr getMemoryHandle() const
void resize(size_t newSize)

resizing to a big vector will not preserve old values.

virtual MemoryHandlePtr newMemory(size_t size) = 0
void subVecFrom(const VectorT<T> &src, size_t start, size_t size)

form sub vector from src, shallow copy

std::shared_ptr<VectorT<T>> subVec(size_t start, size_t size)
void subVecFrom(const T *src, size_t start, size_t size)

form sub vector from src, shallow copy

void subVecFrom(const VectorT<T> &src, std::pair<size_t, size_t> interval)

form sub vector from src, shallow copy in interval [interval.first, interval.second)

virtual void copyFrom(const VectorT<T> &src) = 0

This function will crash if the size of src and dest is different.

virtual void copyFrom(const VectorT<T> &src, hl_stream_t stream) = 0

If use_gpu, this function will push the copy task to the specified stream and return immediately.

If the GPU is not used, this function is the same as copyFrom(const VectorT<T>& src), which uses stream HPPL_STREAM_DEFAULT.

virtual void copyFrom(const T *src, size_t size) = 0

copy size elements from src

If this is GpuVector, src can be cpu or gpu memory

If this is CpuVector, src is assumed to be cpu memory

virtual void copyFrom(const T *src, size_t size, hl_stream_t stream) = 0

copy size elements from src

If this is GpuVector, src can be cpu or gpu memory

If this is CpuVector, src is assumed to be cpu memory

virtual void exec(SyncThreadPool::JobFunc func)

exec a func in single/multi thread

virtual T *getPoint(const uint64_t beginPos) = 0

Get the buffer point with beginPos.

virtual T getElement(size_t i) const = 0

Get the value for the i’th element.

virtual void setElement(size_t i, const T &value) = 0
virtual T getAbsSum() = 0
virtual T getSum() = 0
virtual T getMax() = 0
virtual T getAbsMax() = 0
virtual T getMin() = 0
virtual void isEqualTo(const VectorT<T> &b, const T &value) = 0

element-wise calc: this = (b == value)

virtual void selectFrom(const VectorT<T> &src, const VectorT<int> &ids) = 0

select elements indexed by ids from vector src

virtual void histogram(std::ostream &os, int type = HISTOGRAM_EXPONENT) = 0

print histogram of vector values

Note
only exponent histogram supported currently

virtual void rand() = 0

generate uniform random value for each element

virtual void rand(size_t classes) = 0

generate uniform random value for each element, data range is from 0 to (classes - 1).

virtual T get(size_t pos) = 0

Debug use only. Very inefficient for GPU vector. get the value at pos.

virtual void randnorm(real mean, real standardDeviation) = 0

generate univariate Gaussian distributed random numbers with given mean and standardDeviation.

virtual void uniform(real left, real right) = 0

generate uniform distributed random numbers with given range.

virtual void print(std::ostream &os, size_t num) const = 0

print the first “num” elements of the Vector

virtual void printOneElement(std::ostream &os, size_t idx) const = 0

print the “idx” element of the Vector

Public Static Functions

std::shared_ptr<VectorT<T>> create(size_t size, bool useGpu)
std::shared_ptr<VectorT<T>> create(T *data, size_t size, bool useGpu)
std::shared_ptr<VectorT<T>> create(size_t size, MemoryHandlePtr memoryHandle, size_t offset = 0)
std::shared_ptr<VectorT<T>> createParallelVector(size_t size, bool useGpu, SyncThreadPool *pool = nullptr)
static void resizeOrCreate(std::shared_ptr<VectorT<T>> &vec, size_t size, bool useGpu)

Protected Functions

VectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset, bool useGpu)
VectorT(size_t size, T *data, bool useGpu)
virtual void copyTo(CpuVectorT<T> *dest) const = 0
virtual void copyTo(GpuVectorT<T> *dest) const = 0

Protected Attributes

MemoryHandlePtr memoryHandle_

Friends

friend paddle::GpuVectorT< T >
friend paddle::CpuVectorT< T >
template <class T>
class ParallelCpuVectorT

Inherits from paddle::CpuVectorT< T >

Public Functions

ParallelCpuVectorT(size_t size, SyncThreadPool *pool)
virtual void zeroMem()
virtual void randnorm(real mean, real standardDeviation)

generate univariate Gaussian distributed random numbers with given mean and standardDeviation.

virtual void uniform(real left, real right)

generate uniform distributed random numbers with given range.

virtual void exec(SyncThreadPool::JobFunc func)

exec a func in single/multi thread

Private Types

typedef std::function<void(CpuVectorT<T> &vec)> ExecFunc

Private Functions

void parallelExec(ExecFunc func)

Private Members

SyncThreadPool *pool_
template <class T>
class CpuGpuVectorT
#include <Vector.h>

A class to do conversion between CpuVector and GpuVector automatically.

Public Types

enum SyncedFlag

An enum type, SyncedFlag, used to mark whether the data memory is in CPU or GPU.

DATA_AT_CPU: data is located in CPU.

DATA_AT_GPU: data is located in GPU.

SYNCED: data is located in CPU and GPU simultaneously.

Values:

DATA_AT_CPU = 0
DATA_AT_GPU = 1
SYNCED = 2

Public Functions

CpuGpuVectorT(size_t size, bool useGpu)

A constructor, create cpuVectorT_ or gpuVectorT_.

Parameters
  • size -

    data size.

  • useGpu -

    use gpu or not.

CpuGpuVectorT(const std::shared_ptr<VectorT<T>> &src)

A constructor, create CpuGpuVectorT by VectorT.

If src is CpuVector, cpuVectorT_ is shared data with src.

If src is GpuVector, gpuVectorT_ is shared data with src.

CpuGpuVectorT(size_t size, T *data, bool useGpu)

A constructor.

If useGpu is true, data should be located in device and create gpuVectorT_ with data.

If useGpu is false, data should be located in host and create cpuVectorT_ with data.

Note
Data is owned by the caller and should be valid during the life of this vector. The caller is responsible for releasing the memory.

CpuGpuVectorT(CpuGpuVectorT<T> &src, size_t offset, size_t size)
virtual ~CpuGpuVectorT()
void resize(size_t size, bool useGpu)

resize vector.

If useGpu is true, resize gpuVectorT_ and set syncFlag_ to DATA_AT_GPU,

otherwise resize cpuVectorT_ and set syncFlag_ to DATA_AT_CPU.

std::shared_ptr<const VectorT<T>> getVector(bool useGpu) const

return a const cpuVectorT_ or gpuVectorT_.

If useGpu is true, return gpuVectorT_.

If useGpu is false, return cpuVectorT_.

Note
The caller should not change the data. If the caller casts away the const attribute and changes the data, it should set syncFlag_ accordingly.

std::shared_ptr<VectorT<T>> &getMutableVector(bool useGpu)

return a mutable cpuVectorT_ or gpuVectorT_.

Note
This interface will change syncFlag_, so if you are not going to change the data, you should call getVector instead.

const T *getData(bool useGpu) const

return const T* data.

If useGpu is true, return device data.

If useGpu is false, return host data.

T *getMutableData(bool useGpu)
void zeroMem(bool useGpu)

If useGpu is true, gpuVectorT_->Op().

If useGpu is false, cpuVectorT_->Op().

Op is zeroMem, fillSequence, ...

void fillSequence(bool useGpu)
void setElement(size_t i, const T &value, bool useGpu)
T getElement(size_t i) const

return i-th element.

size_t getSize() const

return vector size.

void copyToCpu(const T *data, size_t size)

copy data to cpuVectorT_.

void copyToCpu(const T *data, size_t size, hl_stream_t stream)

copy data to cpuVectorT_ using the specified stream.

void copyToGpu(const T *data, size_t size)

copy data to gpuVectorT_.

void copyToGpu(const T *data, size_t size, hl_stream_t stream)

copy data to gpuVectorT_ using the specified stream.

void copyFrom(const VectorT<T> &src, hl_stream_t stream)

copy from src using the specified stream.

If src is CpuVectorT, copy to cpuVectorT_.

If src is GpuVectorT, copy to gpuVectorT_.

void copyFrom(const T *data, size_t size, bool useGpu)

copy data.

If useGpu is false, copy host data to cpuVectorT_.

If useGpu is true, copy device data to gpuVectorT_.

Note
The data address should be consistent with useGpu.

void copyFrom(const T *data, size_t size, hl_stream_t stream, bool useGpu)
void copyFrom(CpuGpuVectorT<T> &src, size_t offset, size_t size, bool useGpu, hl_stream_t stream)

copy from (src + offset) using the specified stream.

void copyFrom(CpuGpuVectorT<T> &src, hl_stream_t stream)

copy from src using the specified stream.

SyncedFlag *getSync() const

return sync_.

void setSync(SyncedFlag *sync)

set sync_.

void setSync(SyncedFlag syncFlag)
void setSync(bool useGpu)

Public Static Functions

std::shared_ptr<CpuGpuVectorT<T>> create(size_t size, bool useGpu)
void resizeOrCreate(std::shared_ptr<CpuGpuVectorT<T>> &vec, size_t size, bool useGpu)

resize or create CpuGpuVectorT.

Protected Functions

void resizeOrCreate(size_t size, bool useGpu)
void copyToCpu()

copy between cpuVectorT_ and gpuVectorT_.

If syncFlag_ is DATA_AT_CPU or SYNCED, do nothing.

If syncFlag_ is DATA_AT_GPU, copy gpuVectorT_ to cpuVectorT_ and set syncFlag_ to SYNCED.

void copyToGpu()

copy between cpuVectorT_ and gpuVectorT_.

If syncFlag_ is DATA_AT_GPU or SYNCED, do nothing.

If syncFlag_ is DATA_AT_CPU, copy cpuVectorT_ to gpuVectorT_ and set syncFlag_ to SYNCED.

Protected Attributes

std::shared_ptr<VectorT<T>> cpuVectorT_

host pointer.

std::shared_ptr<VectorT<T>> gpuVectorT_

device pointer.

SyncedFlag syncFlag_

specify current data address.

SyncedFlag *sync_
namespace paddle

Typedefs

typedef std::shared_ptr<_hl_sparse_matrix_s> hl_sparse_matrix_s_ptr
class GpuSparseMatrix

Inherits from paddle::Matrix

Public Functions

GpuSparseMatrix(size_t height, size_t width, size_t nnz, SparseValueType valueType = FLOAT_VALUE, SparseFormat format_ = SPARSE_CSR, bool trans = false)
GpuSparseMatrix(GpuMemHandlePtr dataHandle, hl_sparse_matrix_s_ptr sMatrix, size_t height, size_t width, size_t nnz, SparseValueType valueType = FLOAT_VALUE, SparseFormat format_ = SPARSE_CSR, bool trans = false, MemoryHandlePtr sMemoryHandle = NULL)
GpuSparseMatrix(real *value, int *rows, int *cols, size_t height, size_t width, size_t nnz, SparseValueType valueType, SparseFormat format, bool trans)
GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix, size_t height, size_t width, size_t nnz, SparseValueType valueType, SparseFormat format, bool trans, MemoryHandlePtr sMemoryHandle)
~GpuSparseMatrix()
virtual void resize(size_t newHeight, size_t newWidth, size_t newNnz, SparseValueType valueType, SparseFormat format)

Note
This should only be used for sparse matrix.

virtual void resize(size_t newHeight, size_t newWidth)

Note
Original data may not be preserved after resize().

void sparseResizeCSR()
void sparseResizeCSC()
void resizeCSR(size_t newHeight, size_t newWidth, size_t newNnz, SparseValueType valueType)
void resizeCSC(size_t newHeight, size_t newWidth, size_t newNnz, SparseValueType valueType)
void mul(const GpuMatrixPtr a, const GpuMatrixPtr b, real scaleAB, real scaleT)
virtual MatrixPtr getTranspose()

B = A, B.trans = !A.trans.

virtual void transpose(MatrixPtr matTrans, bool memAlloc)

B = A’.

virtual void copyFrom(const Matrix &src)
virtual void copyFrom(const Matrix &src, hl_stream_t stream)
void copyFromCSR(CpuSparseMatrix &src, hl_stream_t stream)
void copyFromCSC(CpuSparseMatrix &src, hl_stream_t stream)
virtual void copyFrom(const IVector &src)

convert an int vector to a real matrix.

(1) source and dest are both in CPU.

(2) sizes match exactly.

void copyFrom(const IVector &src, hl_stream_t stream)
template <class T>
void copyFrom(int64_t *ids, int64_t *indices, T *data, hl_stream_t stream)
virtual void setRow(size_t row, size_t colNum, const unsigned int *cols, const real *values)

This should only be used for sparse matrix.

Currently must be called for each row in order. The matrix is not valid until setRow is called for the last row.

virtual SparseValueType getValueType() const
virtual SparseFormat getFormat() const
const int *getRowCols(size_t x) const
const real *getRowValues(size_t x) const
size_t getColNum(size_t x) const
virtual void print(std::ostream &os) const

print out the values of elements to os

virtual void zeroMem()

only set value_ of FLOAT_VALUE sparse matrix to zero

void add3(GpuMatrix *b)

sparseMatrix += denseMatrix

Named add3 just because add/add2 have already been used in BaseMatrix.cu and they are not virtual functions.

Only add the values of the dense matrix at the same (row, col) indices as the sparse matrix; other values are not used.

Parameters
  • b -

    dense matrix

virtual void add3(MatrixPtr b)

matrix element-wise add

Named add3 just because add/add2 have already been used in BaseMatrix.cu and they are not virtual functions.

virtual void addBias(Matrix &b, real scale)

sparseMatrix[i,j] += bias[j], (j is the col index of sparse matrix)

Parameters
  • b -

    bias, dense matrix and height = 1

  • scale -

    scale of b

virtual int *getRows() const

return rows, which is gpu address

virtual int *getCols() const

return cols, which is gpu address

real *getValue() const

return value, which is gpu address

virtual real *getData()

return value_ of sparse matrix

Sometimes a CpuSparseMatrix may be held as a Matrix; calling getValue then requires a dynamic_cast to CpuSparseMatrix, so getData is a convenient way to get the value.

virtual const real *getData() const
virtual void rowMax(IVector &maxIds, Matrix &maxVal)

Get top k value of each row in sparse matrix.

Store the values in maxVal and their indices in maxIds. k = maxVal.width

Parameters
  • maxIds -

    index of top k

  • maxVal -

    value of top k

virtual void mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, real scaleT)

this = scaleAB*(a*b) + scaleT*this

void copyFrom(CpuSparseMatrix &src, hl_stream_t stream)
void copyFrom(GpuSparseMatrix &src, hl_stream_t stream)
virtual void trimFrom(const CpuSparseMatrix &src)
void trimFromCSR(const CpuSparseMatrix &src)
void trimFromCSC(const CpuSparseMatrix &src)
virtual bool isSparse() const

Public Members

MemoryHandlePtr sMemoryHandle_
int *rows_
int *cols_
real *value_
const char *end_
hl_sparse_matrix_s_ptr sMatrix_
SparseValueType valueType_
SparseFormat format_

Protected Functions

void sparseResize()
void copyRow(int offsets, size_t colNum, const sparse_non_value_t *row)
void copyRow(int offsets, size_t colNum, const sparse_float_value_t *row)
struct Element

Public Functions

Element(int rowIn, int colIn, real valIn)

Public Members

int row
int col
real val

Functions

P_DECLARE_bool(allow_inefficient_sparse_update)
namespace paddle
class SparseRowCpuMatrix
#include <SparseRowMatrix.h>

Sparse Row

Inherits from paddle::CpuMatrix

Subclassed by paddle::SparseAutoGrowRowCpuMatrix, paddle::SparsePrefetchRowCpuMatrix

Public Types

typedef std::shared_ptr<IndexDict> IndexDictPtr

Public Functions

SparseRowCpuMatrix(CpuMemHandlePtr dataHandle, size_t height, size_t width, IndexDictPtr indexDictHandle = nullptr, bool trans = false)

heightStore is max number of rows of the sparse matrix.

virtual ~SparseRowCpuMatrix()
real *getRow(size_t row)

Get the row buf

Parameters
  • row -

    row id in the original matrix

real *getLocalRow(size_t row)

Get the row buf

Parameters
  • row -

    row id in local storage

void reserveStore()

reserve the storage for rows according to current size of indexDictHandle.

This is only used when SparseRowCpuMatrix is constructed with indexDictHandle.

virtual real *getRowBuf(size_t row)
virtual void mul(CpuSparseMatrix *a, CpuMatrix *b, real scaleAB, real scaleT)
virtual void copyFrom(const real *src, size_t size)

Fill data according to the row indexes added, and set up the indices inside.

src and size are data and size of normal dense CpuMatrix.

virtual void zeroMem()
void applyL1Decay(real learningRate, real decayRate)

apply L1 to all sparse rows; should be applied after the indices are ready.

void clearIndices()
void zeroMemThread(size_t tid, size_t numThreads)
void sgdUpdate(BaseMatrix &value, IVector &t0, real learningRate, int currentTime, real decayRate, bool useL1, bool fini = false)

value -= grad * learningRate, where this matrix is the gradient.

If L1 decay is set, use L1; else if L2 is set, use L2; otherwise no decay at all.

t0 is an int vector used by L1/L2 decay, size = height of parameter matrix; it stores the time at which each weight row was last updated.

Time is batchId, currentTime is current batchId.

When a pass is finished, the caller should call this func one more time with (fini=true) to let weight decay catch up to the current time.

void addTo(BaseMatrix &dest, std::vector<uint32_t> &ids, size_t tid, size_t numThreads)

merge rows in this to dest for designated thread

values add to dest matrix

ids that occurred in this matrix are appended to ids, filtered by (id % numThreads == tid)

void addTo(SparseRowCpuMatrix &dest, size_t tid, size_t numThreads)

the second version addTo(), dest is a SparseRowCpuMatrix.

The dest’s indices should be set up already; addTo() will check that the src ids exist in dest’s indices.

const IndexDictPtr &getIndexDictHandle() const
void checkIndices()

check all local and global indices consistency

void checkIndex(size_t i)

check whether row i exists in indices

std::vector<unsigned int> &getLocalIndices() const

Protected Functions

template <typename Func>
void apply(Func f)
void init(size_t height, size_t width)
void clearRows()

clear row indices.

void checkStoreSize()

Protected Attributes

CpuMatrix storeMat_
std::vector<real, AlignedAllocator<real, 32>> rowStore_
IndexDictPtr indexDictHandle_
std::vector<unsigned int> *localIndices_
unsigned int *globalIndices_

Protected Static Attributes

const unsigned int kUnusedId_
struct IndexDict

Public Members

std::vector<unsigned int> localIndices
std::vector<unsigned int> globalIndices
class SparsePrefetchRowCpuMatrix
#include <SparseRowMatrix.h>

For prefetching parameters from remote Parameter server.

Inherits from paddle::SparseRowCpuMatrix

Public Functions

SparsePrefetchRowCpuMatrix(CpuMemHandlePtr dataHandle, size_t height, size_t width, IndexDictPtr indexDictHandle = nullptr, SyncThreadPool *pool = nullptr, bool trans = false)
void addRows(MatrixPtr input)

Extract feature ids from input to fill the row indexes.

input must be sparse matrix.

Can call many times before setup.

void addRows(IVectorPtr ids)
void setupIndices()

setup global indices of SparseRowMatrix after finish add rows.

Protected Functions

void addRows(const unsigned int *ids, size_t len)

Protected Attributes

SyncThreadPool *pool_
class SparseAutoGrowRowCpuMatrix

Inherits from paddle::SparseRowCpuMatrix

Subclassed by paddle::CacheRowCpuMatrix

Public Functions

SparseAutoGrowRowCpuMatrix(size_t height, size_t width, IndexDictPtr indexDictHandle = nullptr, bool trans = false)
real *getRow(size_t row)
virtual real *getRowBuf(size_t row)
virtual void mul(CpuSparseMatrix *a, CpuMatrix *b, real scaleAB, real scaleT)
class CacheRowCpuMatrix

Inherits from paddle::SparseAutoGrowRowCpuMatrix

Public Functions

CacheRowCpuMatrix(size_t height, size_t width, IndexDictPtr indexDictHandle = nullptr, bool trans = false)
void setSourceData(CpuVectorPtr sourceVec)
real *getRow(size_t row)
virtual real *getRowBuf(size_t row)
virtual void mul(CpuSparseMatrix *a, CpuMatrix *b, real scaleAB, real scaleT)

Public Members

CpuVectorPtr sourceDataVec_
real *sourceData_
class SparseRowIdsCpuMatrix
#include <SparseRowMatrix.h>

Sparse Row Ids Matrix.

Mostly the same as CpuMatrix, but maintains the sparse row ids that occurred; ids are hashed by worker thread id.

Inherits from paddle::CpuMatrix

Public Functions

SparseRowIdsCpuMatrix(CpuMemHandlePtr dataHandle, size_t height, size_t width, bool trans = false)
void setNumOfThreads(size_t numOfThreads)
std::vector<uint32_t> &getIds(size_t threadId)

Private Members

std::vector<std::vector<uint32_t>> idsArray_
namespace paddle
class CpuSparseMatrix

Inherits from paddle::Matrix

Public Functions

CpuSparseMatrix(size_t height, size_t width, size_t nnz, SparseValueType valueType = FLOAT_VALUE, SparseFormat format = SPARSE_CSR, bool trans = false)
CpuSparseMatrix(CpuMemHandlePtr memHandle, size_t height, size_t width, size_t nnz, SparseValueType valueType, SparseFormat format, bool trans)
CpuSparseMatrix(real *data, int *rows, int *cols, size_t height, size_t width, size_t nnz, SparseValueType valueType, SparseFormat format, bool trans)
~CpuSparseMatrix()
virtual void resize(size_t newHeight, size_t newWidth, size_t newNnz, SparseValueType valueType, SparseFormat format)

Note
This should only be used for sparse matrix.

virtual void resize(size_t newHeight, size_t newWidth)

Note
Original data may not be preserved after resize().

virtual MatrixPtr getTranspose()
SparseValueType getValueType()
real *getRowValues(size_t i) const
int *getRowCols(size_t i) const
void fillRowIndices(IVectorPtr &outVec) const

fill row indices of each value in CSR matrix

size_t getColNum(size_t i) const
real *getColumn(size_t i) const
size_t getColStartIdx(size_t i) const
size_t getRowStartIdx(size_t i) const
size_t getRowNum(size_t i) const
virtual real getSum()
virtual void square()
virtual real getMin()

only consider nonzero values. the actual min value should compare with 0.0.

virtual real getMax()

only consider nonzero values. the actual max value should compare with 0.0.

virtual void rowMax(IVector &maxIds, Matrix &max)

Get the top k elements of each row of this matrix.

The column ids and values of these elements are stored in maxIds and max respectively. Note that the top k elements are not sorted.

virtual int *getRows() const
virtual int *getCols() const
real *getValue() const
virtual SparseFormat getFormat() const
virtual SparseValueType getValueType() const
virtual real *getData()

return value_ of sparse matrix

Sometimes a CpuSparseMatrix may be held as a Matrix; calling getValue then requires a dynamic_cast to CpuSparseMatrix, so getData is a convenient way to get the value.

virtual const real *getData() const
virtual void zeroMem()

only set value_ of FLOAT_VALUE sparse matrix to zero

virtual void transpose(MatrixPtr matTrans, bool memAlloc)

Memory MUST be allocated outside (memAlloc=false)

virtual void mul(MatrixPtr a, MatrixPtr b, real scaleAB, real scaleT)

this = scaleAB*(a*b) + scaleT*this

void add3(CpuMatrix *b)

sparseMatrix += denseMatrix

Named add3 just because add/add2 have already been used in BaseMatrix.cu and they are not virtual functions.

Only add the values of the dense matrix at the same (row, col) indices as the sparse matrix; other values, whose positions are not in the sparse matrix, are not used.

Parameters
  • b -

    dense matrix

virtual void add3(MatrixPtr b)

matrix element-wise add

Named add3 just because add/add2 have already been used in BaseMatrix.cu and they are not virtual functions.

virtual void addBias(Matrix &b, real scale)

sparseMatrix[i,j] += bias[j], (j is the col index of sparse matrix)

Parameters
  • b -

    bias, dense matrix and height = 1

  • scale -

    scale of b

virtual void print(std::ostream &os) const

print out the values of elements to os

virtual void printOneRow(std::ostream &os, size_t idx) const

print one row to os

virtual void setRow(size_t row, size_t colNum, const unsigned int *cols, const real *values)

This should only be used for sparse matrix.

Currently must be called for each row in order. The matrix is not valid until setRow is called for the last row.

virtual void randomizeUniform()
void copyFrom(const GpuSparseMatrix &src, hl_stream_t stream)
virtual void copyFrom(const Matrix &src, hl_stream_t stream = HPPL_STREAM_DEFAULT)
virtual void copyFrom(const Matrix &src)
CpuSparseMatrixPtr getTmpSparseMatrix(size_t height, size_t width)

Get a temporary matrix. This is threadsafe. It should be only used temporarily, i.e. do not store it or use it as return value.

Note
Do NOT use a large number of tmp matrices.

virtual MatrixPtr subMatrix(size_t startRow, size_t numRows)
void copyFrom(std::vector<int> &rows, std::vector<int> &cols, std::vector<real> &values)
void copyFrom(const CpuMatrix &src)
void copyFrom(const CpuSparseMatrix &src)
virtual void trimFrom(const CpuSparseMatrix &src)
void copyRow(int offsets, size_t colNum, const sparse_non_value_t *row)
void copyRow(int offsets, size_t colNum, const sparse_float_value_t *row)
template <class T>
void copyFrom(int64_t *ids, int64_t *indices, T *data)
template <class T>
void copyFrom(int64_t *indices, T *data)
virtual void copyFrom(const real *src, size_t size)

If this is GpuMatrix, src is assumed to be CPU memory

If this is CpuMatrix, src is assumed to be CPU memory

virtual bool isSparse() const

Protected Functions

void sparseResize()

Protected Attributes

int *rows_
int *cols_
real *value_
SparseFormat format_
SparseValueType valueType_

Protected Static Attributes

const size_t DEFAULT_AVG_WIDTH
ThreadLocal<std::vector<CpuSparseMatrixPtr>> cpuLocalMats_

Private Functions

virtual MatrixPtr clone(size_t height = 0, size_t width = 0, bool useGpu = false)

Create a matrix with the same type (GpuMatrix, CpuMatrix, NonValueSparseMatrix, etc.) as this.

If height and width are zero, the new matrix will have the same size as this, otherwise the new matrix will have the specified size.

Others

namespace paddle

Functions

template <class T>
void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const T alpha, const T *A, const int lda, const T *B, const int ldb, const T beta, T *C, const int ldc)
template <class T>
void axpy(const int n, const T alpha, const T *x, T *y)
template <class T>
T dotProduct(const int n, const T *x, const T *y)
namespace paddle
namespace simd

Functions

template <typename Type>
void addTo(Type *a, const Type *b, size_t len)
template <typename Type>
void batchAddTo(Type *a, const Type *b[], int batch, size_t len)
template <typename Type>
void colMax(Type *result, const Type *data, int dim, int numSamples)
template <typename Type>
void decayL1(Type *dst, Type *src, Type *lr, Type lambda, size_t len)
template <typename Type>
void decayL1(Type *dst, Type *src, Type lambda, size_t len)
template <size_t AlignSize>
bool isPointerAlign(void *ptr)
bool vec_check(size_t len)
template <>
void addTo(float *a, const float *b, size_t len)
template <>
void batchAddTo(float *a, const float *b[], int batch, size_t len)
template <>
void colMax(float *result, const float *data, int dim, int numSamples)
template <>
void decayL1(float *dst, float *src, float lambda, size_t len)
template <>
void decayL1(float *dst, float *src, float *lr, float lambda, size_t len)
namespace naive

Functions

template <typename Type>
void addTo(Type *a, const Type *b, size_t len)
template <typename Type>
void batchAddTo(Type *a, const Type *b[], int batch, size_t len)
template <typename Type>
void colMax(Type *result, const Type *data, int dim, int numSamples)

Note
this method is unused in paddle.

template <typename Type>
void decayL1(Type *dst, Type *src, Type *lr, Type lambda, size_t len)
template <class Type>
void decayL1(Type *dst, Type *src, Type lambda, size_t len)