提交 b52039bd 编写于 作者: H hedaoyuan 提交者: qingqing01

some bug fix for sparse matrix (#133)

* some bug fix for sparse matrix

* a minor bug fix
上级 0276f15a
......@@ -223,6 +223,7 @@ extern void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
* @param[in] dimK width of op(A) & height of op(B)
* @param[in] alpha scalar used for multiplication.
* @param[in] beta scalar used for multiplication.
* If beta is zero, C does not have to be a valid input.
*
* @note transb is not support HPPL_OP_T.
*
......@@ -251,6 +252,7 @@ extern void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
* @param[in] dimK width of op(A) & height of op(B)
* @param[in] alpha scalar used for multiplication.
* @param[in] beta scalar used for multiplication.
* If beta is zero, C does not have to be a valid input.
*
* @note transb is not support HPPL_OP_T.
*
......@@ -275,6 +277,7 @@ extern void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
* @param[in] dimK width of op(A) & height of op(B)
* @param[in] alpha scalar used for multiplication.
* @param[in] beta scalar used for multiplication.
* If beta is zero, C does not have to be a valid input.
*
* @note transa is not support HPPL_OP_T.
*
......@@ -327,6 +330,7 @@ extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
* @param[in] dimK width of op(A) & height of op(B)
* @param[in] alpha scalar used for multiplication.
* @param[in] beta scalar used for multiplication.
* If beta is zero, C does not have to be a valid input.
*
*
* @note transa is not support HPPL_OP_T.
......
......@@ -562,6 +562,22 @@ void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
}
}
/**
* Calculate beta * C, if beta is zero, C does not have to be a valid input.
*/
static void _beta_mul_c(real *c, int dimM, int dimN, real beta) {
if (beta == 0.0) {
hl_gpu_apply_unary_op(unary::Zero<real>(), c, dimM, dimN, dimN);
} else {
if (beta != 1.0){
hl_gpu_apply_unary_op(
unary::mul_scalar<real>(beta), c, dimM, dimN, dimN);
}
}
return;
}
void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
real *B_d, hl_trans_op_t transb,
real *C_d,
......@@ -580,15 +596,8 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
}
if (A_d->nnz == 0) {
if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
C_d,
dimM,
dimN,
dimN);
} else {
return;
}
_beta_mul_c(C_d, dimM, dimN, beta);
return;
}
/* nnz != 0 */
......@@ -633,13 +642,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
beta);
}
} else if (HPPL_OP_T == transa) {
if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
C_d,
dimM,
dimN,
dimN);
}
_beta_mul_c(C_d, dimM, dimN, beta);
int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) /
CU_CSC_MUL_DENSE_BLOCK_N;
......@@ -699,15 +702,8 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
<< "matrix format error!";
if (B_d->nnz == 0) {
if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
C_d,
dimM,
dimN,
dimN);
} else {
return;
}
_beta_mul_c(C_d, dimM, dimN, beta);
return;
}
/* nnz != 0 */
......@@ -750,13 +746,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
beta);
}
} else if (transb == HPPL_OP_T) {
if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
C_d,
dimM,
dimN,
dimN);
}
_beta_mul_c(C_d, dimM, dimN, beta);
int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X;
int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M;
dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
......@@ -813,15 +803,8 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
<< "matrix format error!";
if (B_d->nnz == 0) {
if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
C_d,
dimM,
dimN,
dimN);
} else {
return;
}
_beta_mul_c(C_d, dimM, dimN, beta);
return;
}
/* nnz != 0 */
......@@ -833,14 +816,7 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
}
if (transb == HPPL_OP_N) {
if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
C_d,
dimM,
dimN,
dimN);
}
_beta_mul_c(C_d, dimM, dimN, beta);
int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X;
int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M;
dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
......@@ -925,15 +901,8 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
}
if (A_d->nnz == 0) {
if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
C_d,
dimM,
dimN,
dimN);
} else {
return;
}
_beta_mul_c(C_d, dimM, dimN, beta);
return;
}
/* nnz != 0 */
......@@ -945,13 +914,7 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
}
if (HPPL_OP_N == transa) {
if (beta != 1.0) {
hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
C_d,
dimM,
dimN,
dimN);
}
_beta_mul_c(C_d, dimM, dimN, beta);
int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N -1)/CU_CSC_MUL_DENSE_BLOCK_N;
int blocksY = (dimK + CU_CSC_MUL_DENSE_BLOCK_K -1)/CU_CSC_MUL_DENSE_BLOCK_K;
......@@ -1113,7 +1076,7 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
CHECK(!transA) << "Not supported A is trans and B is not trans!";
dim3 block(CU_BLOCK_SIZE, 1);
int avgNnzPerRow = C_d2->nnz_s / dimM;
int avgNnzPerRow = C_d->nnz / dimM;
avgNnzPerRow = avgNnzPerRow > 0 ? avgNnzPerRow : 1;
int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE);
dim3 grid(gridx, dimM);
......@@ -1242,9 +1205,9 @@ void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d,
LOG(FATAL) << "parameter B is null!";
}
if (B_d2->nnz_s == 0) return;
if (B_d->nnz == 0) return;
int nnz = B_d2->nnz_s;
int nnz = B_d->nnz;
int block = 512;
int grid = DIVUP(nnz, 512);
KeSMatrixCsrColumnSum<<<grid, block, 0, STREAM_DEFAULT>>>(
......@@ -1273,9 +1236,9 @@ void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d,
LOG(FATAL) << "parameter A_d is null!";
}
if (A_d2->nnz_s == 0) return;
if (A_d->nnz == 0) return;
int nnz = A_d2->nnz_s;
int nnz = A_d->nnz;
int block = 512;
int grid = DIVUP(nnz, 512);
KeSMatrixCsrAddBias<<<grid, block, 0, STREAM_DEFAULT>>>(
......@@ -1308,9 +1271,9 @@ void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM,
LOG(FATAL) << "parameter A_d is null!";
}
if (A_d2->nnz_s == 0) return;
if (A_d->nnz == 0) return;
int gridX = DIVUP((A_d2->nnz_s / dimM), 512);
int gridX = DIVUP((A_d->nnz / dimM), 512);
gridX = gridX > 0 ? gridX : 1;
dim3 block(512, 1);
dim3 grid(gridX, dimM);
......
......@@ -85,6 +85,15 @@ __global__ void KeSMatrixCsc2Dense(real * csc_val,
C_d[row*dimN + col] = sum;
}
__device__ __forceinline__
void _calculate_c(real &c, real sum) {
c = sum;
}
__device__ __forceinline__
void _calculate_c(real &c, real sum, real beta) {
c = sum + beta * c;
}
#define CU_CSRMM_N 4
#define CU_CSRMM_THREAD_X 32
#define CU_CSRMM_THREAD_Y 32
......@@ -191,11 +200,19 @@ __global__ void KeSMatrixCsrMulDense(real *C_d,
}
C_d += __mul24(index_m, dimN);
#pragma unroll
for (int n = 0; n < CU_CSRMM_N; n++) {
if (index_n < dimN) {
C_d[index_n] = alpha*sum[n] + beta*C_d[index_n];
index_n += CU_CSRMM_THREAD_X;
if (beta == 0.0) {
for (int n = 0; n < CU_CSRMM_N; n++) {
if (index_n < dimN) {
_calculate_c(C_d[index_n], alpha * sum[n]);
index_n += CU_CSRMM_THREAD_X;
}
}
} else {
for (int n = 0; n < CU_CSRMM_N; n++) {
if (index_n < dimN) {
_calculate_c(C_d[index_n], alpha * sum[n], beta);
index_n += CU_CSRMM_THREAD_X;
}
}
}
}
......@@ -544,13 +561,22 @@ TEMP_TEST:
int index_m_c = ibx + idy;
int index_n_c = blockIdx.y*CU_CSCMM_BLOCK_N_BEST + idx;
C_d += index_n_c + __mul24(index_m_c, dimN);
#pragma unroll
for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
if (index_m_c < dimM && index_n_c < dimN) {
C_d[0] = A_s[idy+m*32][idx] + beta*C_d[0];
if (beta == 0.0) {
for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
if (index_m_c < dimM && index_n_c < dimN) {
_calculate_c(C_d[0], A_s[idy + m * 32][idx]);
}
index_m_c += 32;
C_d += dimN*32;
}
} else {
for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
if (index_m_c < dimM && index_n_c < dimN) {
_calculate_c(C_d[0], A_s[idy + m * 32][idx], beta);
}
index_m_c += 32;
C_d += dimN*32;
}
index_m_c += 32;
C_d += dimN*32;
}
}
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/utils/Util.h"
#include "paddle/utils/Logging.h"
#include "paddle/math/SparseMatrix.h"
#include "AddtoLayer.h"
#include "CosSimLayer.h"
......@@ -290,14 +291,30 @@ void Layer::showOutputStats() {
<< " is 0, skip to show the statistics";
return;
}
real mean = out->getSum() / out->getElementCnt();
MatrixPtr outSquare = out->clone();
outSquare->copyFrom(*out);
MatrixPtr outSquare;
if (dynamic_cast<GpuSparseMatrix*>(out.get())) {
GpuSparseMatrix *tmp = dynamic_cast<GpuSparseMatrix*>(out.get());
outSquare = std::make_shared<CpuSparseMatrix>(
tmp->getHeight(), tmp->getWidth(), tmp->getElementCnt(),
tmp->getValueType(), tmp->getFormat());
} else {
outSquare = out->clone();
}
outSquare->copyFrom(*out, HPPL_STREAM_DEFAULT);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
real mean = outSquare->getSum() / out->getElementCnt();
real min;
real max;
if (dynamic_cast<CpuSparseMatrix*>(outSquare.get())) {
auto tmpMat = dynamic_cast<CpuSparseMatrix*>(outSquare.get());
min = tmpMat->getMin();
max = tmpMat->getMax();
tmpMat->square();
LOG(INFO) << "show statistics of [none zero values] in sparse matrix";
} else {
min = outSquare->getMin();
max = outSquare->getMax();
outSquare->square();
}
real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean;
......@@ -306,8 +323,8 @@ void Layer::showOutputStats() {
<< ", "
<< "std=" << std
<< ", "
<< "min=" << out->getMin() << ", "
<< "max=" << out->getMax();
<< "min=" << min << ", "
<< "max=" << max;
}
void Layer::forwardActivation() {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册