diff --git a/paddle/cuda/include/hl_sparse.h b/paddle/cuda/include/hl_sparse.h
index 22f7a228e0ad6bce75957fcae695b71324a5c325..9acdebdebf37761e1485e3441963586ead9f3c85 100644
--- a/paddle/cuda/include/hl_sparse.h
+++ b/paddle/cuda/include/hl_sparse.h
@@ -223,6 +223,7 @@ extern void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
 * @param[in]  dimK    width of op(A) & height of op(B)
 * @param[in]  alpha   scalar used for multiplication.
 * @param[in]  beta    scalar used for multiplication.
+ *                    If beta is zero, C does not have to be a valid input.
 *
 * @note transb is not support HPPL_OP_T.
 *
@@ -251,6 +252,7 @@ extern void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
 * @param[in]  dimK    width of op(A) & height of op(B)
 * @param[in]  alpha   scalar used for multiplication.
 * @param[in]  beta    scalar used for multiplication.
+ *                    If beta is zero, C does not have to be a valid input.
 *
 * @note transb is not support HPPL_OP_T.
 *
@@ -275,6 +277,7 @@ extern void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
 * @param[in]  dimK    width of op(A) & height of op(B)
 * @param[in]  alpha   scalar used for multiplication.
 * @param[in]  beta    scalar used for multiplication.
+ *                    If beta is zero, C does not have to be a valid input.
 *
 * @note transa is not support HPPL_OP_T.
 *
@@ -327,6 +330,7 @@ extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
 * @param[in]  dimK    width of op(A) & height of op(B)
 * @param[in]  alpha   scalar used for multiplication.
 * @param[in]  beta    scalar used for multiplication.
+ *                    If beta is zero, C does not have to be a valid input.
 *
 *
 * @note transa is not support HPPL_OP_T.
diff --git a/paddle/cuda/src/hl_cuda_sparse.cu b/paddle/cuda/src/hl_cuda_sparse.cu
index b42568afdaaf59fda7ebcc93b2500c93e2256608..1687fcc221ab85aff943ccf8c5be7c1ed918f853 100644
--- a/paddle/cuda/src/hl_cuda_sparse.cu
+++ b/paddle/cuda/src/hl_cuda_sparse.cu
@@ -562,6 +562,22 @@ void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
   }
 }
 
+/**
+ * Calculate beta * C. If beta is zero, C does not have to be a valid input.
+ */
+static void _beta_mul_c(real *c, int dimM, int dimN, real beta) {
+  if (beta == 0.0) {
+    hl_gpu_apply_unary_op(unary::Zero<real>(), c, dimM, dimN, dimN);
+  } else {
+    if (beta != 1.0) {
+      hl_gpu_apply_unary_op(
+          unary::mul_scalar<real>(beta), c, dimM, dimN, dimN);
+    }
+  }
+
+  return;
+}
+
 void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
                              real *B_d, hl_trans_op_t transb,
                              real *C_d,
@@ -580,15 +596,8 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
   }
 
   if (A_d->nnz == 0) {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
-                            C_d,
-                            dimM,
-                            dimN,
-                            dimN);
-    } else {
-      return;
-    }
+    _beta_mul_c(C_d, dimM, dimN, beta);
+    return;
   }
 
   /* nnz != 0 */
@@ -633,13 +642,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
                             beta);
     }
   } else if (HPPL_OP_T == transa) {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
-                            C_d,
-                            dimM,
-                            dimN,
-                            dimN);
-    }
+    _beta_mul_c(C_d, dimM, dimN, beta);
 
     int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) /
                   CU_CSC_MUL_DENSE_BLOCK_N;
@@ -699,15 +702,8 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
        << "matrix format error!";
 
   if (B_d->nnz == 0) {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
-                            C_d,
-                            dimM,
-                            dimN,
-                            dimN);
-    } else {
-      return;
-    }
+    _beta_mul_c(C_d, dimM, dimN, beta);
+    return;
   }
 
   /* nnz != 0 */
@@ -750,13 +746,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
                             beta);
     }
   } else if (transb == HPPL_OP_T) {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
-                            C_d,
-                            dimM,
-                            dimN,
-                            dimN);
-    }
+    _beta_mul_c(C_d, dimM, dimN, beta);
     int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X;
     int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M;
     dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
@@ -813,15 +803,8 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
        << "matrix format error!";
 
   if (B_d->nnz == 0) {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
-                            C_d,
-                            dimM,
-                            dimN,
-                            dimN);
-    } else {
-      return;
-    }
+    _beta_mul_c(C_d, dimM, dimN, beta);
+    return;
   }
 
   /* nnz != 0 */
@@ -833,14 +816,7 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
   }
 
   if (transb == HPPL_OP_N) {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
-                            C_d,
-                            dimM,
-                            dimN,
-                            dimN);
-    }
-
+    _beta_mul_c(C_d, dimM, dimN, beta);
     int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X;
     int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M;
     dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
@@ -925,15 +901,8 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
   }
 
   if (A_d->nnz == 0) {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
-                            C_d,
-                            dimM,
-                            dimN,
-                            dimN);
-    } else {
-      return;
-    }
+    _beta_mul_c(C_d, dimM, dimN, beta);
+    return;
   }
 
   /* nnz != 0 */
@@ -945,13 +914,7 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
   }
 
   if (HPPL_OP_N == transa) {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
-                            C_d,
-                            dimM,
-                            dimN,
-                            dimN);
-    }
+    _beta_mul_c(C_d, dimM, dimN, beta);
 
     int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N -1)/CU_CSC_MUL_DENSE_BLOCK_N;
     int blocksY = (dimK + CU_CSC_MUL_DENSE_BLOCK_K -1)/CU_CSC_MUL_DENSE_BLOCK_K;
@@ -1113,7 +1076,7 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
     CHECK(!transA) << "Not supported A is trans and B is not trans!";
 
     dim3 block(CU_BLOCK_SIZE, 1);
-    int avgNnzPerRow = C_d2->nnz_s / dimM;
+    int avgNnzPerRow = C_d->nnz / dimM;
     avgNnzPerRow = avgNnzPerRow > 0 ? avgNnzPerRow : 1;
     int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE);
     dim3 grid(gridx, dimM);
@@ -1242,9 +1205,9 @@ void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d,
     LOG(FATAL) << "parameter B is null!";
   }
 
-  if (B_d2->nnz_s == 0) return;
+  if (B_d->nnz == 0) return;
 
-  int nnz = B_d2->nnz_s;
+  int nnz = B_d->nnz;
   int block = 512;
   int grid = DIVUP(nnz, 512);
   KeSMatrixCsrColumnSum<<<grid, block>>>(
@@ -1273,9 +1236,9 @@ void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d,
     LOG(FATAL) << "parameter A_d is null!";
   }
 
-  if (A_d2->nnz_s == 0) return;
+  if (A_d->nnz == 0) return;
 
-  int nnz = A_d2->nnz_s;
+  int nnz = A_d->nnz;
   int block = 512;
   int grid = DIVUP(nnz, 512);
   KeSMatrixCsrAddBias<<<grid, block>>>(
@@ -1308,9 +1271,9 @@ void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM,
     LOG(FATAL) << "parameter A_d is null!";
   }
 
-  if (A_d2->nnz_s == 0) return;
+  if (A_d->nnz == 0) return;
 
-  int gridX = DIVUP((A_d2->nnz_s / dimM), 512);
+  int gridX = DIVUP((A_d->nnz / dimM), 512);
   gridX = gridX > 0 ? gridX : 1;
   dim3 block(512, 1);
   dim3 grid(gridX, dimM);
diff --git a/paddle/cuda/src/hl_cuda_sparse.cuh b/paddle/cuda/src/hl_cuda_sparse.cuh
index db5c9ce979885a173c8caadc8f6b47836f1771b5..13e89390d68c226734245aa6edcec132362c1a1e 100644
--- a/paddle/cuda/src/hl_cuda_sparse.cuh
+++ b/paddle/cuda/src/hl_cuda_sparse.cuh
@@ -85,6 +85,15 @@ __global__ void KeSMatrixCsc2Dense(real * csc_val,
   C_d[row*dimN + col] = sum;
 }
 
+__device__ __forceinline__
+void _calculate_c(real &c, real sum) {
+  c = sum;
+}
+__device__ __forceinline__
+void _calculate_c(real &c, real sum, real beta) {
+  c = sum + beta * c;
+}
+
 #define CU_CSRMM_N 4
 #define CU_CSRMM_THREAD_X 32
 #define CU_CSRMM_THREAD_Y 32
@@ -191,11 +200,19 @@ __global__ void KeSMatrixCsrMulDense(real *C_d,
   }
 
   C_d += __mul24(index_m, dimN);
-  #pragma unroll
-  for (int n = 0; n < CU_CSRMM_N; n++) {
-    if (index_n < dimN) {
-      C_d[index_n] = alpha*sum[n] + beta*C_d[index_n];
-      index_n += CU_CSRMM_THREAD_X;
+  if (beta == 0.0) {
+    for (int n = 0; n < CU_CSRMM_N; n++) {
+      if (index_n < dimN) {
+        _calculate_c(C_d[index_n], alpha * sum[n]);
+        index_n += CU_CSRMM_THREAD_X;
+      }
+    }
+  } else {
+    for (int n = 0; n < CU_CSRMM_N; n++) {
+      if (index_n < dimN) {
+        _calculate_c(C_d[index_n], alpha * sum[n], beta);
+        index_n += CU_CSRMM_THREAD_X;
+      }
     }
   }
 }
@@ -544,13 +561,22 @@ TEMP_TEST:
   int index_m_c = ibx + idy;
   int index_n_c = blockIdx.y*CU_CSCMM_BLOCK_N_BEST + idx;
   C_d += index_n_c + __mul24(index_m_c, dimN);
 
-  #pragma unroll
-  for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-    if (index_m_c < dimM && index_n_c < dimN) {
-      C_d[0] = A_s[idy+m*32][idx] + beta*C_d[0];
+  if (beta == 0.0) {
+    for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
+      if (index_m_c < dimM && index_n_c < dimN) {
+        _calculate_c(C_d[0], A_s[idy + m * 32][idx]);
+      }
+      index_m_c += 32;
+      C_d += dimN*32;
+    }
+  } else {
+    for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
+      if (index_m_c < dimM && index_n_c < dimN) {
+        _calculate_c(C_d[0], A_s[idy + m * 32][idx], beta);
+      }
+      index_m_c += 32;
+      C_d += dimN*32;
     }
-    index_m_c += 32;
-    C_d += dimN*32;
   }
 }
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index 44ea95c80ab083fb7d4f0e7b89aebe2b3dc764f7..78d15c553021de6bbda210cb782c8a240cc2bf73 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/utils/Util.h"
 #include "paddle/utils/Logging.h"
+#include "paddle/math/SparseMatrix.h"
 
 #include "AddtoLayer.h"
 #include "CosSimLayer.h"
@@ -290,14 +291,30 @@ void Layer::showOutputStats() {
               << " is 0, skip to show the statistics";
     return;
   }
-  real mean = out->getSum() / out->getElementCnt();
-  MatrixPtr outSquare = out->clone();
-  outSquare->copyFrom(*out);
+  MatrixPtr outSquare;
+  if (dynamic_cast<GpuSparseMatrix*>(out.get())) {
+    GpuSparseMatrix *tmp = dynamic_cast<GpuSparseMatrix*>(out.get());
+    outSquare = std::make_shared<GpuSparseMatrix>(
+        tmp->getHeight(), tmp->getWidth(), tmp->getElementCnt(),
+        tmp->getValueType(), tmp->getFormat());
+  } else {
+    outSquare = out->clone();
+  }
+  outSquare->copyFrom(*out, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+
+  real mean = outSquare->getSum() / out->getElementCnt();
+  real min;
+  real max;
   if (dynamic_cast<GpuSparseMatrix*>(outSquare.get())) {
     auto tmpMat = dynamic_cast<GpuSparseMatrix*>(outSquare.get());
+    min = tmpMat->getMin();
+    max = tmpMat->getMax();
     tmpMat->square();
     LOG(INFO) << "show statistics of [none zero values] in sparse matrix";
   } else {
+    min = outSquare->getMin();
+    max = outSquare->getMax();
     outSquare->square();
   }
   real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean;
@@ -306,8 +323,8 @@ void Layer::showOutputStats() {
             << ", "
             << "std=" << std << ", "
-            << "min=" << out->getMin() << ", "
-            << "max=" << out->getMax();
+            << "min=" << min << ", "
+            << "max=" << max;
 }
 
 void Layer::forwardActivation() {
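
For reference, the beta semantics that the new _beta_mul_c helper and the two _calculate_c overloads encode can be summarized with a small host-side C++ sketch. This example is not part of the patch: the function name beta_mul_c_cpu, the use of float for real, and the main() driver are illustrative assumptions only.

#include <algorithm>
#include <cstdio>
#include <vector>

using real = float;  // stand-in for the real typedef used by the CUDA sources

// beta == 0: overwrite C without ever reading it, so C need not hold valid data.
// beta == 1: leave C untouched and let the subsequent product accumulate into it.
// otherwise: scale C in place before accumulation.
static void beta_mul_c_cpu(real *c, int dimM, int dimN, real beta) {
  if (beta == 0.0f) {
    std::fill(c, c + dimM * dimN, 0.0f);
  } else if (beta != 1.0f) {
    for (int i = 0; i < dimM * dimN; ++i) {
      c[i] *= beta;
    }
  }
}

int main() {
  std::vector<real> C = {1.0f, 2.0f, 3.0f, 4.0f};
  beta_mul_c_cpu(C.data(), 2, 2, 0.5f);
  for (real v : C) {
    std::printf("%g ", v);  // prints: 0.5 1 1.5 2
  }
  std::printf("\n");
  return 0;
}

Handling beta == 0 as its own branch means C is never read in that case, which is what allows the updated hl_sparse.h comments to state that C does not have to be a valid input when beta is zero.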