机器未来 / Paddle (forked from PaddlePaddle / Paddle)

Commit b52039bd
Authored Sep 30, 2016 by hedaoyuan; committed by qingqing01 on Sep 30, 2016

some bug fix for sparse matrix (#133)

* some bug fix for sparse matrix
* a minor bug fix

Parent: 0276f15a
Showing 4 changed files with 98 additions and 88 deletions (+98 -88):

  paddle/cuda/include/hl_sparse.h     +4   -0
  paddle/cuda/src/hl_cuda_sparse.cu   +35  -72
  paddle/cuda/src/hl_cuda_sparse.cuh  +37  -11
  paddle/gserver/layers/Layer.cpp     +22  -5
paddle/cuda/include/hl_sparse.h

@@ -223,6 +223,7 @@ extern void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
  * @param[in]  dimK    width of op(A) & height of op(B)
  * @param[in]  alpha   scalar used for multiplication.
  * @param[in]  beta    scalar used for multiplication.
+ *                     If beta is zero, C does not have to be a valid input.
  *
  * @note transb is not support HPPL_OP_T.
  *

@@ -251,6 +252,7 @@ extern void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
  * @param[in]  dimK    width of op(A) & height of op(B)
  * @param[in]  alpha   scalar used for multiplication.
  * @param[in]  beta    scalar used for multiplication.
+ *                     If beta is zero, C does not have to be a valid input.
  *
  * @note transb is not support HPPL_OP_T.
  *

@@ -275,6 +277,7 @@ extern void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
  * @param[in]  dimK    width of op(A) & height of op(B)
  * @param[in]  alpha   scalar used for multiplication.
  * @param[in]  beta    scalar used for multiplication.
+ *                     If beta is zero, C does not have to be a valid input.
  *
  * @note transa is not support HPPL_OP_T.
  *

@@ -327,6 +330,7 @@ extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
  * @param[in]  dimK    width of op(A) & height of op(B)
  * @param[in]  alpha   scalar used for multiplication.
  * @param[in]  beta    scalar used for multiplication.
+ *                     If beta is zero, C does not have to be a valid input.
  *
  *
  * @note transa is not support HPPL_OP_T.
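The sentence added to each doc comment records GEMM-style accumulation semantics: these routines compute C = alpha * op(A) * op(B) + beta * C, so when beta == 0 the previous contents of C are never read and C may point at uninitialized memory. A minimal dense sketch of that contract (illustrative only; gemm_like and its row-major layout are assumptions, not Paddle's API):

    #include <vector>

    // gemm_like: C = alpha * A * B + beta * C for row-major dense matrices.
    void gemm_like(const std::vector<float>& A, const std::vector<float>& B,
                   std::vector<float>& C, int M, int N, int K,
                   float alpha, float beta) {
      for (int m = 0; m < M; ++m) {
        for (int n = 0; n < N; ++n) {
          float sum = 0.0f;
          for (int k = 0; k < K; ++k) sum += A[m * K + k] * B[k * N + n];
          // When beta == 0, C must not be read: uninitialized storage can
          // hold NaN/Inf, and 0 * NaN would still poison the result.
          C[m * N + n] = (beta == 0.0f) ? alpha * sum
                                        : alpha * sum + beta * C[m * N + n];
        }
      }
    }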
paddle/cuda/src/hl_cuda_sparse.cu

@@ -562,6 +562,22 @@ void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
   }
 }
 
+/**
+ * Calculate beta * C, if beta is zero, C does not have to be a valid input.
+ */
+static void _beta_mul_c(real* c, int dimM, int dimN, real beta) {
+  if (beta == 0.0) {
+    hl_gpu_apply_unary_op(unary::Zero<real>(), c, dimM, dimN, dimN);
+  } else {
+    if (beta != 1.0) {
+      hl_gpu_apply_unary_op(
+          unary::mul_scalar<real>(beta), c, dimM, dimN, dimN);
+    }
+  }
+  return;
+}
+
 void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
                              real *B_d, hl_trans_op_t transb,
                              real *C_d,
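Per element, the new helper behaves like the following scalar sketch (a hypothetical model for illustration, not code from the diff): beta == 0 overwrites C with zeros without ever reading it, beta == 1 launches nothing, and any other beta scales C in place.

    // Hypothetical scalar model of _beta_mul_c's per-element effect.
    inline void beta_mul_c_scalar(float& c, float beta) {
      if (beta == 0.0f) {
        c = 0.0f;           // write-only: garbage or NaN in c is harmless
      } else if (beta != 1.0f) {
        c *= beta;          // scale the existing value in place
      }                     // beta == 1: leave c untouched
    }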
@@ -580,15 +596,8 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
   }
 
   if (A_d->nnz == 0) {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(
-          unary::mul_scalar<real>(beta), C_d, dimM, dimN, dimN);
-    } else {
-      return;
-    }
+    _beta_mul_c(C_d, dimM, dimN, beta);
     return;
   }
 
   /* nnz != 0 */
@@ -633,13 +642,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
           beta);
     }
   } else if (HPPL_OP_T == transa) {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(
-          unary::mul_scalar<real>(beta), C_d, dimM, dimN, dimN);
-    }
-
+    _beta_mul_c(C_d, dimM, dimN, beta);
     int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
@@ -699,15 +702,8 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
       << "matrix format error!";
 
   if (B_d->nnz == 0) {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(
-          unary::mul_scalar<real>(beta), C_d, dimM, dimN, dimN);
-    } else {
-      return;
-    }
+    _beta_mul_c(C_d, dimM, dimN, beta);
     return;
   }
 
   /* nnz != 0 */
@@ -750,13 +746,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
           beta);
     }
   } else if (transb == HPPL_OP_T) {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(
-          unary::mul_scalar<real>(beta), C_d, dimM, dimN, dimN);
-    }
+    _beta_mul_c(C_d, dimM, dimN, beta);
     int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
     int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
     dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
@@ -813,15 +803,8 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
       << "matrix format error!";
 
   if (B_d->nnz == 0) {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(
-          unary::mul_scalar<real>(beta), C_d, dimM, dimN, dimN);
-    } else {
-      return;
-    }
+    _beta_mul_c(C_d, dimM, dimN, beta);
     return;
   }
 
   /* nnz != 0 */
@@ -833,14 +816,7 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
   }
 
   if (transb == HPPL_OP_N) {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(
-          unary::mul_scalar<real>(beta), C_d, dimM, dimN, dimN);
-    }
+    _beta_mul_c(C_d, dimM, dimN, beta);
     int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
     int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
     dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
@@ -925,15 +901,8 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
   }
 
   if (A_d->nnz == 0) {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(
-          unary::mul_scalar<real>(beta), C_d, dimM, dimN, dimN);
-    } else {
-      return;
-    }
+    _beta_mul_c(C_d, dimM, dimN, beta);
     return;
   }
 
   /* nnz != 0 */
@@ -945,13 +914,7 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
   }
 
   if (HPPL_OP_N == transa) {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(
-          unary::mul_scalar<real>(beta), C_d, dimM, dimN, dimN);
-    }
+    _beta_mul_c(C_d, dimM, dimN, beta);
     int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
     int blocksY = (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K;
@@ -1113,7 +1076,7 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
   CHECK(!transA) << "Not supported A is trans and B is not trans!";
 
   dim3 block(CU_BLOCK_SIZE, 1);
-  int avgNnzPerRow = C_d2->nnz_s / dimM;
+  int avgNnzPerRow = C_d->nnz / dimM;
   avgNnzPerRow = avgNnzPerRow > 0 ? avgNnzPerRow : 1;
   int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE);
   dim3 grid(gridx, dimM);
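Grid sizing here divides the average non-zeros per row by the block size, rounding up. DIVUP is presumably the usual ceiling-division macro; a sketch of the assumed definition:

    // Assumed definition of the ceiling-division helper used above.
    #define DIVUP(x, y) (((x) + (y) - 1) / (y))
    // e.g. DIVUP(70, 64) == 2, so 70 average nnz per row -> 2 blocks in x.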
@@ -1242,9 +1205,9 @@ void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d,
     LOG(FATAL) << "parameter B is null!";
   }
 
-  if (B_d2->nnz_s == 0) return;
+  if (B_d->nnz == 0) return;
 
-  int nnz = B_d2->nnz_s;
+  int nnz = B_d->nnz;
   int block = 512;
   int grid = DIVUP(nnz, 512);
   KeSMatrixCsrColumnSum<<<grid, block, 0, STREAM_DEFAULT>>>(
@@ -1273,9 +1236,9 @@ void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d,
     LOG(FATAL) << "parameter A_d is null!";
   }
 
-  if (A_d2->nnz_s == 0) return;
+  if (A_d->nnz == 0) return;
 
-  int nnz = A_d2->nnz_s;
+  int nnz = A_d->nnz;
   int block = 512;
   int grid = DIVUP(nnz, 512);
   KeSMatrixCsrAddBias<<<grid, block, 0, STREAM_DEFAULT>>>(
@@ -1308,9 +1271,9 @@ void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM,
     LOG(FATAL) << "parameter A_d is null!";
   }
 
-  if (A_d2->nnz_s == 0) return;
+  if (A_d->nnz == 0) return;
 
-  int gridX = DIVUP((A_d2->nnz_s / dimM), 512);
+  int gridX = DIVUP((A_d->nnz / dimM), 512);
   gridX = gridX > 0 ? gridX : 1;
   dim3 block(512, 1);
   dim3 grid(gridX, dimM);
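The second group of fixes in this file replaces reads of the format-specific struct's nnz_s field (apparently an allocated-size value, reached through the casted X_d2 pointer) with the wrapper's nnz field (the current non-zero count), so the early-exit checks and kernel grid sizes track the matrix's actual contents rather than its capacity.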
paddle/cuda/src/hl_cuda_sparse.cuh

@@ -85,6 +85,15 @@ __global__ void KeSMatrixCsc2Dense(real * csc_val,
   C_d[row * dimN + col] = sum;
 }
 
+__device__ __forceinline__
+void _calculate_c(real &c, real sum) {
+  c = sum;
+}
+
+__device__ __forceinline__
+void _calculate_c(real &c, real sum, real beta) {
+  c = sum + beta * c;
+}
+
 #define CU_CSRMM_N 4
 #define CU_CSRMM_THREAD_X 32
 #define CU_CSRMM_THREAD_Y 32
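The two overloads separate the write-only case from the accumulating case: the two-argument form stores the result without ever loading C (safe when beta == 0 and C is uninitialized), while the three-argument form does the usual read-modify-write c = sum + beta * c. The kernels below branch on beta once, outside their inner loops, and call the matching overload, which also drops the per-element beta * C term from the beta == 0 path.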
@@ -191,11 +200,19 @@ __global__ void KeSMatrixCsrMulDense(real *C_d,
   }
 
   C_d += __mul24(index_m, dimN);
-  #pragma unroll
-  for (int n = 0; n < CU_CSRMM_N; n++) {
-    if (index_n < dimN) {
-      C_d[index_n] = alpha * sum[n] + beta * C_d[index_n];
-      index_n += CU_CSRMM_THREAD_X;
+  if (beta == 0.0) {
+    for (int n = 0; n < CU_CSRMM_N; n++) {
+      if (index_n < dimN) {
+        _calculate_c(C_d[index_n], alpha * sum[n]);
+        index_n += CU_CSRMM_THREAD_X;
+      }
+    }
+  } else {
+    for (int n = 0; n < CU_CSRMM_N; n++) {
+      if (index_n < dimN) {
+        _calculate_c(C_d[index_n], alpha * sum[n], beta);
+        index_n += CU_CSRMM_THREAD_X;
+      }
     }
   }
 }
@@ -544,13 +561,22 @@ TEMP_TEST:
   int index_m_c = ibx + idy;
   int index_n_c = blockIdx.y * CU_CSCMM_BLOCK_N_BEST + idx;
   C_d += index_n_c + __mul24(index_m_c, dimN);
-  #pragma unroll
-  for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-    if (index_m_c < dimM && index_n_c < dimN) {
-      C_d[0] = A_s[idy + m * 32][idx] + beta * C_d[0];
+  if (beta == 0.0) {
+    for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
+      if (index_m_c < dimM && index_n_c < dimN) {
+        _calculate_c(C_d[0], A_s[idy + m * 32][idx]);
+      }
+      index_m_c += 32;
+      C_d += dimN * 32;
+    }
+  } else {
+    for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
+      if (index_m_c < dimM && index_n_c < dimN) {
+        _calculate_c(C_d[0], A_s[idy + m * 32][idx], beta);
+      }
+      index_m_c += 32;
+      C_d += dimN * 32;
     }
-    index_m_c += 32;
-    C_d += dimN * 32;
   }
paddle/gserver/layers/Layer.cpp

@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/utils/Util.h"
 #include "paddle/utils/Logging.h"
+#include "paddle/math/SparseMatrix.h"
 
 #include "AddtoLayer.h"
 #include "CosSimLayer.h"

@@ -290,14 +291,30 @@ void Layer::showOutputStats() {
         << " is 0, skip to show the statistics";
     return;
   }
-  real mean = out->getSum() / out->getElementCnt();
-  MatrixPtr outSquare = out->clone();
-  outSquare->copyFrom(*out);
+  MatrixPtr outSquare;
+  if (dynamic_cast<GpuSparseMatrix*>(out.get())) {
+    GpuSparseMatrix *tmp = dynamic_cast<GpuSparseMatrix*>(out.get());
+    outSquare = std::make_shared<CpuSparseMatrix>(tmp->getHeight(),
+                                                  tmp->getWidth(),
+                                                  tmp->getElementCnt(),
+                                                  tmp->getValueType(),
+                                                  tmp->getFormat());
+  } else {
+    outSquare = out->clone();
+  }
+  outSquare->copyFrom(*out, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+
+  real mean = outSquare->getSum() / out->getElementCnt();
+  real min;
+  real max;
+  if (dynamic_cast<CpuSparseMatrix*>(outSquare.get())) {
+    auto tmpMat = dynamic_cast<CpuSparseMatrix*>(outSquare.get());
+    min = tmpMat->getMin();
+    max = tmpMat->getMax();
+    tmpMat->square();
+    LOG(INFO) << "show statistics of [none zero values] in sparse matrix";
+  } else {
+    min = outSquare->getMin();
+    max = outSquare->getMax();
+    outSquare->square();
+  }
   real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean;

@@ -306,8 +323,8 @@ void Layer::showOutputStats() {
         << ", "
         << "std=" << std << ", "
-        << "min=" << out->getMin() << ", "
-        << "max=" << out->getMax();
+        << "min=" << min << ", "
+        << "max=" << max;
 }
 
 void Layer::forwardActivation() {
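A side note on the statistics themselves: the value logged as "std" is computed as sum(x^2)/n - mean^2, i.e. the biased variance E[x^2] - (E[x])^2 rather than its square root, and for sparse output the min/max (and the squared sum) now describe only the stored non-zero values copied into the CpuSparseMatrix, as the added LOG(INFO) line states.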