机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Commit 1d4fa243
Authored Aug 04, 2017 by liaogang

ClangFormat for proto and cuda

Parent 6512893b
Showing 39 changed files with 3,661 additions and 2,920 deletions.
.pre-commit-config.yaml  +1 -1
paddle/cuda/src/hl_batch_transpose.cu  +7 -9
paddle/cuda/src/hl_cuda_aggregate.cu  +61 -101
paddle/cuda/src/hl_cuda_cnn.cu  +275 -134
paddle/cuda/src/hl_cuda_lstm.cu  +331 -159
paddle/cuda/src/hl_cuda_matrix.cu  +147 -196
paddle/cuda/src/hl_cuda_sequence.cu  +96 -88
paddle/cuda/src/hl_cuda_sparse.cu  +475 -509
paddle/cuda/src/hl_perturbation_util.cu  +104 -45
paddle/cuda/src/hl_table_apply.cu  +35 -33
paddle/cuda/src/hl_top_k.cu  +127 -114
paddle/framework/attr_type.proto  +7 -7
paddle/framework/op_desc.proto  +17 -17
paddle/framework/op_proto.proto  +72 -70
paddle/function/ContextProjectionOpGpu.cu  +70 -56
paddle/function/CosSimOpGpu.cu  +34 -26
paddle/function/CropOpGpu.cu  +59 -25
paddle/function/CrossMapNormalOpGpu.cu  +46 -25
paddle/function/DepthwiseConvOpGpu.cu  +253 -218
paddle/function/Im2ColOpGpu.cu  +150 -106
paddle/function/MulOpGpu.cu  +1 -1
paddle/function/PadOpGpu.cu  +49 -15
paddle/function/RowConvOpGpu.cu  +87 -68
paddle/gserver/layers/GruCompute.cu  +4 -3
paddle/gserver/layers/LstmCompute.cu  +38 -17
paddle/math/BaseMatrix.cu  +619 -366
paddle/math/TrainingAlgorithmOp.cu  +32 -33
paddle/math/tests/test_Tensor.cu  +167 -170
paddle/math/tests/test_lazyAssign.cu  +40 -34
paddle/operators/softmax_op.cu  +2 -1
paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto  +0 -0
proto/DataConfig.proto  +27 -26
proto/DataFormat.proto  +22 -16
proto/ModelConfig.proto  +57 -57
proto/OptimizerConfig.proto  +36 -36
proto/ParameterConfig.proto  +23 -22
proto/ParameterServerConfig.proto  +10 -13
proto/ParameterService.proto  +37 -64
proto/TrainerConfig.proto  +43 -39
.pre-commit-config.yaml

@@ -24,7 +24,7 @@
       description: Format files with ClangFormat.
       entry: clang-format -i
       language: system
-      files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
+      files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
     sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
     hooks:
paddle/cuda/src/hl_batch_transpose.cu

@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_batch_transpose.h"
 #include "hl_base.h"
+#include "hl_batch_transpose.h"

 const int TILE_DIM = 64;
 const int BLOCK_ROWS = 16;

 // No bank-conflict transpose for a batch of data.
 __global__ void batchTransposeNoBankConflicts(
     real* odata, const real* idata, int numSamples, int width, int height) {
   __shared__ float tile[TILE_DIM][TILE_DIM + 1];
   const int x = blockIdx.x * TILE_DIM + threadIdx.x;

@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
                newX] = tile[threadIdx.x][j];
   }
 }

 void batchTranspose(
     const real* input, real* output, int width, int height, int batchSize) {
   dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
   dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
   batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
       output, input, batchSize, width, height);
   CHECK_SYNC("batchTranspose failed!");
 }
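The padded shared-memory tile in the kernel above (tile[TILE_DIM][TILE_DIM + 1]) is the standard trick behind its "No bank-conflict" comment: the extra column shifts each tile row into a different shared-memory bank, so the column-order reads during the write-out phase do not serialize. The following is a minimal single-matrix sketch of that idea, not Paddle's batched kernel; the TILE/ROWS constants, the kernel name, and the plain float element type are invented for illustration.

// Transpose one width x height matrix using a padded shared-memory tile.
// The "+ 1" column keeps rows of `tile` in different banks, so reading the
// tile column-wise below does not cause shared-memory bank conflicts.
constexpr int TILE = 32;  // tile edge (hypothetical; Paddle uses TILE_DIM = 64)
constexpr int ROWS = 8;   // rows covered per loop iteration by one block

__global__ void transposeNoBankConflicts(float* out, const float* in,
                                         int width, int height) {
  __shared__ float tile[TILE][TILE + 1];

  int x = blockIdx.x * TILE + threadIdx.x;  // column in the input
  int y = blockIdx.y * TILE + threadIdx.y;  // row in the input
  for (int j = 0; j < TILE; j += ROWS) {
    if (x < width && y + j < height) {
      tile[threadIdx.y + j][threadIdx.x] = in[(y + j) * width + x];
    }
  }
  __syncthreads();

  // Swap block coordinates for the output and read the tile transposed.
  x = blockIdx.y * TILE + threadIdx.x;  // column in the output
  y = blockIdx.x * TILE + threadIdx.y;  // row in the output
  for (int j = 0; j < TILE; j += ROWS) {
    if (x < height && y + j < width) {
      out[(y + j) * height + x] = tile[threadIdx.x][threadIdx.y + j];
    }
  }
}

A launch would use a (TILE, ROWS) thread block and a grid of ceil(width / TILE) by ceil(height / TILE) blocks, analogous to the DIVUP-based dimGrid in batchTranspose above; the batched version simply adds the sample index as the third grid dimension.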
paddle/cuda/src/hl_cuda_aggregate.cu

@@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "hl_aggregate.h"
 #include "hl_base.h"
 #include "hl_cuda.h"
 #include "hl_cuda.ph"
-#include "hl_aggregate.h"
-#include "hl_thread.ph"
 #include "hl_matrix_base.cuh"
+#include "hl_thread.ph"
 #include "paddle/utils/Logging.h"

 /**
  * @brief   matrix row operator.
  */
 template <class Agg, int blockSize>
 __global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
   __shared__ real sum_s[blockSize];
   int cnt = (dimN + blockSize - 1) / blockSize;
   int rowId = blockIdx.x + blockIdx.y * gridDim.x;
   int index = rowId * dimN;
   int tid = threadIdx.x;
   int lmt = tid;

@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg,
   sum_s[tid] = tmp;
   __syncthreads();
   for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
     }

@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg,
 }

 template <class Agg>
 void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
   int blocksX = dimM;
   int blocksY = 1;
   dim3 threads(128, 1);
   dim3 grid(blocksX, blocksY);
   KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
       agg, A_d, C_d, dimN);
 }

 void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_sum failed");
 }

@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_max failed");
 }

@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_min failed");
 }

 /**
  * @brief   matrix column operator.
  */
 template <class Agg>
 __global__ void KeMatrixColumnOp(Agg agg, real *E, real *Sum, int dimM, int dimN) {
   int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   real tmp = agg.init();
   if (rowIdx < dimN) {

@@ -127,15 +104,12 @@ __global__ void KeMatrixColumnOp(Agg agg,
   }
 }

 template <class Agg, int blockDimX, int blockDimY>
 __global__ void KeMatrixColumnOp_S(Agg agg, real *E, real *Sum, int dimM, int dimN) {
   __shared__ real _sum[blockDimX * blockDimY];
   int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   int index = threadIdx.y;
   real tmp = agg.init();
   if (rowIdx < dimN) {

@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
       index += blockDimY;
     }
   }
   _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
   __syncthreads();
   if (rowIdx < dimN) {
     if (threadIdx.y == 0) {
       real tmp = agg.init();
       for (int i = 0; i < blockDimY; i++) {
         tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
       }
       Sum[rowIdx] = tmp;
     }

@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
 }

 template <class Agg>
 void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
   if (dimN >= 8192) {
     int blocksX = (dimN + 128 - 1) / 128;
     int blocksY = 1;
     dim3 threads(128, 1);
     dim3 grid(blocksX, blocksY);
     KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
         agg, A_d, C_d, dimM, dimN);
   } else {
     int blocksX = (dimN + 32 - 1) / 32;
     int blocksY = 1;
     dim3 threads(32, 32);
     dim3 grid(blocksX, blocksY);
     KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
         agg, A_d, C_d, dimM, dimN);
   }
   return;

@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_column_sum failed");
 }

@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_column_max failed");
 }

@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_column_min failed");
 }

@@ -226,16 +184,16 @@ template <int blockSize>
 __global__ void KeVectorSum(real *E, real *Sum, int dimM) {
   __shared__ double sum_s[blockSize];
   int tid = threadIdx.x;
   int index = blockIdx.y * blockDim.x + threadIdx.x;
   sum_s[tid] = 0.0f;
   while (index < dimM) {
     sum_s[tid] += E[index];
     index += blockDim.x * gridDim.y;
   }
   __syncthreads();
   for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] += sum_s[tid + stride];
     }

@@ -259,38 +217,39 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
   dim3 threads(blockSize, 1);
   dim3 grid(blocksX, blocksY);

   struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
   hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {}
+  while (!hl_cuda_event_is_ready(hl_event)) {
+  }

   KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
       A_d, t_resource.gpu_mem, dimM);
   KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
       t_resource.gpu_mem, t_resource.cpu_mem, 128);

   hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
   hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);

   hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   cudaError_t err = (cudaError_t)hl_get_device_last_error();
   CHECK_EQ(cudaSuccess, err) << "CUDA error: "
                              << hl_get_device_error_string((size_t)err);
 }

 template <int blockSize>
 __global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) {
   __shared__ double sum_s[blockSize];
   int tid = threadIdx.x;
   int index = blockIdx.y * blockDim.x + threadIdx.x;
   sum_s[tid] = 0.0f;
   while (index < dimM) {
     sum_s[tid] += abs(E[index]);
     index += blockDim.x * gridDim.y;
   }
   __syncthreads();
   for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] += sum_s[tid + stride];
     }

@@ -314,20 +273,21 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
   dim3 threads(blockSize, 1);
   dim3 grid(blocksX, blocksY);

   struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
   hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {}
+  while (!hl_cuda_event_is_ready(hl_event)) {
+  }

   KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
       A_d, t_resource.gpu_mem, dimM);
   KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
       t_resource.gpu_mem, t_resource.cpu_mem, 128);

   hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
   hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);

   hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   cudaError_t err = (cudaError_t)hl_get_device_last_error();
   CHECK_EQ(cudaSuccess, err) << "CUDA error: "
                              << hl_get_device_error_string((size_t)err);
 }
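KeMatrixRowOp, KeVectorSum, and KeVectorAbsSum above all follow the same shared-memory tree reduction: each thread first accumulates a strided slice of the data, then the block repeatedly folds sum_s in half with the stride = blockSize / 2, stride /= 2 loop until element 0 holds the block's total. Below is a self-contained sketch of that pattern with an invented kernel name (blockSum), plain float instead of Paddle's real, and a power-of-two blockSize assumed.

// One partial sum per block: strided accumulation followed by the same
// halving loop used by the aggregate kernels above (blockSize must be a
// power of two so the halving reaches 1).
template <int blockSize>
__global__ void blockSum(const float* data, float* partial, int n) {
  __shared__ float sum_s[blockSize];
  int tid = threadIdx.x;
  int index = blockIdx.x * blockSize + tid;

  float acc = 0.0f;
  while (index < n) {  // grid-stride loop over the input
    acc += data[index];
    index += blockSize * gridDim.x;
  }
  sum_s[tid] = acc;
  __syncthreads();

  for (int stride = blockSize / 2; stride > 0; stride /= 2) {
    if (tid < stride) {
      sum_s[tid] += sum_s[tid + stride];  // fold the upper half into the lower
    }
    __syncthreads();
  }
  if (tid == 0) {
    partial[blockIdx.x] = sum_s[0];  // one value per block
  }
}

hl_vector_sum applies the same idea in two passes: the first, gridded launch of KeVectorSum writes one partial sum per block into t_resource.gpu_mem, and the second single-block launch reduces those 128 partials to the final scalar that is then copied back with hl_memcpy_async.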
paddle/cuda/src/hl_cuda_cnn.cu

@@ -12,21 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

 #include <float.h>
 #include "hl_base.h"
 #include "hl_cnn.h"
 #include "hl_device_functions.cuh"

 __global__ void KeMaxPoolForward(const int nthreads, const real* inputData,
                                  const int channels, const int height,
                                  const int width, const int pooledH,
                                  const int pooledW, const int ksizeW,
                                  const int ksizeH, const int strideH,
                                  const int strideW, const int offsetH,
                                  const int offsetW, real* tgtData,
                                  const int tgtStride) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int pw = index % pooledW;
     int ph = (index / pooledW) % pooledH;

@@ -46,44 +52,70 @@ __global__ void KeMaxPoolForward(const int nthreads, const real* inputData,
         maxval = inputData[h * width + w];
       }
     }
     int tgtIndex =
         index % (pooledW * pooledH * channels) + frameNum * tgtStride;
     tgtData[tgtIndex] = maxval;
   }
 }

 void hl_maxpool_forward(const int frameCnt, const real* inputData,
                         const int channels, const int height, const int width,
                         const int pooledH, const int pooledW, const int sizeX,
                         const int sizeY, const int strideH, const int strideW,
                         const int paddingH, const int paddingW,
                         real* tgtData, const int tgtStride) {
   int num_kernels = pooledH * pooledW * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
   dim3 threads(1024, 1);
   dim3 grid(blocks, 1);

   KeMaxPoolForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
       num_kernels, inputData, channels, height, width, pooledH, pooledW,
       sizeX, sizeY, strideH, strideW, paddingH, paddingW, tgtData, tgtStride);
   CHECK_SYNC("hl_maxpool_forward failed");
 }

 __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData,
                                   const real* outData, const real* outGrad,
                                   const int channels, const int height,
                                   const int width, const int pooledH,
                                   const int pooledW, const int sizeX,
                                   const int sizeY, const int strideH,
                                   const int strideW, const int padH,
                                   const int padW, real scaleA, real scaleB,
                                   real* targetGrad, const int outStride) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     // find out the local index
     // find out the local offset

@@ -107,43 +139,69 @@ __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData,
       }
     }
     targetGrad[index] = scaleB * targetGrad[index] + scaleA * gradient;
   }
 }

 void hl_maxpool_backward(const int frameCnt, const real* inputData,
                          const real* outData, const real* outGrad,
                          const int channels, const int height, const int width,
                          const int pooledH, const int pooledW, const int sizeX,
                          const int sizeY, const int strideH, const int strideW,
                          const int paddingH, const int paddingW,
                          real scaleA, real scaleB,
                          real* targetGrad, const int outStride) {
   int num_kernels = height * width * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;

   KeMaxPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
       num_kernels, inputData, outData, outGrad, channels, height, width,
       pooledH, pooledW, sizeX, sizeY, strideH, strideW, paddingH, paddingW,
       scaleA, scaleB, targetGrad, outStride);
   CHECK_SYNC("hl_maxpool_backward");
 }

 __global__ void KeAvgPoolForward(const int nthreads, const real* inputData,
                                  const int channels, const int height,
                                  const int width, const int pooledH,
                                  const int pooledW, const int sizeX,
                                  const int sizeY, const int strideH,
                                  const int strideW, const int padH,
                                  const int padW, real* tgtData,
                                  const int tgtStride) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int pw = index % pooledW;

@@ -168,39 +226,64 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData,
         aveval += inputData[h * width + w];
       }
     }
     int tgtIndex =
         index % (pooledW * pooledH * channels) + frameNum * tgtStride;
     tgtData[tgtIndex] = aveval / pool_size;
   }
 }

 void hl_avgpool_forward(const int frameCnt, const real* inputData,
                         const int channels, const int height, const int width,
                         const int pooledH, const int pooledW, const int sizeX,
                         const int sizeY, const int strideH, const int strideW,
                         const int paddingH, const int paddingW,
                         real* tgtData, const int tgtStride) {
   int num_kernels = pooledH * pooledW * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
   KeAvgPoolForward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
       num_kernels, inputData, channels, height, width, pooledH, pooledW,
       sizeX, sizeY, strideH, strideW, paddingH, paddingW, tgtData, tgtStride);
   CHECK_SYNC("hl_avgpool_forward failed");
 }

 __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad,
                                   const int channels, const int height,
                                   const int width, const int pooledH,
                                   const int pooledW, const int sizeX,
                                   const int sizeY, const int strideH,
                                   const int strideW, const int padH,
                                   const int padW, real scaleA, real scaleB,
                                   real* tgtGrad, const int outStride) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int offsetW = index % width + padW;

@@ -215,7 +298,6 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad,
     real gradient = 0;
     outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
     for (int ph = phstart; ph < phend; ++ph) {
       for (int pw = pwstart; pw < pwend; ++pw) {
         // figure out the pooling size

@@ -224,32 +306,50 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad,
         int hend = min(hstart + sizeY, height + padH);
         int wend = min(wstart + sizeX, width + padW);
         int poolsize = (hend - hstart) * (wend - wstart);
         gradient += outGrad[ph * pooledW + pw] / poolsize;
       }
     }
     tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient;
   }
 }

 void hl_avgpool_backward(const int frameCnt, const real* outGrad,
                          const int channels, const int height, const int width,
                          const int pooledH, const int pooledW, const int sizeX,
                          const int sizeY, const int strideH, const int strideW,
                          const int paddingH, const int paddingW,
                          real scaleA, real scaleB,
                          real* backGrad, const int outStride) {
   int num_kernels = height * width * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;

   KeAvgPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
       num_kernels, outGrad, channels, height, width, pooledH, pooledW,
       sizeX, sizeY, strideH, strideW, paddingH, paddingW,
       scaleA, scaleB, backGrad, outStride);
   CHECK_SYNC("hl_avgpool_backward failed");
 }

@@ -266,7 +366,7 @@ __global__ void KeBilinearInterpFw(const real* in,
                                    const size_t numChannels,
                                    const real ratioH,
                                    const real ratioW) {
   int nthreads = outputH * outputW;
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   if (tid < nthreads) {
     int outIdH = tid / outputW;

@@ -287,13 +387,14 @@ __global__ void KeBilinearInterpFw(const real* in,
     real w1lambda = ratioW * outImgIdx - inImgIdx;
     real w2lambda = 1.f - w1lambda;

     const real* inPos = &in[outIdH * inputW + channelId * inImgSize +
                             inImgIdy * inImgW + inImgIdx];

     // bilinear interpolation
     out[outIdH * outputW + outIdW] =
         h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) +
         h1lambda * (w2lambda * inPos[hId * inImgW] +
                     w1lambda * inPos[hId * inImgW + wId]);
   }
 }

@@ -313,9 +414,19 @@ void hl_bilinear_forward(const real* inData,
   int threadNum = outputH * outputW;
   int blocks = (threadNum + 1024 - 1) / 1024;

   KeBilinearInterpFw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
       inData, inImgH, inImgW, inputH, inputW, outData, outImgH, outImgW,
       outputH, outputW, numChannels, ratioH, ratioW);
   CHECK_SYNC("hl_bilinear_forward failed");
 }

@@ -353,13 +464,15 @@ __global__ void KeBilinearInterpBw(real* in,
     real w1lambda = ratioW * outImgIdx - inImgIdx;
     real w2lambda = 1.f - w1lambda;

     real* inPos = &in[outIdH * inputW + channelId * inImgSize +
                       inImgIdy * inImgW + inImgIdx];
     const real* outPos = &out[outIdH * outputW + outIdW];
     paddle::paddleAtomicAdd(&inPos[0], h2lambda * w2lambda * outPos[0]);
     paddle::paddleAtomicAdd(&inPos[wId], h2lambda * w1lambda * outPos[0]);
     paddle::paddleAtomicAdd(&inPos[hId * inImgW],
                             h1lambda * w2lambda * outPos[0]);
     paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId],
                             h1lambda * w1lambda * outPos[0]);
   }
 }

@@ -379,22 +492,37 @@ void hl_bilinear_backward(real* inGrad,
   int threadNum = outputH * outputW;
   int blocks = (threadNum + 1024 - 1) / 1024;

   KeBilinearInterpBw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
       inGrad, inImgH, inImgW, inputH, inputW, outGrad, outImgH, outImgW,
       outputH, outputW, numChannels, ratioH, ratioW);
   CHECK_SYNC("hl_bilinear_backward failed");
 }

 __global__ void maxoutFpCompute(size_t nthreads, const real* inData,
                                 real* outData, int* idData,
                                 size_t size, size_t featLen, size_t groups) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     size_t batch_idx = index / size;
     size_t i = index % size;
     size_t channel_idx = i / featLen;
     size_t feat_idx = i % featLen;
     size_t data_idx =
         (batch_idx * size + channel_idx * featLen) * groups + feat_idx;
     real max = inData[data_idx];
     int maxId = 0;
     for (size_t g = 1; g < groups; ++g) {

@@ -409,37 +537,50 @@ __global__ void maxoutFpCompute(size_t nthreads, const real * inData,
     }
   }
 }

 void hl_maxout_forward(const real* inData, real* outData, int* idData,
                        size_t batchSize, size_t size,
                        size_t featLen, size_t groups) {
   int num_kernels = size * batchSize;
   int blocks = (num_kernels + 1024 - 1) / 1024;
   maxoutFpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
       num_kernels, inData, outData, idData, size, featLen, groups);
   CHECK_SYNC("hl_maxout_forward failed");
 }

 __global__ void maxoutBpCompute(size_t nthreads, real* inGrad,
                                 const real* outGrad, const int* idData,
                                 size_t size, size_t featLen, size_t groups) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     size_t batch_idx = index / size;
     size_t i = index % size;
     size_t channel_idx = i / featLen;
     size_t feat_idx = i % featLen;
     size_t newIndex = batch_idx * size;
     size_t gradIdx =
         (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx;
     (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i];
   }
 }

 void hl_maxout_backward(real* inGrad, const real* outGrad, const int* idData,
                         size_t batchSize, size_t size,
                         size_t featLen, size_t groups) {
   int num_kernels = size * batchSize;
   int blocks = (num_kernels + 1024 - 1) / 1024;
   maxoutBpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
       num_kernels, inGrad, outGrad, idData, size, featLen, groups);
   CHECK_SYNC("hl_maxout_backward failed");
 }
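Each pooling wrapper above launches one thread per output element (num_kernels = pooledH * pooledW * channels * frameCnt) and lets the kernel unpack its flat index with a chain of % and /, as in int pw = index % pooledW; int ph = (index / pooledW) % pooledH;. A small host-side sketch of that decomposition, with a hypothetical helper name and struct, shows the full mapping:

// Recover (pw, ph, channel, frame) from a flat pooling-output index laid out
// as frame-major, then channel, then ph, then pw (fastest varying).
struct PoolCoord {
  int pw;     // pooled column
  int ph;     // pooled row
  int c;      // channel
  int frame;  // sample in the batch
};

inline PoolCoord unpackPoolIndex(int index, int pooledW, int pooledH,
                                 int channels) {
  PoolCoord p;
  p.pw = index % pooledW;
  p.ph = (index / pooledW) % pooledH;
  p.c = (index / (pooledW * pooledH)) % channels;
  p.frame = index / (pooledW * pooledH * channels);
  return p;
}

The write offset in the kernels follows the same layout: index % (pooledW * pooledH * channels) locates the element inside one sample, and frameNum * tgtStride moves it to that sample's slice of tgtData.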
paddle/cuda/src/hl_cuda_lstm.cu
浏览文件 @
1d4fa243
...
@@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "hl_activation_functions.h"
#include "hl_base.h"
#include "hl_base.h"
#include "hl_cuda_cublas.h"
#include "hl_cuda_cublas.h"
#include "hl_device_functions.cuh"
#include "hl_device_functions.cuh"
#include "hl_activation_functions.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Logging.h"
typedef
hppl
::
Active
<
real
>::
forward
t_forward
;
typedef
hppl
::
Active
<
real
>::
forward
t_forward
;
typedef
hppl
::
Active
<
real
>::
backward
t_backward
;
typedef
hppl
::
Active
<
real
>::
backward
t_backward
;
bool
hl_lstm_sequence_parallel
(
int
frameSize
)
{
bool
hl_lstm_sequence_parallel
(
int
frameSize
)
{
...
@@ -42,9 +41,9 @@ public:
...
@@ -42,9 +41,9 @@ public:
value_
+=
(
start
+
length
-
1
)
*
frameSize
+
idx
;
value_
+=
(
start
+
length
-
1
)
*
frameSize
+
idx
;
}
}
}
}
__device__
inline
real
*
getPtr
()
const
{
return
value_
;
}
__device__
inline
real
*
getPtr
()
const
{
return
value_
;
}
__device__
inline
real
getValue
()
{
return
*
value_
;
}
__device__
inline
real
getValue
()
{
return
*
value_
;
}
__device__
inline
void
setValue
(
real
value
)
{
*
value_
=
value
;
}
__device__
inline
void
setValue
(
real
value
)
{
*
value_
=
value
;
}
template
<
int
reversed
,
int
frameSize
>
template
<
int
reversed
,
int
frameSize
>
__device__
inline
void
nextFrame
()
{
__device__
inline
void
nextFrame
()
{
if
(
reversed
==
0
)
{
if
(
reversed
==
0
)
{
...
@@ -55,28 +54,25 @@ public:
...
@@ -55,28 +54,25 @@ public:
}
}
};
};
__device__
__forceinline__
__device__
__forceinline__
void
ptx_sync
(
const
int
id
,
const
int
barriers
)
{
void
ptx_sync
(
const
int
id
,
const
int
barriers
)
{
asm
volatile
(
"bar.sync %0, %1;"
:
:
"r"
(
id
),
"r"
(
barriers
)
:
"memory"
);
asm
volatile
(
"bar.sync %0, %1;"
:
:
"r"
(
id
),
"r"
(
barriers
)
:
"memory"
);
}
}
__device__
__forceinline__
__device__
__forceinline__
void
ptx_arrive
(
const
int
id
,
const
int
barriers
)
{
void
ptx_arrive
(
const
int
id
,
const
int
barriers
)
{
asm
volatile
(
"bar.arrive %0, %1;"
:
:
"r"
(
id
),
"r"
(
barriers
)
:
"memory"
);
asm
volatile
(
"bar.arrive %0, %1;"
:
:
"r"
(
id
),
"r"
(
barriers
)
:
"memory"
);
}
}
template
<
int
valueSize
,
int
frameSize
>
template
<
int
valueSize
,
int
frameSize
>
__device__
__forceinline__
real
__device__
__forceinline__
real
forward_sequence
(
real
value
,
forward_sequence
(
real
value
,
real
*
shValue
,
real
*
shValue
,
real
*
state
,
real
*
state
,
real
*
preOutput
,
real
*
preOutput
,
real
*
output
,
real
*
output
,
real
check
,
real
check
,
int
index
,
int
index
,
t_forward
activeNode
,
t_forward
activeNode
,
t_forward
activeGate
,
t_forward
activeGate
,
t_forward
activeState
)
{
t_forward
activeState
)
{
real
out
;
real
out
;
real
prevOut
;
real
prevOut
;
real
state_r
;
real
state_r
;
...
@@ -112,17 +108,20 @@ forward_sequence(real value,
...
@@ -112,17 +108,20 @@ forward_sequence(real value,
if
(
idy
==
0
)
{
if
(
idy
==
0
)
{
ptx_sync
(
2
,
frameSize
*
2
);
ptx_sync
(
2
,
frameSize
*
2
);
prevOut
=
state
[
idx
];
prevOut
=
state
[
idx
];
prevOut
=
activeState
(
prevOut
);
prevOut
=
activeState
(
prevOut
);
preOutput
[
idx
]
=
prevOut
;
preOutput
[
idx
]
=
prevOut
;
ptx_arrive
(
3
,
frameSize
*
2
);
ptx_arrive
(
3
,
frameSize
*
2
);
}
}
return
value
;
return
value
;
}
}
#define OUTPUT_BARRIER_ID 10
#define OUTPUT_BARRIER_ID 10
#define OUTPUT_BARRIER_ID2 11
#define OUTPUT_BARRIER_ID2 11
template
<
int
valueSize
,
int
frameSize
,
int
reversed
,
template
<
int
valueSize
,
int
computeThreads
,
int
blockSize
>
int
frameSize
,
int
reversed
,
int
computeThreads
,
int
blockSize
>
__global__
void
KeLstmForward
(
real
*
gateValue
,
__global__
void
KeLstmForward
(
real
*
gateValue
,
real
*
state
,
real
*
state
,
real
*
output
,
real
*
output
,
...
@@ -184,10 +183,16 @@ __global__ void KeLstmForward(real *gateValue,
...
@@ -184,10 +183,16 @@ __global__ void KeLstmForward(real *gateValue,
}
}
}
}
value
=
forward_sequence
<
valueSize
,
frameSize
>
(
value
=
forward_sequence
<
valueSize
,
frameSize
>
(
value
,
shValue
,
shState
,
shPrevOutput
,
shOutput
,
check
,
index
,
value
,
hppl
::
gpu
::
forward
[
active_node
],
shValue
,
hppl
::
gpu
::
forward
[
active_gate
],
shState
,
hppl
::
gpu
::
forward
[
active_state
]);
shPrevOutput
,
shOutput
,
check
,
index
,
hppl
::
gpu
::
forward
[
active_node
],
hppl
::
gpu
::
forward
[
active_gate
],
hppl
::
gpu
::
forward
[
active_state
]);
const
int
idx
=
index
%
frameSize
;
const
int
idx
=
index
%
frameSize
;
const
int
idy
=
index
/
frameSize
;
const
int
idy
=
index
/
frameSize
;
if
(
valueSize
==
128
)
{
if
(
valueSize
==
128
)
{
...
@@ -218,7 +223,7 @@ __global__ void KeLstmForward(real *gateValue,
...
@@ -218,7 +223,7 @@ __global__ void KeLstmForward(real *gateValue,
real
B_r
[
frameSize
];
real
B_r
[
frameSize
];
const
int
computeIdx
=
index
-
valueSize
;
const
int
computeIdx
=
index
-
valueSize
;
if
(
i
==
0
)
{
if
(
i
==
0
)
{
#pragma unroll
#pragma unroll
for
(
int
n
=
0
;
n
<
frameSize
;
n
++
)
{
for
(
int
n
=
0
;
n
<
frameSize
;
n
++
)
{
B_r
[
n
]
=
weight
[
n
*
valueSize
+
computeIdx
];
B_r
[
n
]
=
weight
[
n
*
valueSize
+
computeIdx
];
}
}
...
@@ -230,7 +235,7 @@ __global__ void KeLstmForward(real *gateValue,
...
@@ -230,7 +235,7 @@ __global__ void KeLstmForward(real *gateValue,
}
}
real
sum
=
0.0
f
;
real
sum
=
0.0
f
;
for
(
int
n
=
0
;
n
<
frameSize
;
n
++
)
{
for
(
int
n
=
0
;
n
<
frameSize
;
n
++
)
{
sum
+=
A_r
[
n
]
*
B_r
[
n
];
sum
+=
A_r
[
n
]
*
B_r
[
n
];
}
}
shValue
[
computeIdx
]
=
sum
;
shValue
[
computeIdx
]
=
sum
;
ptx_arrive
(
OUTPUT_BARRIER_ID2
,
blockSize
);
ptx_arrive
(
OUTPUT_BARRIER_ID2
,
blockSize
);
...
@@ -239,14 +244,14 @@ __global__ void KeLstmForward(real *gateValue,
...
@@ -239,14 +244,14 @@ __global__ void KeLstmForward(real *gateValue,
if
(
valueSize
==
256
)
{
if
(
valueSize
==
256
)
{
real
B_r
[
frameSize
];
real
B_r
[
frameSize
];
if
(
i
==
0
)
{
if
(
i
==
0
)
{
#pragma unroll
#pragma unroll
for
(
int
n
=
0
;
n
<
frameSize
;
n
++
)
{
for
(
int
n
=
0
;
n
<
frameSize
;
n
++
)
{
B_r
[
n
]
=
weight
[
n
*
valueSize
+
index
];
B_r
[
n
]
=
weight
[
n
*
valueSize
+
index
];
}
}
}
}
real
sum
=
0.0
f
;
real
sum
=
0.0
f
;
for
(
int
n
=
0
;
n
<
frameSize
;
n
++
)
{
for
(
int
n
=
0
;
n
<
frameSize
;
n
++
)
{
sum
+=
shOutput
[
n
]
*
B_r
[
n
];
sum
+=
shOutput
[
n
]
*
B_r
[
n
];
}
}
value
+=
sum
;
value
+=
sum
;
}
}
...
@@ -273,50 +278,81 @@ void hl_lstm_parallel_forward(real *gateValue,
...
@@ -273,50 +278,81 @@ void hl_lstm_parallel_forward(real *gateValue,
dim3
grid
(
numSequences
,
1
);
dim3
grid
(
numSequences
,
1
);
if
(
!
reversed
)
{
if
(
!
reversed
)
{
if
(
frameSize
==
32
)
{
if
(
frameSize
==
32
)
{
KeLstmForward
<
128
,
32
,
0
,
128
,
256
>
KeLstmForward
<
128
,
32
,
0
,
128
,
256
><<<
grid
,
256
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
256
,
0
,
STREAM_DEFAULT
>>>
gateValue
,
(
gateValue
,
stateValue
,
outputValue
,
preOutputValue
,
stateValue
,
checkIg
,
checkFg
,
checkOg
,
weight
,
sequence
,
outputValue
,
active_node
,
active_gate
,
active_state
);
preOutputValue
,
checkIg
,
checkFg
,
checkOg
,
weight
,
sequence
,
active_node
,
active_gate
,
active_state
);
}
else
if
(
frameSize
==
64
)
{
}
else
if
(
frameSize
==
64
)
{
KeLstmForward
<
256
,
64
,
0
,
256
,
256
>
KeLstmForward
<
256
,
64
,
0
,
256
,
256
><<<
grid
,
256
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
256
,
0
,
STREAM_DEFAULT
>>>
gateValue
,
(
gateValue
,
stateValue
,
outputValue
,
preOutputValue
,
stateValue
,
checkIg
,
checkFg
,
checkOg
,
weight
,
sequence
,
outputValue
,
active_node
,
active_gate
,
active_state
);
preOutputValue
,
checkIg
,
checkFg
,
checkOg
,
weight
,
sequence
,
active_node
,
active_gate
,
active_state
);
}
}
}
else
{
}
else
{
if
(
frameSize
==
32
)
{
if
(
frameSize
==
32
)
{
KeLstmForward
<
128
,
32
,
1
,
128
,
256
>
KeLstmForward
<
128
,
32
,
1
,
128
,
256
><<<
grid
,
256
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
256
,
0
,
STREAM_DEFAULT
>>>
gateValue
,
(
gateValue
,
stateValue
,
outputValue
,
preOutputValue
,
stateValue
,
checkIg
,
checkFg
,
checkOg
,
weight
,
sequence
,
outputValue
,
active_node
,
active_gate
,
active_state
);
preOutputValue
,
checkIg
,
checkFg
,
checkOg
,
weight
,
sequence
,
active_node
,
active_gate
,
active_state
);
}
else
if
(
frameSize
==
64
)
{
}
else
if
(
frameSize
==
64
)
{
KeLstmForward
<
256
,
64
,
1
,
256
,
256
>
KeLstmForward
<
256
,
64
,
1
,
256
,
256
><<<
grid
,
256
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
256
,
0
,
STREAM_DEFAULT
>>>
gateValue
,
(
gateValue
,
stateValue
,
outputValue
,
preOutputValue
,
stateValue
,
checkIg
,
checkFg
,
checkOg
,
weight
,
sequence
,
outputValue
,
active_node
,
active_gate
,
active_state
);
preOutputValue
,
checkIg
,
checkFg
,
checkOg
,
weight
,
sequence
,
active_node
,
active_gate
,
active_state
);
}
}
}
}
CHECK_SYNC
(
"hl_lstm_parallel_forward failed"
);
CHECK_SYNC
(
"hl_lstm_parallel_forward failed"
);
}
}
__device__
__forceinline__
__device__
__forceinline__
void
transpose_32x32
(
real
a
[],
const
int
idx
)
{
void
transpose_32x32
(
real
a
[],
const
int
idx
)
{
int
addr
=
idx
%
32
;
int
addr
=
idx
%
32
;
#pragma unroll
#pragma unroll
for
(
int
k
=
1
;
k
<
32
;
k
++
)
{
for
(
int
k
=
1
;
k
<
32
;
k
++
)
{
// rSrc[k] = __shfl(rSrc[k], (threadIdx.x + k) % 32, 32);
// rSrc[k] = __shfl(rSrc[k], (threadIdx.x + k) % 32, 32);
addr
=
__shfl
(
addr
,
(
idx
+
1
)
%
32
,
32
);
addr
=
__shfl
(
addr
,
(
idx
+
1
)
%
32
,
32
);
a
[
k
]
=
__shfl
(
a
[
k
],
addr
,
32
);
a
[
k
]
=
__shfl
(
a
[
k
],
addr
,
32
);
}
}
#pragma unroll
#pragma unroll
for
(
int
tid
=
0
;
tid
<
31
;
tid
++
)
{
for
(
int
tid
=
0
;
tid
<
31
;
tid
++
)
{
real
tmp
=
(
idx
>
tid
)
?
a
[
0
]
:
a
[
1
];
real
tmp
=
(
idx
>
tid
)
?
a
[
0
]
:
a
[
1
];
#pragma unroll
#pragma unroll
for
(
int
k
=
31
;
k
>
0
;
k
--
)
{
for
(
int
k
=
31
;
k
>
0
;
k
--
)
{
a
[(
k
+
1
)
%
32
]
=
(
idx
>
tid
)
?
a
[
k
]
:
a
[(
k
+
1
)
%
32
];
a
[(
k
+
1
)
%
32
]
=
(
idx
>
tid
)
?
a
[
k
]
:
a
[(
k
+
1
)
%
32
];
}
}
...
@@ -324,29 +360,28 @@ void transpose_32x32(real a[], const int idx) {
...
@@ -324,29 +360,28 @@ void transpose_32x32(real a[], const int idx) {
}
}
addr
=
(
32
-
idx
)
%
32
;
addr
=
(
32
-
idx
)
%
32
;
#pragma unroll
#pragma unroll
for
(
int
k
=
0
;
k
<
32
;
k
++
)
{
for
(
int
k
=
0
;
k
<
32
;
k
++
)
{
a
[
k
]
=
__shfl
(
a
[
k
],
addr
,
32
);
a
[
k
]
=
__shfl
(
a
[
k
],
addr
,
32
);
addr
=
__shfl
(
addr
,
(
idx
+
31
)
%
32
,
32
);
addr
=
__shfl
(
addr
,
(
idx
+
31
)
%
32
,
32
);
}
}
}
}
template
<
int
valueSize
,
int
frameSize
>
template
<
int
valueSize
,
int
frameSize
>
__device__
void
__device__
void
backward_sequence
(
real
rGateValue
,
backward_sequence
(
real
rGateValue
,
real
rOutputGrad
,
real
rOutputGrad
,
real
rPreOutputValue
,
real
rPreOutputValue
,
real
&
rGateGrad
,
real
&
rGateGrad
,
real
&
rStateGrad
,
real
&
rStateGrad
,
real
*
shStateGrad
,
real
*
shStateGrad
,
real
*
shStateValue
,
real
*
shStateValue
,
real
*
shGateValue
,
real
*
shGateValue
,
real
rCheck
,
real
rCheck
,
real
&
rGateValuePrev
,
real
&
rGateValuePrev
,
int
index
,
int
index
,
t_backward
activeNode
,
t_backward
activeNode
,
t_backward
activeGate
,
t_backward
activeGate
,
t_backward
activeState
)
{
t_backward
activeState
)
{
const
int
frameIdx
=
index
%
frameSize
;
const
int
frameIdx
=
index
%
frameSize
;
const
int
frameIdy
=
index
/
frameSize
;
const
int
frameIdy
=
index
/
frameSize
;
if
(
frameIdy
==
3
)
{
if
(
frameIdy
==
3
)
{
...
@@ -363,8 +398,8 @@ backward_sequence(real rGateValue,
    rStateGrad = rGateGrad * rCheck;
    shStateGrad[index] = rStateGrad;
    ptx_sync(3, valueSize);
    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
    rGateGrad = rStateGrad * shGateValue[frameIdx];
    rGateGrad = activeGate(rGateGrad, rGateValue);
  } else if (frameIdy == 2) {
...
@@ -373,7 +408,7 @@ backward_sequence(real rGateValue,
    shStateGrad[index] = rStateGrad;
    ptx_sync(3, valueSize);
    rStateGrad += shStateGrad[frameIdx + frameSize];
    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
    rGateValuePrev = rGateValue;
    rGateGrad = rStateGrad * shStateValue[frameIdx];
    rGateGrad = activeGate(rGateGrad, rGateValue);
...
@@ -381,43 +416,43 @@ backward_sequence(real rGateValue,
    shGateValue[frameIdx] = rGateValue;
    ptx_sync(3, valueSize);
    rStateGrad = shStateGrad[frameIdx + frameSize];
    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
    rGateGrad = rStateGrad * shGateValue[frameIdx + frameSize];
    rGateGrad = activeNode(rGateGrad, rGateValue);
  }
}
template <int valueSize, int frameSize>
__device__ void load_weight(real rWeight[], real *weight, const int index) {
  if (valueSize == 128) {
    weight += index;
#pragma unroll
    for (int n = 0; n < frameSize; n++) {
      rWeight[n] = weight[n * valueSize];
    }
    transpose_32x32(rWeight, index % 32);
  }
  if (valueSize == 256) {
    int id = (index / 32) % 2;
    weight += index - id * 32 + id * 32 * valueSize;
#pragma unroll
    for (int n = 0; n < 32; n++) {
      rWeight[n] = weight[n * valueSize];
      rWeight[n + 32] = weight[n * valueSize + 32];
    }
    transpose_32x32(rWeight, index % 32);
    transpose_32x32(&rWeight[32], index % 32);
  }
}
template <int valueSize, int frameSize, int reversed>
__global__ void KeLstmBackward(real *gateValue,
                               real *gateGrad,
                               real *stateValue,
                               real *stateGrad, /* do not need save */
                               real *preOutputValue,
                               real *preOutputGrad, /* do not need save */
                               real *checkIg,
                               real *checkIgGrad,
                               real *checkFg,
...
@@ -484,20 +519,27 @@ __global__ void KeLstmBackward(real *gateValue,
  for (int i = 0; i < length; ++i) {
    if (frameIdy == 3) {
      if (i != length - 1) {
        frameStateValue.nextFrame<!reversed, frameSize>();
        shStateValue[frameIdx] = frameStateValue.getValue();
      } else {
        shStateValue[frameIdx] = 0.0;
      }
    }
    backward_sequence<valueSize, frameSize>(rGateValue,
                                            rOutputGrad,
                                            rPreOutputValue,
                                            rGateGrad,
                                            rStateGrad,
                                            shStateGrad,
                                            shStateValue,
                                            shGateValue,
                                            rCheck,
                                            rGateValuePrev,
                                            index,
                                            hppl::gpu::backward[active_node],
                                            hppl::gpu::backward[active_gate],
                                            hppl::gpu::backward[active_state]);
    if (frameIdy == 3) {
      rCheckGrad += rGateGrad * rStateValue;
      rStateValue = shStateValue[frameIdx];
@@ -523,9 +565,9 @@ __global__ void KeLstmBackward(real *gateValue,
...
@@ -523,9 +565,9 @@ __global__ void KeLstmBackward(real *gateValue,
shGateGrad
[
frameIdy
][
frameIdx
]
=
rGateGrad
;
shGateGrad
[
frameIdy
][
frameIdx
]
=
rGateGrad
;
if
(
valueSize
==
128
)
{
if
(
valueSize
==
128
)
{
real
sum
=
0.0
f
;
real
sum
=
0.0
f
;
#pragma unroll
#pragma unroll
for
(
int
n
=
0
;
n
<
frameSize
;
n
++
)
{
for
(
int
n
=
0
;
n
<
frameSize
;
n
++
)
{
sum
+=
shGateGrad
[
frameIdy
][
n
]
*
B_r
[
n
];
sum
+=
shGateGrad
[
frameIdy
][
n
]
*
B_r
[
n
];
}
}
if
(
frameIdy
==
3
)
{
if
(
frameIdy
==
3
)
{
rOutputGrad
+=
sum
;
rOutputGrad
+=
sum
;
...
@@ -541,7 +583,7 @@ __global__ void KeLstmBackward(real *gateValue,
...
@@ -541,7 +583,7 @@ __global__ void KeLstmBackward(real *gateValue,
}
}
real
sum
=
0.0
f
;
real
sum
=
0.0
f
;
for
(
int
n
=
0
;
n
<
frameSize
;
n
++
)
{
for
(
int
n
=
0
;
n
<
frameSize
;
n
++
)
{
sum
+=
A_r
[
n
]
*
B_r
[
n
];
sum
+=
A_r
[
n
]
*
B_r
[
n
];
}
}
if
(
frameIdy
==
3
)
{
if
(
frameIdy
==
3
)
{
rOutputGrad
+=
sum
;
rOutputGrad
+=
sum
;
...
@@ -552,8 +594,8 @@ __global__ void KeLstmBackward(real *gateValue,
...
@@ -552,8 +594,8 @@ __global__ void KeLstmBackward(real *gateValue,
if
(
frameIdy
==
3
)
{
if
(
frameIdy
==
3
)
{
ptx_sync
(
6
,
valueSize
);
ptx_sync
(
6
,
valueSize
);
#pragma unroll
#pragma unroll
for
(
int
i
=
0
;
i
<
3
;
i
++
)
{
for
(
int
i
=
0
;
i
<
3
;
i
++
)
{
rOutputGrad
+=
shOutputGrad
[
i
][
frameIdx
];
rOutputGrad
+=
shOutputGrad
[
i
][
frameIdx
];
}
}
}
else
{
}
else
{
...
@@ -564,11 +606,14 @@ __global__ void KeLstmBackward(real *gateValue,
...
@@ -564,11 +606,14 @@ __global__ void KeLstmBackward(real *gateValue,
/* TODO: Temporary save & merger in another kernel */
/* TODO: Temporary save & merger in another kernel */
if
(
frameIdy
==
1
)
{
if
(
frameIdy
==
1
)
{
if
(
checkIgGrad
)
paddle
::
paddleAtomicAdd
(
checkIgGrad
+
frameIdx
,
rCheckGrad
);
if
(
checkIgGrad
)
paddle
::
paddleAtomicAdd
(
checkIgGrad
+
frameIdx
,
rCheckGrad
);
}
else
if
(
frameIdy
==
2
)
{
}
else
if
(
frameIdy
==
2
)
{
if
(
checkFgGrad
)
paddle
::
paddleAtomicAdd
(
checkFgGrad
+
frameIdx
,
rCheckGrad
);
if
(
checkFgGrad
)
paddle
::
paddleAtomicAdd
(
checkFgGrad
+
frameIdx
,
rCheckGrad
);
}
else
if
(
frameIdy
==
3
)
{
}
else
if
(
frameIdy
==
3
)
{
if
(
checkOgGrad
)
paddle
::
paddleAtomicAdd
(
checkOgGrad
+
frameIdx
,
rCheckGrad
);
if
(
checkOgGrad
)
paddle
::
paddleAtomicAdd
(
checkOgGrad
+
frameIdx
,
rCheckGrad
);
}
}
}
}
...
@@ -593,68 +638,183 @@ void hl_lstm_parallel_backward_data(real *gateValue,
                                    hl_activation_mode_t active_node,
                                    hl_activation_mode_t active_gate,
                                    hl_activation_mode_t active_state) {
  CHECK(frameSize == 32 || frameSize == 64 || frameSize == 128 ||
        frameSize == 256);
  dim3 grid(numSequences, 1);
  if (!reversed) {
    if (frameSize == 32) {
      KeLstmBackward<128, 32, 0><<<grid, 128, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence,
          active_node, active_gate, active_state);
    } else if (frameSize == 64) {
      KeLstmBackward<256, 64, 0><<<grid, 256, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence,
          active_node, active_gate, active_state);
    } else if (frameSize == 128) {
      KeLstmBackward<512, 128, 0><<<grid, 512, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence,
          active_node, active_gate, active_state);
    } else if (frameSize == 256) {
      KeLstmBackward<1024, 256, 0><<<grid, 1024, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence,
          active_node, active_gate, active_state);
    }
  } else {
    if (frameSize == 32) {
      KeLstmBackward<128, 32, 1><<<grid, 128, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence,
          active_node, active_gate, active_state);
    } else if (frameSize == 64) {
      KeLstmBackward<256, 64, 1><<<grid, 256, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence,
          active_node, active_gate, active_state);
    } else if (frameSize == 128) {
      KeLstmBackward<512, 128, 1><<<grid, 512, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence,
          active_node, active_gate, active_state);
    } else if (frameSize == 256) {
      KeLstmBackward<1024, 256, 1><<<grid, 1024, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence,
          active_node, active_gate, active_state);
    }
  }
  CHECK_SYNC("hl_lstm_parallel_backward_data");
}
template <int B_X, int B_Y>
__global__ void KeSetGradZero(real *gateGrad,
                              const int *starts,
                              int valueSize,
                              int numSequences,
                              bool reversed) {
  // const int tid = threadIdx.x;
  const int frameIdx = blockIdx.x * B_X + threadIdx.x;
...
@@ -682,19 +842,31 @@ void hl_lstm_parallel_backward_weight(real *weightGrad,
  int valueSize = 4 * frameSize;
  dim3 threads(32, 32);
  dim3 grid((valueSize + 32 - 1) / 32, (numSequences + 32 - 1) / 32);
  KeSetGradZero<32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
      gateGrad, sequence, valueSize, numSequences, reversed);

  if (!reversed) {
    hl_matrix_mul(outputValue,
                  HPPL_OP_T,
                  gateGrad + valueSize,
                  HPPL_OP_N,
                  weightGrad,
                  frameSize,
                  valueSize,
                  batchSize - 1,
                  1.0,
                  1.0);
  } else {
    hl_matrix_mul(outputValue + frameSize,
                  HPPL_OP_T,
                  gateGrad,
                  HPPL_OP_N,
                  weightGrad,
                  frameSize,
                  valueSize,
                  batchSize - 1,
                  1.0,
                  1.0);
  }
  CHECK_SYNC("hl_lstm_parallel_backward_weight");
}
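For orientation only (this note and sketch are not part of the commit): the dispatch above always launches KeLstmBackward with valueSize = 4 * frameSize, which is also the thread count per block, for the four frame sizes the CHECK() allows. A minimal host-side sketch of that mapping, with the helper name lstm_backward_block_size being a hypothetical illustration:

#include <cassert>

// Illustrative only: mirrors the frameSize -> (valueSize, block size) mapping
// used by hl_lstm_parallel_backward_data above.
static int lstm_backward_block_size(int frameSize) {
  // Supported frame sizes, as asserted by the CHECK() in the launcher.
  assert(frameSize == 32 || frameSize == 64 || frameSize == 128 ||
         frameSize == 256);
  return 4 * frameSize;  // 128, 256, 512 or 1024 threads per block
}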
paddle/cuda/src/hl_cuda_matrix.cu
...
@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"
#include "hl_matrix.h"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_ops.cuh"
#include "hl_sequence.h"
#include "hl_sparse.ph"
#include "paddle/utils/Logging.h"

DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b);

void hl_matrix_add(real *A_d,
                   real *B_d,
                   real *C_d,
                   int dimM,
                   int dimN,
                   real alpha,
...
@@ -36,33 +35,32 @@ void hl_matrix_add(real *A_d,
  CHECK_NOTNULL(B_d);
  CHECK_NOTNULL(C_d);

  hl_gpu_apply_ternary_op<real, ternary::_add<real>, 0, 0>(
      ternary::_add<real>(alpha, beta),
      A_d,
      B_d,
      C_d,
      dimM,
      dimN,
      dimN,
      dimN,
      dimN);
  CHECK_SYNC("hl_matrix_add failed");
}
#ifdef PADDLE_TYPE_DOUBLE
#define THRESHOLD 128
#else
#define THRESHOLD 64
#endif
__device__ __forceinline__ void findMax(real *I,
                                        real *dfMax_s,
                                        int blockSize,
                                        int base,
                                        int curIdx,
                                        int nextIdx,
                                        int dimN,
                                        real *max) {
  dfMax_s[base] = -1.0e20;
  while (curIdx < dimN) {
    if (dfMax_s[base] < I[nextIdx]) {
...
@@ -78,25 +76,24 @@ void findMax(real* I,
    if (base < stride) {
      nextIdx = base + stride;
      if (dfMax_s[base] < dfMax_s[nextIdx]) {
        dfMax_s[base] = dfMax_s[nextIdx];
      }
    }
  }
  if (0 == base) {
    max[0] = dfMax_s[0];
  }
  __syncthreads();
}

__device__ __forceinline__ void subMaxAndExp(real *I,
                                             real *O,
                                             int curIdx,
                                             int nextIdx,
                                             int blockSize,
                                             int dimN,
                                             real max) {
  real val;
  while (curIdx < dimN) {
    val = I[nextIdx] - max;
...
@@ -115,14 +112,13 @@ void subMaxAndExp(real* I,
  __syncthreads();
}

__device__ __forceinline__ void valueSum(real *O,
                                         real *dfMax_s,
                                         int blockSize,
                                         int base,
                                         int curIdx,
                                         int nextIdx,
                                         int dimN) {
  dfMax_s[base] = 0;
  while (curIdx < dimN) {
    dfMax_s[base] += O[nextIdx];
...
@@ -141,13 +137,8 @@ void valueSum(real* O,
  __syncthreads();
}

__device__ __forceinline__ void divSum(
    real *O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) {
  while (curIdx < dimN) {
    O[nextIdx] /= sum;
    nextIdx += blockSize;
...
@@ -155,20 +146,18 @@ void divSum(real* O,
  }
}

__device__ __forceinline__ void softmax(real *I,
                                        real *O,
                                        real *dfMax_s,
                                        int blockSize,
                                        int base,
                                        int curIdx,
                                        int nextIdx,
                                        int dimN) {
  __shared__ real max;

  // find the max number
  findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max);

  // sub max Value and do Exp operation
  subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max);
...
@@ -181,8 +170,8 @@ void softmax(real* I,
  divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN);
}
template <int blockSize>
__global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
  int base = threadIdx.x;
  __shared__ real dfMax_s[blockSize];
  int nextIdx = blockIdx.x * dimN + base;
...
@@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
}

void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(C_d);

  dim3 block(512, 1);
  dim3 grid(dimM, 1);
  KeMatrixSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
  CHECK_SYNC("hl_matrix_softmax failed");
}
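As a sanity reference only (not part of the commit), the computation KeMatrixSoftMax parallelizes per row is the usual max-shifted softmax implemented by findMax / subMaxAndExp / valueSum / divSum above. A plain C++ sketch over a single row, assuming real is float or double as in these sources:

#include <cmath>

// Reference row-wise softmax: subtract the row max, exponentiate, normalize.
template <typename real>
void softmax_row_ref(const real* I, real* O, int dimN) {
  real maxv = I[0];
  for (int j = 1; j < dimN; ++j) maxv = I[j] > maxv ? I[j] : maxv;
  real sum = 0;
  for (int j = 0; j < dimN; ++j) {
    O[j] = std::exp(I[j] - maxv);
    sum += O[j];
  }
  for (int j = 0; j < dimN; ++j) O[j] /= sum;
}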
template <int blockSize>
__global__ void KeSequenceSoftMax(real *O, real *I, const int *index) {
  int base = threadIdx.x;
  int bid = blockIdx.x;
  __shared__ real dfMax_s[blockSize];
...
@@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) {
  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
}

void hl_sequence_softmax_forward(real *A_d,
                                 real *C_d,
                                 const int *index,
                                 int numSequence) {
  CHECK_NOTNULL(A_d);
...
@@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d,
  dim3 block(512, 1);
  dim3 grid(numSequence, 1);
  KeSequenceSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
  CHECK_SYNC("hl_sequence_softmax_forward failed");
}
__global__ void KeMatrixDerivative(
    real *grad_d, real *output_d, real *sftmaxSum_d, int dimM, int dimN) {
  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
  int index;

  if (rowIdx < dimM && colIdx < dimN) {
    index = rowIdx * dimN + colIdx;
    grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]);
  }
}

void hl_matrix_softmax_derivative(
    real *grad_d, real *output_d, real *sftmaxSum_d, int dimM, int dimN) {
  CHECK_NOTNULL(grad_d);
  CHECK_NOTNULL(output_d);
  CHECK_NOTNULL(sftmaxSum_d);

  int blocksX = (dimM + 0) / 1;
  int blocksY = (dimN + 1024 - 1) / 1024;
  dim3 threads(1, 1024);
  dim3 grid(blocksX, blocksY);

  KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad_d, output_d, sftmaxSum_d, dimM, dimN);
  CHECK_SYNC("hl_matrix_softmax_derivative failed");
}
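The backward kernel above applies the standard softmax gradient, grad = output * (grad - rowSum), where rowSum is the precomputed per-row dot product passed in as sftmaxSum_d. A host-side sketch of the same update, illustrative only and not part of the commit:

// Reference for KeMatrixDerivative: per element,
// grad[i][j] = output[i][j] * (grad[i][j] - sftmaxSum[i]).
template <typename real>
void softmax_derivative_ref(real* grad, const real* output,
                            const real* sftmaxSum, int dimM, int dimN) {
  for (int i = 0; i < dimM; ++i)
    for (int j = 0; j < dimN; ++j)
      grad[i * dimN + j] =
          output[i * dimN + j] * (grad[i * dimN + j] - sftmaxSum[i]);
}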
__global__ void KeMatrixMultiBinaryCrossEntropy(
    real *output, real *entropy, int *row, int *col, int dimM, int dimN) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < dimM) {
    for (int i = 0; i < dimN; i++) {
      entropy[index] -= log(1 - output[index * dimN + i]);
    }
    int *row_col = col + row[index];
    int col_num = row[index + 1] - row[index];
    for (int i = 0; i < col_num; i++) {
      real o = output[index * dimN + row_col[i]];
      entropy[index] -= log(o / (1 - o));
    }
...
@@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output,
  dim3 threads(n_threads);
  dim3 grid(blocks);
  hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
  KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
  CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed");
}
__global__ void KeMatrixMultiBinaryCrossEntropyBp(
    real *output, real *grad, int *row, int *col, int dimM, int dimN) {
  int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (row_idx < dimM) {
    for (int i = 0; i < dimN; i++) {
      int index = row_idx * dimN + i;
      grad[index] += 1.0 / (1 - output[index]);
    }
    int col_num = row[row_idx + 1] - row[row_idx];
    int *row_col = col + row[row_idx];
    for (int i = 0; i < col_num; i++) {
      int index = row_idx * dimN + row_col[i];
      grad[index] -= 1.0 / (output[index] * (1 - output[index]));
    }
  }
}

void hl_matrix_multi_binary_cross_entropy_bp(
    real *output, real *grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) {
  CHECK_NOTNULL(output);
  CHECK_NOTNULL(grad);
  CHECK_NOTNULL(csr_mat);
...
@@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output,
  dim3 threads(n_threads);
  dim3 grid(blocks);
  hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
  KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
  CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed");
}
__global__ void KeMatrixCrossEntropy(
    real *O, real *E, int *label, int dimM, int dimN) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int newBase;
  if (index < dimM) {
...
@@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O,
  }
}

void hl_matrix_cross_entropy(
    real *A_d, real *C_d, int *label_d, int dimM, int dimN) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(C_d);

  int blocks = (dimM + 1024 - 1) / 1024;
  dim3 threads(1024, 1);
  dim3 grid(blocks, 1);
  KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
      A_d, C_d, label_d, dimM, dimN);
  CHECK_SYNC("hl_matrix_cross_entropy failed");
}
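The forward kernel body is collapsed in this view, but the backward kernel below (KeMatrixCrossEntropyBp) subtracts 1/output at the labeled column, which matches the gradient of the usual negative log-likelihood E[i] = -log(O[i][label[i]]). A host-side sketch of both under that standard assumption, illustrative only and not taken from the commit:

#include <cmath>

// Illustrative reference for hl_matrix_cross_entropy / _bp, assuming the
// standard per-row negative log-likelihood loss.
template <typename real>
void cross_entropy_ref(const real* O, real* E, const int* label,
                       int dimM, int dimN) {
  for (int i = 0; i < dimM; ++i) E[i] = -std::log(O[i * dimN + label[i]]);
}

template <typename real>
void cross_entropy_bp_ref(real* grad, const real* output, const int* label,
                          int dimM, int dimN) {
  for (int i = 0; i < dimM; ++i)
    grad[i * dimN + label[i]] -= real(1.0) / output[i * dimN + label[i]];
}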
__global__ void KeMatrixCrossEntropyBp(
    real *grad_d, real *output_d, int *label_d, int dimM, int dimN) {
  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
  int index;
  if (rowIdx < dimM && colIdx < dimN) {
    index = rowIdx * dimN + colIdx;
    if (label_d[rowIdx] == colIdx) {
      grad_d[index] -= 1.0f / output_d[index];
    }
  }
}

void hl_matrix_cross_entropy_bp(
    real *grad_d, real *output_d, int *label_d, int dimM, int dimN) {
  CHECK_NOTNULL(grad_d);
  CHECK_NOTNULL(output_d);
  CHECK_NOTNULL(label_d);

  int blocksX = (dimM + 0) / 1;
  int blocksY = (dimN + 1024 - 1) / 1024;
  dim3 threads(1, 1024);
  dim3 grid(blocksX, blocksY);
  KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad_d, output_d, label_d, dimM, dimN);
  CHECK_SYNC("hl_matrix_cross_entropy_bp failed");
}

void hl_matrix_zero_mem(real *data, int num) {
  hl_gpu_apply_unary_op(unary::Zero<real>(), data, 1, num, num);
}
__global__ void KeParamReluForward(real *output,
...
@@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output,
  int ty = blockIdx.y * blockDim.y + threadIdx.y;
  if (tx < width && ty < height) {
    int index = ty * width + tx;
    output[index] =
        input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum];
  }
}
...
@@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output,
  CHECK_NOTNULL(w);
  dim3 threads(16, 16);
  int blockX = (width + 16 - 1) / 16;
  int blockY = (height + 16 - 1) / 16;
  dim3 grid(blockX, blockY);
  KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, input, w, width, height, partial_sum);
  CHECK_SYNC("hl_param_relu_forward failed");
}
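The forward kernel above implements parametric ReLU with one learned slope shared by each group of partial_sum consecutive columns: output = input when input > 0, otherwise input * w[col / partial_sum]. A plain loop version for reference, illustrative only and not part of the commit:

// Reference for KeParamReluForward: the leaky slope w is shared per column group.
template <typename real>
void param_relu_forward_ref(real* output, const real* input, const real* w,
                            int width, int height, int partial_sum) {
  for (int y = 0; y < height; ++y)
    for (int x = 0; x < width; ++x) {
      int idx = y * width + x;
      output[idx] = input[idx] > 0 ? input[idx]
                                   : input[idx] * w[x / partial_sum];
    }
}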
template <int blockSize>
__global__ void KeParamReluBackWardW(real *grad_w,
                                     real *grad_o,
                                     real *input,
...
@@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w,
  int grid_num = width / partial_sum;
  dim3 threads(blockSize, 1);
  dim3 grid(grid_num, 1);
  KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad_w, grad_o, input, width, height, partial_sum);
  CHECK_SYNC("hl_param_relu_backward_w failed");
}
...
@@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o,
  CHECK_NOTNULL(diff);
  dim3 threads(16, 16);
  int blockX = (width + 16 - 1) / 16;
  int blockY = (height + 16 - 1) / 16;
  dim3 grid(blockX, blockY);
  KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad_o, data, w, diff, width, height, partial_sum);
  CHECK_SYNC("hl_param_relu_backward_diff failed");
}
__global__ void KeMatrixAddSharedBias(real *A,
                                      real *B,
                                      const int channel,
                                      const int M,
                                      const int N,
                                      real scale) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int dim = N / channel;
  if (index < M * N) {
...
@@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d,
                               real scale) {
  const int blocks = 512;
  const int grids = DIVUP(dimM * dimN, blocks);
  KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>(
      A_d, B_d, channel, dimM, dimN, scale);
  CHECK_SYNC("hl_matrix_add_shared_bias failed");
}

template <int blockSize>
__global__ void KeMatrixCollectSharedBias(real *B,
                                          real *A,
                                          const int channel,
                                          const int M,
                                          const int N,
...
@@ -589,7 +541,7 @@ __global__ void KeMatrixCollectSharedBias(real *B,
      int n = j * blockSize + tid;
      int m = n / dim;
      int w = n % dim;
      smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0;
      __syncthreads();
      simpleReduce(smem, tid, blockSize);
      sum += smem[0];
...
@@ -611,33 +563,32 @@ void hl_matrix_collect_shared_bias(real* B_d,
  const int limit = 64;
  int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;

  KeMatrixCollectSharedBias<blocks><<<grids, blocks, 0, STREAM_DEFAULT>>>(
      B_d, A_d, channel, dimM, dimN, dim, limit, scale);
  CHECK_SYNC("hl_matrix_collect_shared_bias failed");
}
__global__ void keMatrixRotate(
    real *mat, real *matRot, int dimM, int dimN, bool clockWise) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < dimM * dimN) {
    int i = idx / dimN;
    int j = idx % dimN;
    if (clockWise) {
      matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];
    } else {
      matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];
    }
  }
}

void hl_matrix_rotate(
    real *mat, real *matRot, int dimM, int dimN, bool clockWise) {
  CHECK_NOTNULL(mat);
  CHECK_NOTNULL(matRot);
  const int threads = 512;
  const int blocks = DIVUP(dimM * dimN, threads);
  keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>(
      mat, matRot, dimM, dimN, clockWise);
  CHECK_SYNC("hl_matrix_rotate failed");
}
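keMatrixRotate maps element (i, j) of a dimM x dimN matrix to (j, i) of the dimN x dimM result, reading from the mirrored row (clockwise) or mirrored column (counter-clockwise). A host-side sketch of the same index mapping, illustrative only and not part of the commit:

// Reference for hl_matrix_rotate: 90-degree rotation of a dimM x dimN matrix.
template <typename real>
void rotate_ref(const real* mat, real* matRot, int dimM, int dimN,
                bool clockWise) {
  for (int i = 0; i < dimM; ++i)
    for (int j = 0; j < dimN; ++j)
      matRot[j * dimM + i] = clockWise ? mat[(dimM - i - 1) * dimN + j]
                                       : mat[i * dimN + (dimN - j - 1)];
}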
paddle/cuda/src/hl_cuda_sequence.cu
...
@@ -16,36 +16,36 @@ limitations under the License. */
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"

__global__ void KeMaxSequenceForward(real *input,
                                     const int *sequence,
                                     real *output,
                                     int *index,
                                     int numSequences,
                                     int dim) {
  int dimIdx = threadIdx.x;
  int sequenceId = blockIdx.x;
  if (sequenceId >= numSequences) return;
  int start = sequence[sequenceId];
  int end = sequence[sequenceId + 1];

  for (int i = dimIdx; i < dim; i += blockDim.x) {
    real tmp = -HL_FLOAT_MAX;
    int tmpId = -1;
    for (int insId = start; insId < end; insId++) {
      if (tmp < input[insId * dim + i]) {
        tmp = input[insId * dim + i];
        tmpId = insId;
      }
    }
    output[sequenceId * dim + i] = tmp;
    index[sequenceId * dim + i] = tmpId;
  }
}

void hl_max_sequence_forward(real *input,
                             const int *sequence,
                             real *output,
                             int *index,
                             int numSequences,
                             int dim) {
  CHECK_NOTNULL(input);
...
@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input,
  dim3 threads(256, 1);
  dim3 grid(numSequences, 1);
  KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
      input, sequence, output, index, numSequences, dim);
  CHECK_SYNC("hl_max_sequence_forward failed");
}
__global__ void KeMaxSequenceBackward(
    real *outputGrad, int *index, real *inputGrad, int numSequences, int dim) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  int colIdx = idx % dim;
  if (idx < numSequences * dim) {
    int insId = index[idx];
    inputGrad[insId * dim + colIdx] += outputGrad[idx];
  }
}

void hl_max_sequence_backward(
    real *outputGrad, int *index, real *inputGrad, int numSequences, int dim) {
  CHECK_NOTNULL(outputGrad);
  CHECK_NOTNULL(index);
  CHECK_NOTNULL(inputGrad);
...
@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad,
  unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
  dim3 threads(128, 1);
  dim3 grid(blocks, 1);
  KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
      outputGrad, index, inputGrad, numSequences, dim);
  CHECK_SYNC("hl_max_sequence_backward failed");
}
template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real *output,
                                real *table,
                                int *ids,
...
@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output,
  while (sampleId < numSamples) {
    int tableId = ids[sampleId];
    if ((0 <= tableId) && (tableId < tableSize)) {
      real *outputData = output + sampleId * dim;
      real *tableData = table + tableId * dim;
      for (int i = idx; i < dim; i += blockDimX) {
        if (AddRow == 0) {
          outputData[i] += tableData[i];
...
@@ -114,24 +108,27 @@ __global__ void KeMatrixAddRows(real* output,
        }
      }
    }
    sampleId += blockDimY * gridDimX;
  }
}
template <int blockDimX,
          int blockDimY,
          int gridDimX,
          bool seq2batch,
          bool isAdd>
__global__ void KeSequence2Batch(real *batch,
                                 real *sequence,
                                 const int *batchIndex,
                                 int seqWidth,
                                 int batchCount) {
  int idx = threadIdx.x;
  int idy = threadIdx.y;
  int id = blockIdx.x + idy * gridDimX;
  while (id < batchCount) {
    int seqId = batchIndex[id];
    real *batchData = batch + id * seqWidth;
    real *seqData = sequence + seqId * seqWidth;
    for (int i = idx; i < seqWidth; i += blockDimX) {
      if (seq2batch) {
        if (isAdd) {
...
@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch,
      }
    }
    id += blockDimY * gridDimX;
  }
}
void hl_sequence2batch_copy(real *batch,
                            real *sequence,
                            const int *batchIndex,
                            int seqWidth,
                            int batchCount,
                            bool seq2batch) {
...
@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  if (seq2batch) {
    KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
        batch, sequence, batchIndex, seqWidth, batchCount);
  } else {
    KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
        batch, sequence, batchIndex, seqWidth, batchCount);
  }
  CHECK_SYNC("hl_sequence2batch_copy failed");
}
void hl_sequence2batch_add(real *batch,
                           real *sequence,
                           int *batchIndex,
                           int seqWidth,
                           int batchCount,
                           bool seq2batch) {
...
@@ -186,23 +183,22 @@ void hl_sequence2batch_add(real *batch,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  if (seq2batch) {
    KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
        batch, sequence, batchIndex, seqWidth, batchCount);
  } else {
    KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
        batch, sequence, batchIndex, seqWidth, batchCount);
  }
  CHECK_SYNC("hl_sequence2batch_add failed");
}
template <bool normByTimes, bool seq2batch>
__global__ void KeSequence2BatchPadding(real *batch,
                                        real *sequence,
                                        const int *sequenceStartPositions,
                                        const size_t sequenceWidth,
                                        const size_t maxSequenceLength,
                                        const size_t numSequences) {
  int batchIdx = blockIdx.y;
  int sequenceStart = sequenceStartPositions[batchIdx];
  int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart;
...
@@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch,
  if (seq2batch) {
    /* sequence -> batch */
    if (normByTimes) {
      KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          batch, sequence, sequenceStartPositions,
          sequenceWidth, maxSequenceLength, numSequences);
    } else {
      KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          batch, sequence, sequenceStartPositions,
          sequenceWidth, maxSequenceLength, numSequences);
    }
  } else {
    /* batch -> sequence */
    if (normByTimes) {
      KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          batch, sequence, sequenceStartPositions,
          sequenceWidth, maxSequenceLength, numSequences);
    } else {
      KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          batch, sequence, sequenceStartPositions,
          sequenceWidth, maxSequenceLength, numSequences);
    }
  }
  CHECK_SYNC("hl_sequence2batch_copy_padding failed");
}
__device__ inline float my_rsqrt(float x) { return rsqrtf(x); }

__device__ inline double my_rsqrt(double x) { return rsqrt(x); }

__global__ void KeSequenceAvgForward(real *dst,
                                     real *src,
...
@@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst,
    for (int i = start; i < end; i++) {
      sum += src[i * width + col];
    }
    sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
                                       : sum * my_rsqrt((real)seqLength));
    dst[gid] += sum;
  }
}
...
@@ -347,10 +355,10 @@ void hl_sequence_avg_forward(real* dst,
  int grid = DIVUP(width * height, 512);

  CHECK(mode == 0 || mode == 1 || mode == 2)
      << "mode error in hl_sequence_avg_forward!";

  KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
      dst, src, starts, height, width, mode);
  CHECK_SYNC("hl_sequence_avg_forward failed");
}
...
@@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst,
    int seqLength = end - start;
    if (seqLength == 0) return;
    real grad = src[gid];
    grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
                                         : grad * my_rsqrt((real)seqLength));
    for (int i = start; i < end; i++) {
      dst[i * width + col] += grad;
    }
...
@@ -392,9 +400,9 @@ void hl_sequence_avg_backward(real* dst,
  int grid = DIVUP(width * height, 512);

  CHECK(mode == 0 || mode == 1 || mode == 2)
      << "mode error in hl_sequence_avg_backward!";

  KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
      dst, src, starts, height, width, mode);
  CHECK_SYNC("hl_sequence_avg_backward failed");
}
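KeSequenceAvgForward and KeSequenceAvgBackward above reduce or scale each sequence with one of three modes: mode 1 keeps the raw sum, mode 0 divides by the sequence length, and mode 2 scales by 1/sqrt(length) via my_rsqrt. A scalar sketch of that mode selection, illustrative only and not part of the commit:

#include <cmath>

// Reference for the mode handling in KeSequenceAvgForward / Backward.
template <typename real>
real sequence_avg_reduce(real sum, int seqLength, int mode) {
  if (mode == 1) return sum;              // plain sum
  if (mode == 0) return sum / seqLength;  // mean
  return sum / std::sqrt((real)seqLength);  // mode == 2: sqrt-normalized
}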
paddle/cuda/src/hl_cuda_sparse.cu
...
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "hl_cuda.h"
#include "hl_cuda.h"
#include "hl_cuda_sparse.cuh"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_ops.cuh"
#include "hl_sparse.h"
#include "hl_sparse.h"
#include "hl_sparse.ph"
#include "hl_sparse.ph"
#include "hl_matrix_ops.cuh"
#include "hl_matrix_apply.cuh"
#include "hl_cuda_sparse.cuh"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Logging.h"
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
mul_scalar
,
ONE_PARAMETER
,
a
=
a
*
p
);
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
mul_scalar
,
ONE_PARAMETER
,
a
=
a
*
p
);
...
@@ -34,15 +33,15 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
  CHECK(A_d->format == HL_SPARSE_CSR) << "matrix format error!";

  if (A_d->nnz == 0) {
    hl_gpu_apply_unary_op(unary::Zero<real>(), C_d, dimM, dimN, dimN);
    return;
  }

  /* nnz != 0 */
  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
  CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && A_d2->csr_row &&
        A_d2->csr_col)
      << "parameter transa error!";

  int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
  int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
...
@@ -50,21 +49,11 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
  dim3 grid(blocksX, blocksY);

  if (A_d->type == HL_NO_VALUE) {
    KeSMatrixCsr2Dense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
        A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN);
  } else if (A_d->type == HL_FLOAT_VALUE) {
    KeSMatrixCsr2Dense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
        A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN);
  } else {
  }
  CHECK_SYNC("hl_matrix_csr2dense failed");
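For reference, a CPU sketch of what the Csr2Dense kernels above compute, assuming the standard CSR convention of rows+1 offsets, nnz column indices, and optional values (an HL_NO_VALUE matrix expands its entries to 1.0):

// Hedged CPU reference of CSR -> dense; dense is dimM x dimN, row-major.
void csr_to_dense_cpu(const float* val, const int* row, const int* col,
                      float* dense, int dimM, int dimN) {
  for (int i = 0; i < dimM * dimN; ++i) dense[i] = 0.0f;
  for (int r = 0; r < dimM; ++r) {
    for (int k = row[r]; k < row[r + 1]; ++k) {
      dense[r * dimN + col[k]] = (val != NULL) ? val[k] : 1.0f;  // NO_VALUE -> 1.0
    }
  }
}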
...
@@ -80,15 +69,15 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
  CHECK(A_d->format == HL_SPARSE_CSC) << "matrix format error!";

  if (A_d->nnz == 0) {
    hl_gpu_apply_unary_op(unary::Zero<real>(), C_d, dimM, dimN, dimN);
    return;
  }

  /* nnz != 0 */
  hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix);
  CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && A_d2->csc_row &&
        A_d2->csc_col)
      << "parameter transa error!";

  int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
  int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
...
@@ -96,21 +85,11 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
  dim3 grid(blocksX, blocksY);

  if (A_d->type == HL_NO_VALUE) {
    KeSMatrixCsc2Dense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
        A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN);
  } else if (A_d->type == HL_FLOAT_VALUE) {
    KeSMatrixCsc2Dense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
        A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN);
  } else {
  }
  CHECK_SYNC("hl_matrix_csc2dense failed");
...
@@ -118,43 +97,43 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,

void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
                             hl_matrix_format_t format,
                             hl_matrix_value_t value_type,
                             int dimM,
                             int dimN,
                             int nnz) {
  CHECK_NOTNULL(A_d);
  CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
      << "sparse matrix format error!";
  CHECK(value_type == HL_FLOAT_VALUE || value_type == HL_NO_VALUE)
      << "sparse matrix value type error!";
  /* avoid malloc 0 bytes */
  int nnz_s = (nnz == 0 ? 1 : nnz);

  if (format == HL_SPARSE_CSR) {
    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";

    char *tmp =
        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
    CHECK_NOTNULL(tmp);

    hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
    csr->sparsity = -1.0;

    if (value_type == HL_NO_VALUE) {
      csr->csr_val = NULL;
      csr->nnz_s = nnz_s;
      csr->row_s = dimM + 1;
      csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int));
      csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int));

      *A_d = (hl_sparse_matrix_s)tmp;
      (*A_d)->matrix = (hl_matrix_s)csr;
    } else if (value_type == HL_FLOAT_VALUE) {
      csr->nnz_s = nnz_s;
      csr->row_s = dimM + 1;
      csr->csr_val = (real *)hl_malloc_device((nnz_s) * sizeof(real));
      csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int));
      csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int));

      *A_d = (hl_sparse_matrix_s)tmp;
      (*A_d)->matrix = (hl_matrix_s)csr;
...
@@ -162,28 +141,28 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
  } else if (format == HL_SPARSE_CSC) {
    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";

    char *tmp =
        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
    CHECK_NOTNULL(tmp);

    hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
    csc->sparsity = -1.0f;

    if (value_type == HL_NO_VALUE) {
      csc->csc_val = NULL;
      csc->nnz_s = nnz_s;
      csc->col_s = dimN + 1;
      csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int));
      csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int));

      *A_d = (hl_sparse_matrix_s)tmp;
      (*A_d)->matrix = (hl_matrix_s)csc;
    } else if (value_type == HL_FLOAT_VALUE) {
      csc->nnz_s = nnz_s;
      csc->col_s = dimN + 1;
      csc->csc_val = (real *)hl_malloc_device((nnz_s) * sizeof(real));
      csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int));
      csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int));

      *A_d = (hl_sparse_matrix_s)tmp;
      (*A_d)->matrix = (hl_matrix_s)csc;
...
@@ -200,7 +179,7 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,

void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {
  CHECK_NOTNULL(A_d);
  CHECK(A_d->format == HL_SPARSE_CSR || A_d->format == HL_SPARSE_CSC)
      << "sparse matrix format error!";

  if (A_d->matrix == NULL) {
    free(A_d);
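A minimal call-site sketch for the allocation API in the hunks above (argument order taken from the reformatted signature; headers, error handling, and the eventual data upload are omitted, so treat this as illustrative only):

// Sketch: allocate a 100x200 CSR matrix with room for 500 float values,
// use it, then release it. Relies on the hl_sparse.h API shown in this diff.
hl_sparse_matrix_s A = NULL;
hl_malloc_sparse_matrix(&A, HL_SPARSE_CSR, HL_FLOAT_VALUE, 100, 200, 500);
// ... fill A via hl_memcpy_csr_matrix(...), launch kernels on it ...
hl_free_sparse_matrix(A);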
...
@@ -249,77 +228,77 @@ void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {
}

void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
                                void *dest_d,
                                size_t size,
                                hl_matrix_format_t format,
                                hl_matrix_value_t value_type,
                                int dimM,
                                int dimN,
                                int nnz) {
  CHECK_NOTNULL(A_d);
  CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
      << "sparse matrix format error!";

  if (format == HL_SPARSE_CSR) {
    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";

    size_t size_ = (dimM + 1) * sizeof(int) + nnz * sizeof(int);
    if (value_type != HL_NO_VALUE) {
      size_ += nnz * sizeof(real);
    }
    CHECK_LE(size_, size) << "dest_d size(" << size
                          << ") too small, should bigger than(" << size_
                          << ")!";

    char *tmp =
        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
    CHECK_NOTNULL(tmp);

    hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));

    if (value_type == HL_NO_VALUE) {
      csr->csr_val = NULL;
      csr->csr_row = (int *)dest_d;
      csr->csr_col = (int *)((char *)dest_d + (dimM + 1) * sizeof(int));
    } else {
      csr->csr_val = (real *)dest_d;
      csr->csr_row = (int *)((char *)dest_d + nnz * sizeof(real));
      csr->csr_col = (int *)((char *)dest_d + nnz * sizeof(real) +
                             (dimM + 1) * sizeof(int));
    }

    csr->nnz_s = nnz;
    csr->row_s = dimM + 1;
    csr->sparsity = -1.0;
    *A_d = (hl_sparse_matrix_s)tmp;
    (*A_d)->matrix = (hl_matrix_s)csr;
  } else if (format == HL_SPARSE_CSC) {
    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";

    size_t size_ = (dimN + 1) * sizeof(int) + nnz * sizeof(int);
    if (value_type != HL_NO_VALUE) {
      size_ += nnz * sizeof(real);
    }
    CHECK_LE(size_, size) << "dest_d size(" << size
                          << ") too small, should bigger than(" << size_
                          << ")!";

    char *tmp =
        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
    CHECK_NOTNULL(tmp);

    hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));

    if (value_type == HL_NO_VALUE) {
      csc->csc_val = NULL;
      csc->csc_col = (int *)dest_d;
      csc->csc_row = (int *)((char *)dest_d + (dimN + 1) * sizeof(int));
    } else {
      csc->csc_val = (real *)dest_d;
      csc->csc_col = (int *)((char *)dest_d + nnz * sizeof(real));
      csc->csc_row = (int *)((char *)dest_d + nnz * sizeof(real) +
                             (dimN + 1) * sizeof(int));
    }

    csc->nnz_s = nnz;
    csc->col_s = dimN + 1;
    csc->sparsity = -1.0f;
    *A_d = (hl_sparse_matrix_s)tmp;
    (*A_d)->matrix = (hl_matrix_s)csc;
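The size check above implies a simple rule for how large the caller-supplied dest_d buffer must be. A small helper that computes it (illustrative only; float stands in for the library's real type to keep the sketch self-contained):

// Bytes hl_construct_sparse_matrix expects dest_d to provide:
// CSR needs (dimM + 1) row offsets, CSC needs (dimN + 1) column offsets,
// plus nnz indices, plus nnz values unless the matrix is HL_NO_VALUE.
size_t sparse_buffer_bytes(bool is_csr, bool has_values,
                           int dimM, int dimN, int nnz) {
  size_t offsets = (size_t)(is_csr ? dimM + 1 : dimN + 1) * sizeof(int);
  size_t indices = (size_t)nnz * sizeof(int);
  size_t values = has_values ? (size_t)nnz * sizeof(float) : 0;
  return offsets + indices + values;
}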
...
@@ -333,11 +312,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
}

void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
                                real *value_d,
                                int *rows_d,
                                int *cols_d,
                                hl_matrix_format_t format,
                                hl_matrix_value_t value_type,
                                int dimM,
                                int dimN,
                                int nnz) {
...
@@ -345,11 +324,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
  CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
  CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
      << "sparse matrix format error!";

  if (format == HL_SPARSE_CSR) {
    char *tmp =
        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
    CHECK_NOTNULL(tmp);

    hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
...
@@ -362,8 +341,8 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
    *A_d = (hl_sparse_matrix_s)tmp;
    (*A_d)->matrix = (hl_matrix_s)csr;
  } else if (format == HL_SPARSE_CSC) {
    char *tmp =
        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
    CHECK_NOTNULL(tmp);

    hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
...
@@ -396,35 +375,30 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
                          hl_stream_t stream) {
  CHECK_NOTNULL(csr_matrix);
  CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR)
      << "csr_matrix is not csr format!";
  CHECK_NOTNULL(csr_matrix->matrix);

  hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix);
  CHECK_LE(csr_matrix->nnz, csr->nnz_s)
      << "copy size " << csr_matrix->nnz
      << " is big than alloc size " << csr->nnz_s;

  CHECK_LE((csr_matrix->rows + 1), csr->row_s)
      << "copy size " << (csr_matrix->rows + 1)
      << " is big than alloc size " << csr->row_s;

  CHECK(csr_matrix->type == HL_FLOAT_VALUE || csr_matrix->type == HL_NO_VALUE)
      << "sparse matrix value type error!";

  if (csr_matrix->type == HL_NO_VALUE) {
    if (csr_row == NULL && csr_col == NULL) {
      return;
    } else if (csr_row != NULL && csr_col != NULL) {
      hl_memcpy_async(
          csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream);
      hl_memcpy_async(
          csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream);
    } else {
      LOG(FATAL) << "parameter csr_row or csr_col is null pointer!";
    }
...
@@ -432,30 +406,21 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
    if (csr_val == NULL && csr_row == NULL && csr_col == NULL) {
      return;
    } else if (csr_val != NULL && csr_row == NULL && csr_col == NULL) {
      hl_memcpy_async(
          csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream);
    } else if (csr_val != NULL && csr_row != NULL && csr_col != NULL) {
      hl_memcpy_async(
          csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream);
      hl_memcpy_async(
          csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream);
      hl_memcpy_async(
          csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream);
    } else {
      LOG(FATAL) << "parameter csr_row or csr_col is null pointer!";
    }
  }

  csr->sparsity = ((float)csr_matrix->nnz) / ((float)csr_matrix->rows) /
                  ((float)csr_matrix->cols);
}
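To make the expected host layout concrete, here is a tiny 3x4 matrix with four non-zeros in CSR form (hypothetical data, shown only to illustrate the row-offset / column-index / value arrays the copy above consumes):

// CSR encoding of:
//   [ 1 0 0 2 ]
//   [ 0 0 3 0 ]
//   [ 0 4 0 0 ]
int   host_row[] = {0, 2, 3, 4};              // rows + 1 offsets
int   host_col[] = {0, 3, 2, 1};              // nnz column indices
float host_val[] = {1.0f, 2.0f, 3.0f, 4.0f};  // nnz values
// These would be passed as csr_val / csr_row / csr_col to
// hl_memcpy_csr_matrix(matrix, host_val, host_row, host_col, stream).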
...
@@ -466,33 +431,28 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
                          hl_stream_t stream) {
  CHECK_NOTNULL(csc_matrix);
  CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC)
      << "csc_matrix is not csc format error!";

  hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix);
  CHECK_LE(csc_matrix->nnz, csc->nnz_s)
      << "copy size " << csc_matrix->nnz
      << " is big than alloc size " << csc->nnz_s;

  CHECK_LE((csc_matrix->cols + 1), csc->col_s)
      << "copy size " << (csc_matrix->cols + 1)
      << " is big than alloc size " << csc->col_s;

  CHECK(csc_matrix->type == HL_FLOAT_VALUE || csc_matrix->type == HL_NO_VALUE)
      << "sparse matrix value type error!";

  if (csc_matrix->type == HL_NO_VALUE) {
    if (csc_row == NULL && csc_col == NULL) {
      return;
    } else if (csc_row != NULL && csc_col != NULL) {
      hl_memcpy_async(
          csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream);
      hl_memcpy_async(
          csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream);
    } else {
      LOG(FATAL) << "parameter csc_row or csc_col is null pointer!";
    }
...
@@ -500,30 +460,21 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
    if (csc_val == NULL && csc_row == NULL && csc_col == NULL) {
      return;
    } else if (csc_val != NULL && csc_row == NULL && csc_col == NULL) {
      hl_memcpy_async(
          csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream);
    } else if (csc_val != NULL && csc_row != NULL && csc_col != NULL) {
      hl_memcpy_async(
          csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream);
      hl_memcpy_async(
          csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream);
      hl_memcpy_async(
          csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream);
    } else {
      LOG(FATAL) << "parameter csc_row or csc_col is null pointer!";
    }
  }

  csc->sparsity = ((float)csc_matrix->nnz) / ((float)csc_matrix->rows) /
                  ((float)csc_matrix->cols);
}
...
@@ -531,32 +482,23 @@ void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
                             hl_sparse_matrix_s src,
                             hl_stream_t stream) {
  CHECK(dst && src && dst->matrix && src->matrix)
      << "parameter dst or src is null pointer!";
  CHECK_EQ(dst->format, src->format)
      << "sparse matrix format does not match!";
  CHECK(dst->type != HL_FLOAT_VALUE || src->type != HL_NO_VALUE)
      << "src sparse matrix is no value, dst sparse matrix has value!";

  if (dst->format == HL_SPARSE_CSR) {
    dst->rows = src->rows;
    dst->cols = src->cols;
    dst->nnz = src->nnz;

    hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
    hl_memcpy_csr_matrix(
        dst, csr->csr_val, csr->csr_row, csr->csr_col, stream);
  } else if (dst->format == HL_SPARSE_CSC) {
    dst->rows = src->rows;
    dst->cols = src->cols;
    dst->nnz = src->nnz;

    hl_csc_matrix csc = (hl_csc_matrix)src->matrix;
    hl_memcpy_csc_matrix(
        dst, csc->csc_val, csc->csc_row, csc->csc_col, stream);
  } else {
    LOG(FATAL) << "sparse matrix format error!";
  }
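A call-site sketch for the device-to-device copy above, assuming the destination is allocated with matching format and capacity; HPPL_STREAM_DEFAULT is assumed here to be the library's default stream handle and does not appear in this diff:

// Sketch: clone a float-valued CSR matrix on the device.
hl_sparse_matrix_s clone_csr(hl_sparse_matrix_s src_mat) {
  hl_sparse_matrix_s dst_mat = NULL;
  hl_malloc_sparse_matrix(&dst_mat, HL_SPARSE_CSR, HL_FLOAT_VALUE,
                          src_mat->rows, src_mat->cols, src_mat->nnz);
  hl_memcpy_sparse_matrix(dst_mat, src_mat, HPPL_STREAM_DEFAULT);
  return dst_mat;
}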
...
@@ -569,20 +511,24 @@ static void _beta_mul_c(real *c, int dimM, int dimN, real beta) {
  if (beta == 0.0) {
    hl_gpu_apply_unary_op(unary::Zero<real>(), c, dimM, dimN, dimN);
  } else {
    if (beta != 1.0) {
      hl_gpu_apply_unary_op(
          unary::mul_scalar<real>(beta), c, dimM, dimN, dimN);
    }
  }

  return;
}

void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
                             hl_trans_op_t transa,
                             real *B_d,
                             hl_trans_op_t transb,
                             real *C_d,
                             int dimM,
                             int dimN,
                             int dimK,
                             real alpha,
                             real beta) {
  CHECK_EQ(transb, HPPL_OP_N);
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
...
@@ -592,7 +538,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
  if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) ||
      (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) {
    LOG(FATAL) << "parameter error!";
  }

  if (A_d->nnz == 0) {
...
@@ -603,8 +549,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
  /* nnz != 0 */
  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
  if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) ||
      A_d2->csr_row == NULL || A_d2->csr_col == NULL) {
    LOG(FATAL) << "parameter error!";
  }
...
@@ -617,63 +562,63 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
    /* sparsity pattern */
    // A_d->sparsity;
    if (A_d->type == HL_NO_VALUE) {
      KeSMatrixCsrMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d2->csr_val, A_d2->csr_col, A_d2->csr_row, B_d,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixCsrMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d2->csr_val, A_d2->csr_col, A_d2->csr_row, B_d,
          dimM, dimN, dimK, alpha, beta);
    }
  } else if (HPPL_OP_T == transa) {
    _beta_mul_c(C_d, dimM, dimN, beta);

    int blocksX =
        (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
    int blocksY =
        (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K;
    dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y);
    dim3 grid(blocksX, blocksY);

    if (A_d->type == HL_NO_VALUE) {
      KeSMatrixCscMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d2->csr_val, A_d2->csr_col, A_d2->csr_row, B_d,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixCscMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d2->csr_val, A_d2->csr_col, A_d2->csr_row, B_d,
          dimM, dimN, dimK, alpha, beta);
    }
  } else {
    LOG(FATAL) << "parameter transa error!";
...
@@ -682,11 +627,16 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
  CHECK_SYNC("hl_matrix_csr_mul_dense failed");
}

void hl_matrix_dense_mul_csc(real *A_d,
                             hl_trans_op_t transa,
                             hl_sparse_matrix_s B_d,
                             hl_trans_op_t transb,
                             real *C_d,
                             int dimM,
                             int dimN,
                             int dimK,
                             real alpha,
                             real beta) {
  CHECK_EQ(transa, HPPL_OP_N);
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
...
@@ -698,8 +648,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
    LOG(FATAL) << "parameter dims error!";
  }

  CHECK_EQ(B_d->format, HL_SPARSE_CSC) << "matrix format error!";

  if (B_d->nnz == 0) {
    _beta_mul_c(C_d, dimM, dimN, beta);
...
@@ -709,8 +658,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
  /* nnz != 0 */
  hl_csc_matrix B_d2 = (hl_csc_matrix)(B_d->matrix);
  if ((B_d2->csc_val == NULL && B_d->type != HL_NO_VALUE) ||
      B_d2->csc_row == NULL || B_d2->csc_col == NULL) {
    LOG(FATAL) << "parameter B is null!";
  }
...
@@ -721,60 +669,60 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
    dim3 grid(blocksX, blocksY);

    if (B_d->type == HL_NO_VALUE) {
      KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csc_val, B_d2->csc_row, B_d2->csc_col,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csc_val, B_d2->csc_row, B_d2->csc_col,
          dimM, dimN, dimK, alpha, beta);
    }
  } else if (transb == HPPL_OP_T) {
    _beta_mul_c(C_d, dimM, dimN, beta);
    int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
    int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
    dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
    dim3 grid(blocksX, blocksY);

    if (B_d->type == HL_NO_VALUE) {
      KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csc_val, B_d2->csc_col, B_d2->csc_row,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csc_val, B_d2->csc_col, B_d2->csc_row,
          dimM, dimN, dimK, alpha, beta);
    }
  } else {
    LOG(FATAL) << "parameter transb error!";
...
@@ -783,24 +731,28 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
  CHECK_SYNC("hl_matrix_dense_mul_csc failed");
}

void hl_matrix_dense_mul_csr(real *A_d,
                             hl_trans_op_t transa,
                             hl_sparse_matrix_s B_d,
                             hl_trans_op_t transb,
                             real *C_d,
                             int dimM,
                             int dimN,
                             int dimK,
                             real alpha,
                             real beta) {
  CHECK_EQ(transa, HPPL_OP_N);
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
  CHECK_NOTNULL(C_d);

  if (dimM <= 0 || dimN <= 0 || dimK <= 0 ||
      (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) ||
      (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) {
    LOG(FATAL) << "parameter dims error!";
  }

  CHECK_EQ(B_d->format, HL_SPARSE_CSR) << "matrix format error!";

  if (B_d->nnz == 0) {
    _beta_mul_c(C_d, dimM, dimN, beta);
...
@@ -810,41 +762,40 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
  /* nnz != 0 */
  hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix);
  if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) ||
      B_d2->csr_row == NULL || B_d2->csr_col == NULL) {
    LOG(FATAL) << "parameter transa error!";
  }

  if (transb == HPPL_OP_N) {
    _beta_mul_c(C_d, dimM, dimN, beta);
    int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
    int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
    dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
    dim3 grid(blocksX, blocksY);

    if (B_d->type == HL_NO_VALUE) {
      KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csr_val, B_d2->csr_row, B_d2->csr_col,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csr_val, B_d2->csr_row, B_d2->csr_col,
          dimM, dimN, dimK, alpha, beta);
    }
  } else if (transb == HPPL_OP_T) {
    int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST;
...
@@ -852,29 +803,29 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
    dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST);
    dim3 grid(blocksX, blocksY);

    if (B_d->type == HL_NO_VALUE) {
      KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csr_val, B_d2->csr_col, B_d2->csr_row,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csr_val, B_d2->csr_col, B_d2->csr_row,
          dimM, dimN, dimK, alpha, beta);
    }
  } else {
    LOG(FATAL) << "parameter transb error!";
...
@@ -883,11 +834,16 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
  CHECK_SYNC("hl_matrix_dense_mul_csr failed");
}

void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
                             hl_trans_op_t transa,
                             real *B_d,
                             hl_trans_op_t transb,
                             real *C_d,
                             int dimM,
                             int dimN,
                             int dimK,
                             real alpha,
                             real beta) {
  CHECK_EQ(transb, HPPL_OP_N);
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
...
@@ -908,42 +864,43 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
  /* nnz != 0 */
  hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix);
  if ((A_d2->csc_val == NULL && A_d->type != HL_NO_VALUE) ||
      A_d2->csc_row == NULL || A_d2->csc_col == NULL) {
    LOG(FATAL) << "parameter error!";
  }

  if (HPPL_OP_N == transa) {
    _beta_mul_c(C_d, dimM, dimN, beta);

    int blocksX =
        (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
    int blocksY =
        (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K;
    dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y);
    dim3 grid(blocksX, blocksY);

    if (A_d->type == HL_NO_VALUE) {
      KeSMatrixCscMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, B_d,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixCscMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, B_d,
          dimM, dimN, dimK, alpha, beta);
    }
  } else if (HPPL_OP_T == transa) {
    int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N;
...
@@ -954,29 +911,29 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
    /* sparsity pattern */
    // A_d->sparsity;
    if (A_d->type == HL_NO_VALUE) {
      KeSMatrixCsrMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, B_d,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixCsrMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, B_d,
          dimM, dimN, dimK, alpha, beta);
    }
  } else {
    LOG(FATAL) << "parameter transa error!";
...
@@ -985,11 +942,16 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
  CHECK_SYNC("hl_matrix_csc_mul_dense failed");
}

void hl_sparse_matrix_mul(real *A_d,
                          hl_trans_op_t transa,
                          real *B_d,
                          hl_trans_op_t transb,
                          hl_sparse_matrix_s C_d,
                          int dimM,
                          int dimN,
                          int dimK,
                          real alpha,
                          real beta) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
  CHECK_NOTNULL(C_d);
...
@@ -1000,18 +962,14 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
  if (C_d->format == HL_SPARSE_CSC) {
    hl_csc_matrix C_d2 = (hl_csc_matrix)(C_d->matrix);
    if (C_d2->csc_val == NULL || C_d2->csc_row == NULL ||
        C_d2->csc_col == NULL) {
      LOG(FATAL) << "parameter error!";
    }

    if (beta != 1.0) {
      hl_gpu_apply_unary_op(
          unary::mul_scalar<real>(beta), C_d2->csc_val, 1, C_d->nnz, C_d->nnz);
    }

    int blocksX = dimN;
...
@@ -1020,34 +978,30 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
    dim3 grid(blocksX, blocksY);
    bool transA = transa == HPPL_OP_T ? 1 : 0;
    bool transB = transb == HPPL_OP_T ? 1 : 0;
    KeSMatrixDenseMulDense2CSC<<<grid, threads, 0, STREAM_DEFAULT>>>(
        C_d2->csc_val, C_d2->csc_row, C_d2->csc_col, A_d, B_d,
        transA, transB, dimM, dimN, dimK, alpha, beta);
    CHECK_SYNC("hl_sparse_matrix_mul failed");
  } else {
    hl_csr_matrix C_d2 = (hl_csr_matrix)(C_d->matrix);
    if ((C_d2->csr_val == NULL && C_d->type != HL_NO_VALUE) ||
        C_d2->csr_row == NULL || C_d2->csr_col == NULL) {
      LOG(FATAL) << "parameter error!";
    }

    if (beta != 1.0) {
      hl_gpu_apply_unary_op(
          unary::mul_scalar<real>(beta), C_d2->csr_val, 1, C_d->nnz, C_d->nnz);
    }

    bool transA = transa == HPPL_OP_T ? 1 : 0;
...
@@ -1058,20 +1012,20 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
      dim3 threads(CU_CSCMM_DMD2CSR_THREAD_X, 1);
      dim3 grid(blocksX, blocksY);

      KeSMatrixDenseMulDense2CSR<<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d2->csr_val, C_d2->csr_row, C_d2->csr_col, A_d, B_d,
          transA, transB, dimM, dimN, dimK, alpha, beta);
      CHECK_SYNC("hl_sparse_matrix_mul failed");
    } else {
      CHECK(!transA) << "Not supported A is trans and B is not trans!";
...
@@ -1080,21 +1034,21 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
      avgNnzPerRow = avgNnzPerRow > 0 ? avgNnzPerRow : 1;
      int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE);
      dim3 grid(gridx, dimM);
      KeSMatrixDenseMulDenseTrans2CSR<<<grid, block, 0, STREAM_DEFAULT>>>(
          C_d2->csr_val, C_d2->csr_row, C_d2->csr_col, A_d, B_d,
          transA, transB, dimM, dimN, dimK, alpha, beta);
      CHECK_SYNC("hl_sparse_matrix_mul failed");
    }
  }
}
...
@@ -1111,7 +1065,7 @@ void hl_memcpy_from_csc_matrix(real *csc_val,
  CHECK_NOTNULL(csc_col);
  CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC)
      << "csc_matrix is not csc format error!";

  if (csc_matrix->nnz > row_size ||
      csc_matrix->cols + 1 > static_cast<int>(col_size)) {
...
@@ -1119,20 +1073,20 @@ void hl_memcpy_from_csc_matrix(real *csc_val,
  }

  hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix);
  hl_memcpy_async((void *)csc_row,
                  (void *)csc->csc_row,
                  (csc_matrix->nnz) * sizeof(int),
                  stream);
  hl_memcpy_async((void *)csc_col,
                  (void *)csc->csc_col,
                  (csc_matrix->cols + 1) * sizeof(int),
                  stream);

  if (csc_matrix->type == HL_FLOAT_VALUE) {
    if (csc_val != NULL) {
      CHECK_LE(csc_matrix->nnz, val_size) << "size not match!";
      hl_memcpy_async((void *)csc_val,
                      (void *)csc->csc_val,
                      (csc_matrix->nnz) * sizeof(real),
                      stream);
    } else {
      LOG(FATAL) << "parameter csr_val is null pointer!";
...
@@ -1152,7 +1106,7 @@ void hl_memcpy_from_csr_matrix(real *csr_val,
  CHECK_NOTNULL(csr_row);
  CHECK_NOTNULL(csr_col);
  CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR)
      << "csr_matrix is not csr format error!";

  if (csr_matrix->nnz > col_size ||
      csr_matrix->rows + 1 > static_cast<int>(row_size)) {
...
@@ -1160,20 +1114,20 @@ void hl_memcpy_from_csr_matrix(real *csr_val,
  }

  hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix);
  hl_memcpy_async((void *)csr_row,
                  (void *)csr->csr_row,
                  (csr_matrix->rows + 1) * sizeof(int),
                  stream);
  hl_memcpy_async((void *)csr_col,
                  (void *)csr->csr_col,
                  (csr_matrix->nnz) * sizeof(int),
                  stream);

  if (csr_matrix->type == HL_FLOAT_VALUE) {
    if (csr_val != NULL) {
      CHECK_LE(csr_matrix->nnz, val_size) << "size not match!";
      hl_memcpy_async((void *)csr_val,
                      (void *)csr->csr_val,
                      (csr_matrix->nnz) * sizeof(real),
                      stream);
    } else {
      LOG(FATAL) << "parameter csr_val is null pointer!";
...
@@ -1181,8 +1135,8 @@ void hl_memcpy_from_csr_matrix(real *csr_val,
  }
}

void hl_sparse_matrix_column_sum(
    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
  if (B_d->format == HL_SPARSE_CSR) {
    hl_matrix_csr_column_sum(A_d, B_d, dimM, dimN, scale);
  } else {
...
@@ -1190,8 +1144,8 @@ void hl_sparse_matrix_column_sum(real* A_d, hl_sparse_matrix_s B_d, int dimM,
  }
}

void hl_matrix_csr_column_sum(
    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
...
@@ -1216,8 +1170,7 @@ void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d,
...
@@ -1216,8 +1170,7 @@ void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d,
CHECK_SYNC
(
"hl_matrix_csr_column_sum failed"
);
CHECK_SYNC
(
"hl_matrix_csr_column_sum failed"
);
}
}
void
hl_sparse_matrix_add_bias
(
hl_sparse_matrix_s
A_d
,
void
hl_sparse_matrix_add_bias
(
hl_sparse_matrix_s
A_d
,
real
*
B_d
,
real
scale
)
{
real
*
B_d
,
real
scale
)
{
if
(
A_d
->
format
==
HL_SPARSE_CSR
)
{
if
(
A_d
->
format
==
HL_SPARSE_CSR
)
{
hl_matrix_csr_add_bias
(
A_d
,
B_d
,
scale
);
hl_matrix_csr_add_bias
(
A_d
,
B_d
,
scale
);
}
else
{
}
else
{
...
@@ -1225,8 +1178,7 @@ void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
...
@@ -1225,8 +1178,7 @@ void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
}
}
}
}
void
hl_matrix_csr_add_bias
(
hl_sparse_matrix_s
A_d
,
real
*
B_d
,
void
hl_matrix_csr_add_bias
(
hl_sparse_matrix_s
A_d
,
real
*
B_d
,
real
scale
)
{
real
scale
)
{
CHECK_NOTNULL
(
A_d
);
CHECK_NOTNULL
(
A_d
);
CHECK_NOTNULL
(
B_d
);
CHECK_NOTNULL
(
B_d
);
...
@@ -1247,8 +1199,12 @@ void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d,
...
@@ -1247,8 +1199,12 @@ void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d,
CHECK_SYNC
(
"hl_sparse_matrix_add_bias failed"
);
CHECK_SYNC
(
"hl_sparse_matrix_add_bias failed"
);
}
}
void
hl_sparse_matrix_add_dense
(
hl_sparse_matrix_s
A_d
,
real
*
B_d
,
int
dimM
,
void
hl_sparse_matrix_add_dense
(
hl_sparse_matrix_s
A_d
,
int
dimN
,
real
alpha
,
real
beta
)
{
real
*
B_d
,
int
dimM
,
int
dimN
,
real
alpha
,
real
beta
)
{
if
(
A_d
->
format
==
HL_SPARSE_CSR
)
{
if
(
A_d
->
format
==
HL_SPARSE_CSR
)
{
hl_matrix_csr_add_dense
(
A_d
,
B_d
,
dimM
,
dimN
,
alpha
,
beta
);
hl_matrix_csr_add_dense
(
A_d
,
B_d
,
dimM
,
dimN
,
alpha
,
beta
);
}
else
{
}
else
{
...
@@ -1256,8 +1212,12 @@ void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, real *B_d, int dimM,
...
@@ -1256,8 +1212,12 @@ void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, real *B_d, int dimM,
}
}
}
}
void
hl_matrix_csr_add_dense
(
hl_sparse_matrix_s
A_d
,
real
*
B_d
,
int
dimM
,
void
hl_matrix_csr_add_dense
(
hl_sparse_matrix_s
A_d
,
int
dimN
,
real
alpha
,
real
beta
)
{
real
*
B_d
,
int
dimM
,
int
dimN
,
real
alpha
,
real
beta
)
{
CHECK_NOTNULL
(
A_d
);
CHECK_NOTNULL
(
A_d
);
CHECK_NOTNULL
(
B_d
);
CHECK_NOTNULL
(
B_d
);
...
@@ -1277,20 +1237,26 @@ void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM,
...
@@ -1277,20 +1237,26 @@ void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM,
gridX
=
gridX
>
0
?
gridX
:
1
;
gridX
=
gridX
>
0
?
gridX
:
1
;
dim3
block
(
512
,
1
);
dim3
block
(
512
,
1
);
dim3
grid
(
gridX
,
dimM
);
dim3
grid
(
gridX
,
dimM
);
KeSMatrixCsrAddDense
<<<
grid
,
block
,
0
,
STREAM_DEFAULT
>>>
(
KeSMatrixCsrAddDense
<<<
grid
,
block
,
0
,
STREAM_DEFAULT
>>>
(
A_d2
->
csr_val
,
A_d2
->
csr_val
,
A_d2
->
csr_row
,
A_d2
->
csr_col
,
B_d
,
alpha
,
beta
,
dimM
,
dimN
);
A_d2
->
csr_row
,
A_d2
->
csr_col
,
B_d
,
alpha
,
beta
,
dimM
,
dimN
);
CHECK_SYNC
(
"hl_sparse_matrix_add_dense failed"
);
CHECK_SYNC
(
"hl_sparse_matrix_add_dense failed"
);
}
}
int
*
hl_sparse_matrix_get_rows
(
hl_sparse_matrix_s
sMat
)
{
int
*
hl_sparse_matrix_get_rows
(
hl_sparse_matrix_s
sMat
)
{
__sparse_get_return__
(
sMat
,
row
);
__sparse_get_return__
(
sMat
,
row
);
}
}
int
*
hl_sparse_matrix_get_cols
(
hl_sparse_matrix_s
sMat
)
{
int
*
hl_sparse_matrix_get_cols
(
hl_sparse_matrix_s
sMat
)
{
__sparse_get_return__
(
sMat
,
col
);
__sparse_get_return__
(
sMat
,
col
);
}
}
real
*
hl_sparse_matrix_get_value
(
hl_sparse_matrix_s
sMat
)
{
real
*
hl_sparse_matrix_get_value
(
hl_sparse_matrix_s
sMat
)
{
__sparse_get_return__
(
sMat
,
val
);
__sparse_get_return__
(
sMat
,
val
);
}
}
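The three accessors above return raw device pointers held inside the sparse handle. A minimal usage sketch, not part of this commit, assuming `sMat` is a CSR handle that was constructed elsewhere with the regular hl_sparse constructors:

// Sketch only: sMat is an existing CSR matrix handle; the accessors expose the
// row-offset, column-index and value arrays that live on the GPU.
#include "hl_sparse.h"  // assumed public header declaring these functions

void use_raw_csr(hl_sparse_matrix_s sMat) {
  int *row_offsets = hl_sparse_matrix_get_rows(sMat);  // rows + 1 entries
  int *col_indices = hl_sparse_matrix_get_cols(sMat);  // nnz entries
  real *values = hl_sparse_matrix_get_value(sMat);     // nnz entries
  // These pointers can be handed straight to a custom kernel, which is what
  // the KeSMatrixCsrAddDense launch above does with A_d2->csr_val/row/col.
  (void)row_offsets;
  (void)col_indices;
  (void)values;
}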
paddle/cuda/src/hl_perturbation_util.cu
...
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <stdlib.h>
#include <cmath>

#include "hl_base.h"
#include "hl_cuda.h"
#include "hl_perturbation_util.cuh"
#include "hl_time.h"

#define _USE_MATH_DEFINES
...
@@ -30,10 +29,16 @@ limitations under the License. */
 * centerX, centerY: translation.
 * sourceX, sourceY: output coordinates in the original image.
 */
__device__ void getTranformCoord(int x, int y, real theta, real scale,
                                 real tgtCenter, real imgCenter, real centerR,
                                 real centerC, int *sourceX, int *sourceY) {
  real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};

  // compute coornidates in the rotated and scaled image
...
@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
 * created by Wei Xu (genome), converted by Jiang Wang
 */
__global__ void kSamplingPatches(const real *imgs, real *targets, int imgSize,
                                 int tgtSize, const int channels,
                                 int samplingRate, const real *thetas,
                                 const real *scales, const int *centerRs,
                                 const int *centerCs, const real padValue,
                                 const int numImages) {
  const int caseIdx = blockIdx.x * 4 + threadIdx.x;
  const int pxIdx = blockIdx.y * 128 + threadIdx.y;
...
@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
  const int pxY = pxIdx / tgtSize;

  int srcPxX, srcPxY;
  getTranformCoord(pxX, pxY, thetas[imgIdx], scales[imgIdx], tgtCenter,
                   imgCenter, centerCs[caseIdx], centerRs[caseIdx], &srcPxX,
                   &srcPxY);

  imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
...
@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
 *
 * created by Wei Xu
 */
void hl_generate_disturb_params(real *&gpuAngle, real *&gpuScaleRatio,
                                int *&gpuCenterR, int *&gpuCenterC,
                                int numImages, int imgSize, real rotateAngle,
                                real scaleRatio, int samplingRate,
                                bool isTrain) {
  // The number of output samples.
  int numPatches = numImages * samplingRate;
...
@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
  for (int i = 0; i < numImages; i++) {
    r_angle[i] =
        (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0)  // NOLINT
                                        - 0.5);
    s_ratio[i] =
        1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio;  // NOLINT
  }
...
@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
    int pxY =
        (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0));  // NOLINT

    const real H[4] = {cos(-r_angle[i]), -sin(-r_angle[i]), sin(-r_angle[i]),
                       cos(-r_angle[i])};
    real x = pxX - imgCenter;
    real y = pxY - imgCenter;
    real xx = H[0] * x + H[1] * y;
...
@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
  delete[] center_c;
}

void hl_conv_random_disturb_with_params(const real *images, int imgSize,
                                        int tgtSize, int channels,
                                        int numImages, int samplingRate,
                                        const real *gpuRotationAngle,
                                        const real *gpuScaleRatio,
                                        const int *gpuCenterR,
...
@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
  dim3 threadsPerBlock(4, 128);
  dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));

  kSamplingPatches<<<numBlocks, threadsPerBlock>>>(
      images, target, imgSize, tgtSize, channels, samplingRate,
      gpuRotationAngle, gpuScaleRatio, gpuCenterR, gpuCenterC, paddingValue,
      numImages);

  hl_device_synchronize();
}

void hl_conv_random_disturb(const real *images, int imgSize, int tgtSize,
                            int channels, int numImages, real scaleRatio,
                            real rotateAngle, int samplingRate,
                            real *gpu_r_angle, real *gpu_s_ratio,
                            int *gpu_center_r, int *gpu_center_c,
                            int paddingValue, bool isTrain, real *targets) {
  // generate the random disturbance sequence and the sampling locations
  hl_generate_disturb_params(gpu_r_angle, gpu_s_ratio, gpu_center_r,
                             gpu_center_c, numImages, imgSize, rotateAngle,
                             scaleRatio, samplingRate, isTrain);

  hl_conv_random_disturb_with_params(
      images, imgSize, tgtSize, channels, numImages, samplingRate, gpu_r_angle,
      gpu_s_ratio, gpu_center_r, gpu_center_r, paddingValue, targets);
}
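For orientation, a hedged call sketch of the re-wrapped entry point (not part of the commit; every buffer name is illustrative and assumed to be a device allocation of the size implied by the parameters):

// Sketch: perturb a batch of images into sampled target patches.
// numImages * samplingRate patches of tgtSize x tgtSize x channels are written
// to targets_d; the gpu_* scratch buffers hold the per-image random parameters.
#include "hl_base.h"  // for the `real` typedef (assumed)

void disturb_batch(const real *images_d, real *targets_d, real *r_angle_d,
                   real *s_ratio_d, int *center_r_d, int *center_c_d) {
  const int imgSize = 32, tgtSize = 24, channels = 3;
  const int numImages = 128, samplingRate = 2, paddingValue = 0;
  hl_conv_random_disturb(images_d, imgSize, tgtSize, channels, numImages,
                         /*scaleRatio=*/0.1, /*rotateAngle=*/15.0, samplingRate,
                         r_angle_d, s_ratio_d, center_r_d, center_c_d,
                         paddingValue, /*isTrain=*/true, targets_d);
}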
paddle/cuda/src/hl_table_apply.cu
...
@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_cuda.h"
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"

template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real *output, int ldo, real *table, int ldt,
                                int *ids, int numSamples, int tableSize,
...
@@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  while (idy < numSamples) {
    int tableId = ids[idy];
    if ((0 <= tableId) && (tableId < tableSize)) {
      real *out = output + idy * ldo;
      real *tab = table + tableId * ldt;
      for (int i = idx; i < dim; i += blockDimX) {
        if (AddRow) {
          paddle::paddleAtomicAdd(&tab[i], out[i]);
...
@@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  }
}

void hl_matrix_select_rows(real *output, int ldo, real *table, int ldt,
                           int *ids, int numSamples, int tableSize,
...
@@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, ldo, table, ldt, ids, numSamples, tableSize, dim);

  CHECK_SYNC("hl_matrix_select_rows failed");
}

void hl_matrix_add_to_rows(real *table, int ldt, real *input, int ldi,
                           int *ids, int numSamples, int tableSize,
...
@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
      input, ldi, table, ldt, ids, numSamples, tableSize, dim);

  CHECK_SYNC("hl_matrix_add_to_rows failed");
}

template <class T, int blockDimX, int gridDimX>
__global__ void KeVectorSelect(
    T *dst, int sized, const T *src, int sizes, const int *ids, int sizei) {
  int idx = threadIdx.x + blockDimX * blockIdx.x;
  while (idx < sizei) {
    int index = ids[idx];
...
@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
}

template <class T>
void hl_vector_select_from(
    T *dst, int sized, const T *src, int sizes, const int *ids, int sizei) {
  CHECK_NOTNULL(dst);
  CHECK_NOTNULL(src);
  CHECK_NOTNULL(ids);
...
@@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized,
  dim3 threads(512, 1);
  dim3 grid(8, 1);
  KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
      dst, sized, src, sizes, ids, sizei);

  CHECK_SYNC("hl_vector_select_from failed");
}

template void hl_vector_select_from(real *dst, int sized, const real *src,
                                    int sizes, const int *ids, int sizei);
template void hl_vector_select_from(
    int *dst, int sized, const int *src, int sizes, const int *ids, int sizei);
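A short usage sketch of the row-selection wrapper (illustrative only, not from the diff; the trailing `dim` argument and packed strides follow the kernel launch shown above):

// Embedding-lookup style call: copy row ids_d[i] of table_d into row i of
// output_d. For densely packed rows the strides ldo/ldt are simply dim.
// The declaration of hl_matrix_select_rows is assumed to be in scope.
#include <cuda_runtime.h>
#include "hl_base.h"  // for `real` (assumed)

void select_rows_example(real *table_d, int tableSize, int dim, int *ids_d,
                         int numSamples) {
  real *output_d = nullptr;
  cudaMalloc(&output_d, numSamples * dim * sizeof(real));

  hl_matrix_select_rows(output_d, /*ldo=*/dim, table_d, /*ldt=*/dim, ids_d,
                        numSamples, tableSize, dim);
  // hl_matrix_add_to_rows is the scatter-add counterpart (used for gradients):
  // it accumulates rows of an input matrix back into table_d at ids_d.

  cudaFree(output_d);
}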
paddle/cuda/src/hl_top_k.cu
...
@@ -12,45 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_sparse.ph"
#include "hl_top_k.h"
#include "paddle/utils/Logging.h"

// using namespace hppl;

struct Pair {
  __device__ __forceinline__ Pair() {}

  __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {}

  __device__ __forceinline__ void set(real value, int id) {
    v_ = value;
    id_ = id;
  }

  __device__ __forceinline__ void operator=(const Pair &in) {
    v_ = in.v_;
    id_ = in.id_;
  }

  __device__ __forceinline__ bool operator<(const real value) const {
    return (v_ < value);
  }

  __device__ __forceinline__ bool operator<(const Pair &in) const {
    return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_));
  }

  __device__ __forceinline__ bool operator>(const Pair &in) const {
    return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_));
  }
...
@@ -58,8 +50,9 @@ struct Pair {
  int id_;
};

__device__ __forceinline__ void addTo(Pair topK[], const Pair &p,
                                      int beamSize) {
  for (int k = beamSize - 2; k >= 0; k--) {
    if (topK[k] < p) {
      topK[k + 1] = topK[k];
...
@@ -71,9 +64,8 @@ void addTo(Pair topK[], const Pair &p, int beamSize) {
  topK[0] = p;
}

template <int beamSize>
__device__ __forceinline__ void addTo(Pair topK[], const Pair &p) {
  for (int k = beamSize - 2; k >= 0; k--) {
    if (topK[k] < p) {
      topK[k + 1] = topK[k];
...
@@ -85,9 +77,9 @@ void addTo(Pair topK[], const Pair &p) {
  topK[0] = p;
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real *src, int idx, int dim, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < src[idx]) {
      Pair tmp(src[idx], idx);
...
@@ -97,10 +89,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) {
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real *src, int idx, int dim, const Pair &max, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < src[idx]) {
      Pair tmp(src[idx], idx);
...
@@ -112,10 +103,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim,
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real *val, int *col, int idx, int dim, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < val[idx]) {
      Pair tmp(val[idx], col[idx]);
...
@@ -125,10 +115,14 @@ void getTopK(Pair topK[], real *val, int *col,
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(Pair topK[], real *val, int *col,
                                        int idx, int dim, const Pair &max,
                                        int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < val[idx]) {
      Pair tmp(val[idx], col[idx]);
...
@@ -140,12 +134,16 @@ void getTopK(Pair topK[], real *val, int *col, int idx, int dim,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[], int &beam,
                                              int beamSize, real *src,
                                              bool &firstStep, bool &isEmpty,
                                              Pair &max, int dim,
                                              const int tid) {
  if (beam > 0) {
    int length = beam < beamSize ? beam : beamSize;
    if (firstStep) {
...
@@ -160,8 +158,7 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
      }
    }
    if (!isEmpty) {
      getTopK<blockSize>(topK + maxLength - beam, src, tid, dim, max, length);
    }
  }
...
@@ -171,12 +168,17 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[], int &beam,
                                              int beamSize, real *val,
                                              int *col, bool &firstStep,
                                              bool &isEmpty, Pair &max,
                                              int dim, const int tid) {
  if (beam > 0) {
    int length = beam < beamSize ? beam : beamSize;
    if (firstStep) {
...
@@ -191,8 +193,8 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
      }
    }
    if (!isEmpty) {
      getTopK<blockSize>(
          topK + maxLength - beam, val, col, tid, dim, max, length);
    }
  }
...
@@ -202,12 +204,16 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void blockReduce(Pair *shTopK, int *maxId,
                                            Pair topK[], real **topVal,
                                            int **topIds, int &beam,
                                            int &beamSize, const int tid,
                                            const int warp) {
  while (true) {
    __syncthreads();
    if (tid < blockSize / 2) {
...
@@ -218,7 +224,7 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
    }
    __syncthreads();
    for (int stride = blockSize / 4; stride > 0; stride = stride / 2) {
      if (tid < stride) {
        if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) {
          maxId[tid] = maxId[tid + stride];
...
@@ -257,10 +263,12 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
 * 3. go to the second setp, until one thread's topK value is null;
 * 4. go to the first setp, until get the topK value.
 */
template <int maxLength, int blockSize>
__global__ void KeMatrixTopK(real *topVal, int ldv, int *topIds, real *src,
                             int lds, int dim, int beamSize) {
  __shared__ Pair shTopK[blockSize];
...
@@ -271,7 +279,7 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
  topVal += blockIdx.x * ldv;
  topIds += blockIdx.x * beamSize;

  Pair topK[maxLength];  // NOLINT
  int beam = maxLength;
  Pair max;
  bool isEmpty = false;
...
@@ -281,18 +289,19 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
    topK[k].set(-HL_FLOAT_MAX, -1);
  }
  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }
}

template <int maxLength, int blockSize>
__global__ void KeSMatrixTopK(real *topVal, int ldv, int *topIds, real *val,
                              int *row, int *col,
...
@@ -304,7 +313,7 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
  topVal += blockIdx.x * ldv;
  topIds += blockIdx.x * beamSize;

  Pair topK[maxLength];  // NOLINT
  int beam = maxLength;
  Pair max;
  bool isEmpty = false;
...
@@ -330,18 +339,20 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
    topK[k].set(-HL_FLOAT_MAX, -1);
  }
  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }
}

void hl_matrix_top_k(real *topVal, int ldv, int *topIds, real *src, int lds,
                     int dim, int beamSize, int numSamples) {
...
@@ -353,33 +364,32 @@ void hl_matrix_top_k(real* topVal, int ldv,
  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, src, lds, dim, beamSize);

  CHECK_SYNC("hl_matrix_top_k failed");
}

void hl_sparse_matrix_top_k(real *topVal, int ldv, int *topIds,
                            hl_sparse_matrix_s src, int beamSize,
                            int numSamples) {
  CHECK_NOTNULL(topVal);
  CHECK_NOTNULL(topIds);
  CHECK_NOTNULL(src);
  CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!";

  hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
  if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) {
    LOG(FATAL) << "parameter src is null!";
  }

  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeSMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);

  CHECK_SYNC("hl_sparse_matrix_top_k failed");
}
...
@@ -392,10 +402,12 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv,
 * 3. go to the second setp, until one thread's topK value is null;
 * 4. go to the first setp, until get the topK value.
 */
template <int maxLength, int blockSize>
__global__ void KeMatrixTopKClassificationError(real *topVal, int ldv,
                                                int *topIds, real *src,
                                                int lds, int dim, int beamSize,
                                                int *label,
...
@@ -408,7 +420,7 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
  topVal += blockIdx.x * ldv;
  topIds += blockIdx.x * beamSize;

  Pair topK[maxLength];  // NOLINT
  int beam = maxLength;
  Pair max;
  bool isEmpty = false;
...
@@ -420,34 +432,36 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
  }
  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }

  __syncthreads();
  if (tid == 0) {
    for (int i = 0; i < topkSize; i++) {
      if (*--topIds == label[blockIdx.x]) {
        recResult[blockIdx.x] = 0;
        break;
      }
      recResult[blockIdx.x] = 1.0f;
    }
  }
}

void hl_matrix_classification_error(real *topVal, int ldv, int *topIds,
                                    real *src, int lds, int dim, int topkSize,
                                    int numSamples, int *label,
                                    real *recResult) {
  CHECK_NOTNULL(topVal);
  CHECK_NOTNULL(topIds);
  CHECK_NOTNULL(src);
...
@@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv,
  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeMatrixTopKClassificationError<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);

  CHECK_SYNC("hl_matrix_top_k classification error failed");
}
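A hedged sketch of calling the dense top-k entry point above (buffer names are illustrative; filling src_d is assumed to happen elsewhere):

// One block per sample row; ldv/lds are the row strides of the packed
// matrices. The host wrapper instantiates the kernel with maxLength = 5, so a
// beamSize of at most 5 is the safe choice here.
#include <cuda_runtime.h>
#include "hl_base.h"   // for `real` (assumed)
#include "hl_top_k.h"  // assumed to declare hl_matrix_top_k

void top5_per_row(real *src_d, int numSamples, int dim) {
  const int beamSize = 5;
  real *topVal_d = nullptr;
  int *topIds_d = nullptr;
  cudaMalloc(&topVal_d, numSamples * beamSize * sizeof(real));
  cudaMalloc(&topIds_d, numSamples * beamSize * sizeof(int));

  hl_matrix_top_k(topVal_d, /*ldv=*/beamSize, topIds_d, src_d, /*lds=*/dim, dim,
                  beamSize, numSamples);

  cudaFree(topVal_d);
  cudaFree(topIds_d);
}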
paddle/framework/attr_type.proto
...
@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

syntax = "proto2";
package paddle.framework;

// Attribute Type for paddle's Op.
// Op contains many attributes. Each type of attributes could be different.
// The AttrType will be shared between AttrDesc and AttrProto.
enum AttrType {
  INT = 0;
  FLOAT = 1;
  STRING = 2;
  INTS = 3;
  FLOATS = 4;
  STRINGS = 5;
}
\ No newline at end of file
paddle/framework/op_desc.proto
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

syntax = "proto2";
package paddle.framework;

import "attr_type.proto";
...
@@ -22,14 +22,14 @@ import "attr_type.proto";
//
// e.g, for scale=3.0: name=scala, type=AttrType.FLOAT, value=3.0
message AttrDesc {
  required string name = 1;
  required AttrType type = 2;
  optional int32 i = 3;
  optional float f = 4;
  optional string s = 5;
  repeated int32 ints = 6;
  repeated float floats = 7;
  repeated string strings = 8;
};

// Protocol Message to describe an Operator.
...
@@ -42,15 +42,15 @@ message AttrDesc {
// 3rd-party language can build this proto message and call
// AddOp(const OpDesc& op_desc) of Paddle core to create an Operator.
message OpDesc {
  // input names of this Operator.
  repeated string inputs = 1;

  // output names of this Operator.
  repeated string outputs = 2;

  // type of this Operator, such as "add", "sub", "fc".
  required string type = 3;

  // Attributes of this Operator. e.g., scale=3.0 in cosine op.
  repeated AttrDesc attrs = 4;
};
\ No newline at end of file
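As a hedged illustration of how a binding would fill this message from C++ with the standard protoc-generated API (the generated header path, op type and names below are made up for the example):

#include "paddle/framework/op_desc.pb.h"  // generated from op_desc.proto (assumed path)

paddle::framework::OpDesc MakeCosineOpDesc() {
  paddle::framework::OpDesc desc;
  desc.set_type("cosine");  // hypothetical op type
  desc.add_inputs("X");
  desc.add_outputs("Out");

  // scale=3.0, matching the AttrDesc example in the comment above.
  paddle::framework::AttrDesc *attr = desc.add_attrs();
  attr->set_name("scale");
  attr->set_type(paddle::framework::FLOAT);
  attr->set_f(3.0f);
  return desc;
}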
paddle/framework/op_proto.proto
...
@@ -15,10 +15,11 @@ limitations under the License. */

// Protocol Message for 3rd-party language binding.
//
// Paddle Python package will use `OpProto` to generate op creation methods.
// The op creation methods take user's input and generate `OpDesc` proto
// message,
// then pass `OpDesc` to C++ side and create Op pointer.
//
syntax = "proto2";
package paddle.framework;

import "attr_type.proto";
...
@@ -26,89 +27,90 @@ import "attr_type.proto";
// Attribute protocol message for 3rd-party language binding.
// It will store the Op support what attribute and what type.
message AttrProto {
  // Supported attribute name. e.g. `scale` for cosine op.
  required string name = 1;

  // Supported attribute type.
  required AttrType type = 2;

  // Supported attribute comments. It helps 3rd-party language generate
  // doc-string.
  required string comment = 3;

  // If that attribute is generated, it means the Paddle third language
  // binding has responsibility to fill that attribute. End-User should
  // not set that attribute.
  optional bool generated = 4 [ default = false ];
}

// Input or output message for 3rd-party language binding.
// It contains parameter name and its comments.
message VarProto {
  // Input or output name in that op creation function.
  // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names.
  required string name = 1;

  // The comment for that input. It helps 3rd-party language generate
  // doc-string.
  required string comment = 2;

  // Is that input/output could be a list or not.
  // If so, that Op should write a attributed named `input_format` or
  // `output_format`.
  //
  // e.g.
  // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W`
  // could be multiple, so the multiple of `X` and `W` is True, and OpDesc
  // will hold a attribute of them.
  //
  // The Op desc of same fc could be
  // {
  //   "type": "fc",
  //   "input": ["X1", "X2", "W1", "W2", "b"],
  //   "output": "fc.out",
  //   "attrs" : {
  //     "input_format": [0, 2, 4, 5]
  //   }
  // }
  //
  optional bool multiple = 3 [ default = false ];

  // It marks that output is a temporary output. That output is not used by
  // user, but used by other op internally as input. If other op is not use
  // that output, it could be optimized early.
  //
  // Attribute temporary_index will be set in OpDesc if there is some
  // outputs are temporary.
  //
  // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"],
  // attrs = {
  //   "temporary_index": [1]
  // }
  optional bool temporary = 4 [ default = false ];

  // The gradient of operator can be ignored immediately
  // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2
  // can be ignored for the future optimized on graph.
  optional bool ignore_gradient = 6;
}

// Op protocol message for 3rd-party language binding.
// It contains all information for generating op creation method.
message OpProto {
  // The input information to generate op creation method.
  repeated VarProto inputs = 1;

  // The output information to generate op creation method.
  repeated VarProto outputs = 2;

  // The attribute information to generate op creation method.
  repeated AttrProto attrs = 3;

  // The comments for that Op. It helps 3rd-party language generate
  // doc-string. The whole documentation of that Op is generated by comment,
  // inputs, outputs, attrs together.
  required string comment = 4;

  // The type of that Op.
  required string type = 5;
}
paddle/function/ContextProjectionOpGpu.cu
...
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "hl_base.h"
#include "ContextProjectionOp.h"
#include "ContextProjectionOp.h"
#include "hl_base.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input,
...
@@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input,
int
block_size
=
blockDim
.
x
;
int
block_size
=
blockDim
.
x
;
int
sequenceId
=
blockIdx
.
x
;
int
sequenceId
=
blockIdx
.
x
;
int
seq_start
=
sequence
[
sequenceId
];
int
seq_start
=
sequence
[
sequenceId
];
int
seq_end
=
sequence
[
sequenceId
+
1
];
int
seq_end
=
sequence
[
sequenceId
+
1
];
real
value
=
0
;
real
value
=
0
;
int
instances
=
seq_end
-
seq_start
+
context_length
-
1
;
int
instances
=
seq_end
-
seq_start
+
context_length
-
1
;
...
@@ -49,8 +49,9 @@ __global__ void KeContextProjectionForward(const real* input,
...
@@ -49,8 +49,9 @@ __global__ void KeContextProjectionForward(const real* input,
}
else
if
((
i
+
context_start
)
>=
(
seq_end
-
seq_start
))
{
}
else
if
((
i
+
context_start
)
>=
(
seq_end
-
seq_start
))
{
if
(
padding
)
{
if
(
padding
)
{
value
=
value
=
weight
[(
begin_pad
+
i
+
context_start
-
(
seq_end
-
seq_start
))
*
weight
[(
begin_pad
+
i
+
context_start
-
(
seq_end
-
seq_start
))
*
input_dim
+
idx
];
input_dim
+
idx
];
}
else
{
}
else
{
continue
;
continue
;
}
}
...
@@ -61,7 +62,7 @@ __global__ void KeContextProjectionForward(const real* input,
...
@@ -61,7 +62,7 @@ __global__ void KeContextProjectionForward(const real* input,
int
outx
=
(
i
-
context_length
)
<
0
?
i
:
(
context_length
-
1
);
int
outx
=
(
i
-
context_length
)
<
0
?
i
:
(
context_length
-
1
);
int
outy
=
(
i
-
context_length
)
<
0
?
0
:
(
i
-
(
context_length
-
1
));
int
outy
=
(
i
-
context_length
)
<
0
?
0
:
(
i
-
(
context_length
-
1
));
real
*
output_r
=
real
*
output_r
=
output
+
outy
*
input_dim
*
context_length
+
outx
*
input_dim
;
output
+
outy
*
input_dim
*
context_length
+
outx
*
input_dim
;
for
(
int
j
=
outy
;
j
<
seq_end
-
seq_start
;
j
++
)
{
for
(
int
j
=
outy
;
j
<
seq_end
-
seq_start
;
j
++
)
{
output_r
[
idx
]
+=
value
;
output_r
[
idx
]
+=
value
;
if
(
j
-
outy
==
outx
)
break
;
if
(
j
-
outy
==
outx
)
break
;
...
@@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input,
...
@@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input,
dim3
grid
(
blocks_x
,
blocks_y
);
dim3
grid
(
blocks_x
,
blocks_y
);
if
(
weight
)
{
if
(
weight
)
{
KeContextProjectionForward
<
true
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
KeContextProjectionForward
<
true
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
(
input
,
sequence
,
weight
,
output
,
input_dim
,
input
,
context_length
,
context_start
,
begin_pad
);
sequence
,
}
else
{
weight
,
KeContextProjectionForward
<
false
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
output
,
(
input
,
sequence
,
weight
,
output
,
input_dim
,
input_dim
,
context_length
,
context_start
,
begin_pad
);
context_length
,
context_start
,
begin_pad
);
}
else
{
KeContextProjectionForward
<
false
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
input
,
sequence
,
weight
,
output
,
input_dim
,
context_length
,
context_start
,
begin_pad
);
}
}
CHECK_SYNC
(
"hl_context_projection_forward failed"
);
CHECK_SYNC
(
"hl_context_projection_forward failed"
);
}
}
...
@@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
...
@@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
int
block_size
=
blockDim
.
x
;
int
block_size
=
blockDim
.
x
;
int
sequenceId
=
blockIdx
.
x
;
int
sequenceId
=
blockIdx
.
x
;
int
seq_start
=
sequence
[
sequenceId
];
int
seq_start
=
sequence
[
sequenceId
];
int
seq_end
=
sequence
[
sequenceId
+
1
];
int
seq_end
=
sequence
[
sequenceId
+
1
];
real
value
=
0
;
real
value
=
0
;
int
instances
=
seq_end
-
seq_start
+
context_length
-
1
;
int
instances
=
seq_end
-
seq_start
+
context_length
-
1
;
...
@@ -170,7 +183,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
...
@@ -170,7 +183,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
int
outx
=
(
i
-
context_length
)
<
0
?
i
:
(
context_length
-
1
);
int
outx
=
(
i
-
context_length
)
<
0
?
i
:
(
context_length
-
1
);
int
outy
=
(
i
-
context_length
)
<
0
?
0
:
(
i
-
(
context_length
-
1
));
int
outy
=
(
i
-
context_length
)
<
0
?
0
:
(
i
-
(
context_length
-
1
));
real
*
output_r
=
real
*
output_r
=
out
+
outy
*
input_dim
*
context_length
+
outx
*
input_dim
;
out
+
outy
*
input_dim * context_length + outx * input_dim;
    for (int j = outy; j < seq_end - seq_start; j++) {
      value += output_r[idx];
      if (j - outy == outx) break;

@@ -211,8 +224,8 @@ void hl_context_projection_backward_data(const real* out_grad,
  int blocks_y = 1;
  dim3 threads(block_size, 1);
  dim3 grid(blocks_x, blocks_y);
  KeContextProjectionBackwardData<<<grid, threads, 0, STREAM_DEFAULT>>>(
      out_grad, sequence, input_grad, input_dim, context_length, context_start);
  CHECK_SYNC("hl_context_projection_backward_data failed");
}

@@ -231,7 +244,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                 context_start);
}

template <int THREADS_X, int THREADS_Y>
__global__ void KeContextProjectionBackwardWeight(const real* out_grad,
                                                  const int* sequence,
                                                  real* w_grad,

@@ -254,17 +267,17 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
  if (weight_idx < w_dim) {
    for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
      int seq_start = sequence[seqId];
      int seq_end = sequence[seqId + 1];
      output_r =
          const_cast<real*>(out_grad) + seq_start * w_dim * context_length;

      if (context_start < 0) {
        if (padId + context_start < 0) {
          instanceId = padId;
        } else {
          // begin_pad > 0;
          instanceId =
              (padId - begin_pad) + (seq_end - seq_start) - context_start;
        }
      } else {
        if (padId + (seq_end - seq_start) < context_start) {

@@ -275,10 +288,11 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
          }
        }
        int outx =
            (instanceId - context_length) < 0 ? instanceId : (context_length - 1);
        int outy = (instanceId - context_length) < 0
                       ? 0
                       : (instanceId - (context_length - 1));
        output_r += outy * w_dim * context_length + outx * w_dim;
        for (int j = outy; j < seq_end - seq_start; j++) {
          value += output_r[weight_idx];

@@ -290,7 +304,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
    }
    __syncthreads();
    for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) {
      if (idy < stride) {
        sum_s[idy][idx] += sum_s[idy + stride][idx];
      }

@@ -339,22 +353,27 @@ void hl_context_projection_backward_weight(const real* out_grad,
  dim3 threads(threads_x, threads_y);
  dim3 grid(blocks_x, 1);
  KeContextProjectionBackwardWeight<32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
      out_grad, sequence, w_grad, num_sequences, w_dim,
      context_length, context_start, begin_pad);
  CHECK_SYNC("hl_context_projection_backward_weight failed");
}

template <>
void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                      GpuMatrix& w_grad,
                                                      const GpuIVector& seq_vec,
                                                      size_t context_length,
                                                      int context_start,
                                                      size_t total_pad,
                                                      size_t begin_pad) {
  hl_context_projection_backward_weight(out_grad.getData(),
                                        seq_vec.getData(),
                                        w_grad.getData(),

@@ -376,23 +395,18 @@ void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                size_t begin_pad,
                                                bool is_padding,
                                                size_t total_pad) {
  if (in_grad) {
    ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
        out_grad, in_grad, sequence, context_length, context_start);
  }
  if (is_padding && w_grad) {
    ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(out_grad,
                                                     w_grad,
                                                     sequence,
                                                     context_length,
                                                     context_start,
                                                     total_pad,
                                                     begin_pad);
  }
}
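The weight-gradient kernel above folds per-thread partial sums together with a stride-halving loop over shared memory (sum_s[idy][idx] += sum_s[idy + stride][idx]). Below is a minimal, self-contained sketch of that block-level reduction pattern; it is not Paddle code, and the kernel name, sizes, and printf check are invented for illustration.

#include <cstdio>
#include <cuda_runtime.h>

// Toy version of the stride-halving reduction used in the kernel above:
// each thread accumulates a strided partial sum into shared memory, then
// the block halves the active threads each step until element 0 holds the total.
template <int THREADS>
__global__ void blockReduceSum(const float* in, float* out, int n) {
  __shared__ float sum_s[THREADS];
  int tid = threadIdx.x;
  float v = 0;
  for (int i = tid; i < n; i += THREADS) v += in[i];
  sum_s[tid] = v;
  __syncthreads();
  for (int stride = THREADS / 2; stride > 0; stride /= 2) {
    if (tid < stride) sum_s[tid] += sum_s[tid + stride];
    __syncthreads();
  }
  if (tid == 0) *out = sum_s[0];
}

int main() {
  const int n = 1000;
  float h_in[n], h_out = 0;
  for (int i = 0; i < n; ++i) h_in[i] = 1.0f;
  float *d_in, *d_out;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);
  blockReduceSum<256><<<1, 256>>>(d_in, d_out, n);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("sum = %f\n", h_out);  // expect 1000.0
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}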
paddle/function/CosSimOpGpu.cu

@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "CosSimOp.h"
#include "hl_base.h"
#include "hl_device_functions.cuh"

namespace paddle {

template <int block_size>
__global__ void KeCosSim(real* output,
                         const real* input1,
                         const real* input2,

@@ -78,8 +78,8 @@ void hlCossim(real* output,
  dim3 threads(block_size, 1);
  dim3 grid(1, input1_height);
  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, input1, input2, width, input1_height, input2_height, scale);
  CHECK_SYNC("hlCossim failed");
}

@@ -99,7 +99,7 @@ void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
  hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale);
}

template <int block_size>
__global__ void KeCosSimDerivative(const real* grad,
                                   const real* output,
                                   const real* prev_out_x,

@@ -148,14 +148,13 @@ __global__ void KeCosSimDerivative(const real* grad,
  if (xy[0] == 0) {
    real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
    for (int index = tid; index < width; index += block_size) {
      prev_grad_x[index] += scale * grad[ty] * prev_out_y[index] * reciprocal;
      if (input2_height > 1) {
        prev_grad_y[index] += scale * grad[ty] * prev_out_x[index] * reciprocal;
      } else {
        paddle::paddleAtomicAdd(prev_grad_y + index,
                                scale * grad[ty] * prev_out_x[index] * reciprocal);
      }
    }
  } else {

@@ -163,17 +162,18 @@ __global__ void KeCosSimDerivative(const real* grad,
    real reciprocalSquareSumX = 1.0 / xx[0];
    real reciprocalSquareSumY = 1.0 / yy[0];
    for (int index = tid; index < width; index += block_size) {
      prev_grad_x[index] += output[ty] * grad[ty] *
                            (prev_out_y[index] * reciprocalXY -
                             prev_out_x[index] * reciprocalSquareSumX);
      if (input2_height > 1) {
        prev_grad_y[index] += output[ty] * grad[ty] *
                              (prev_out_x[index] * reciprocalXY -
                               prev_out_y[index] * reciprocalSquareSumY);
      } else {
        paddle::paddleAtomicAdd(prev_grad_y + index,
                                output[ty] * grad[ty] *
                                    (prev_out_x[index] * reciprocalXY -
                                     prev_out_y[index] * reciprocalSquareSumY));
      }
    }
  }

@@ -198,9 +198,17 @@ void hlCossimDerivative(const real* grad,
  const int block_size = 256;
  dim3 threads(block_size, 1);
  dim3 grid(1, input1_height);
  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y,
      width, input1_height, input2_height, scale);
  CHECK_SYNC("hlCossimDerivate failed");
}

@@ -214,9 +222,9 @@ void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                     real scale) {
  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ &&
        in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
      << "Matrix types are not equally GPU";

  size_t dim = in1_val.getWidth();
  const real* grad = out_grad.getData();
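In the derivative kernel above, when input2 has a single row shared by every row of input1 (input2_height == 1), several thread blocks accumulate into the same gradient row, so the writes go through paddle::paddleAtomicAdd. The sketch below shows the same idea with plain CUDA atomicAdd; the kernel name, shapes, and host driver are illustrative assumptions, not Paddle's API.

#include <cstdio>
#include <cuda_runtime.h>

// Many gradient rows are accumulated into one shared destination row.
// Because several blocks write the same addresses, the adds must be atomic,
// mirroring the input2_height == 1 branch of KeCosSimDerivative.
__global__ void accumulateSharedRow(const float* grad_rows, float* shared_grad,
                                    int rows, int width) {
  int row = blockIdx.x;
  if (row >= rows) return;
  for (int col = threadIdx.x; col < width; col += blockDim.x) {
    atomicAdd(&shared_grad[col], grad_rows[row * width + col]);
  }
}

int main() {
  const int rows = 8, width = 16;
  float h_in[rows * width], h_out[width];
  for (int i = 0; i < rows * width; ++i) h_in[i] = 0.5f;
  float *d_in, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  cudaMemset(d_out, 0, sizeof(h_out));
  accumulateSharedRow<<<rows, 64>>>(d_in, d_out, rows, width);
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  printf("shared_grad[0] = %f\n", h_out[0]);  // expect 8 * 0.5 = 4.0
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}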
paddle/function/CropOpGpu.cu

@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "CropOp.h"
#include "hl_base.h"

namespace paddle {

__global__ void KeCrop(real* outputs, const real* inputs,
                       int inC, int inH, int inW,
                       int cropC, int cropH, int cropW,
                       int outC, int outH, int outW, int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % outW;

@@ -35,12 +43,12 @@ __global__ void KeCrop(real* outputs, const real* inputs,
template <>
void Crop<DEVICE_TYPE_GPU>(real* outputs,
                           const real* inputs,
                           const TensorShape inShape,
                           const TensorShape outShape,
                           const FuncConfig& conf) {
  std::vector<uint32_t> crop_corner =
      conf.get<std::vector<uint32_t>>("crop_corner");
  int cropC = crop_corner[1];
  int cropH = crop_corner[2];
  int cropW = crop_corner[3];

@@ -58,16 +66,33 @@ void Crop<DEVICE_TYPE_GPU>(real* outputs,
  int blockSize = 1024;
  int gridSize = (nth + blockSize - 1) / blockSize;
  KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      outputs, inputs, inC, inH, inW, cropC, cropH, cropW,
      outC, outH, outW, nth);
  CHECK_SYNC("Crop");
}

__global__ void KeCropDiff(const real* inGrad, real* outGrad,
                           int inC, int inH, int inW,
                           int cropC, int cropH, int cropW,
                           int outC, int outH, int outW, int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % inW;

@@ -84,12 +109,12 @@ __global__ void KeCropDiff(const real* inGrad, real* outGrad,
template <>
void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
                               real* outGrad,
                               const TensorShape inShape,
                               const TensorShape outShape,
                               const FuncConfig& conf) {
  std::vector<uint32_t> crop_corner =
      conf.get<std::vector<uint32_t>>("crop_corner");
  int cropC = crop_corner[1];
  int cropH = crop_corner[2];
  int cropW = crop_corner[3];

@@ -107,9 +132,18 @@ void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
  int blockSize = 1024;
  int gridSize = (nth + blockSize - 1) / blockSize;
  KeCropDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      inGrad, outGrad, inC, inH, inW, cropC, cropH, cropW,
      outC, outH, outW, nth);
  CHECK_SYNC("CropGrad");
}
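KeCrop and KeCropDiff give each thread one flat element index and recover the (w, h, c, n) coordinates with successive modulo and divide operations, with the width index varying fastest. A small standalone sketch of that decomposition follows; the kernel name, tensor sizes, and printf output are made up for illustration.

#include <cstdio>
#include <cuda_runtime.h>

// Decompose a flat index into NCHW coordinates the same way KeCrop does:
// w varies fastest, then h, then c, then the sample index n.
__global__ void decodeNCHW(int outC, int outH, int outW, int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % outW;
    const int h = (idx / outW) % outH;
    const int c = (idx / outW / outH) % outC;
    const int n = idx / outW / outH / outC;
    if (idx == nthreads - 1) {
      printf("last element -> n=%d c=%d h=%d w=%d\n", n, c, h, w);
    }
  }
}

int main() {
  int outC = 3, outH = 4, outW = 5, num = 2;
  int nth = num * outC * outH * outW;
  int blockSize = 128;
  int gridSize = (nth + blockSize - 1) / blockSize;  // ceiling division, as in Crop<DEVICE_TYPE_GPU>
  decodeNCHW<<<gridSize, blockSize>>>(outC, outH, outW, nth);
  cudaDeviceSynchronize();
  return 0;
}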
paddle/function/CrossMapNormalOpGpu.cu

@@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "CrossMapNormalOp.h"
#include "hl_base.h"

namespace paddle {

__global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
                                   real* scale, size_t channels,
                                   size_t height, size_t width, size_t size,
                                   real alpha) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < imageSize) {

@@ -51,8 +55,10 @@ __global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
  }
}

__global__ void KeCMRNormOutput(size_t inputSize, const real* in,
                                const real* scale, real negative_beta,
                                real* out) {
  const int index = threadIdx.x + blockIdx.x * blockDim.x;
  if (index < inputSize) {

@@ -74,24 +80,30 @@ void CrossMapNormal<DEVICE_TYPE_GPU>(real* outputs,
  size_t imageSize = numSamples * height * width;
  int blockSize = 1024;
  int gridSize = (imageSize + 1024 - 1) / 1024;
  KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      imageSize, inputs, denoms, channels, height, width, size, scale);

  size_t inputSize = numSamples * height * width * channels;
  blockSize = 1024;
  gridSize = (inputSize + 1024 - 1) / 1024;
  KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      inputSize, inputs, denoms, -pow, outputs);

  CHECK_SYNC("CrossMapNormal");
}

__global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
                              const real* top_data, const real* scale,
                              const real* top_diff, size_t channels,
                              size_t height, size_t width, size_t size,
                              real negative_beta, real cache_ratio,
                              real* bottom_diff) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < imageSize) {
    const int w = idx % width;

@@ -113,17 +125,17 @@ __global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
    while (index < channels + post_pad) {
      if (index < channels) {
        accum += top_diff[index * step] * top_data[index * step] /
                 scale[index * step];
      }
      if (index >= size) {
        accum -= top_diff[(index - size) * step] *
                 top_data[(index - size) * step] / scale[(index - size) * step];
      }
      if (index >= post_pad) {
        bottom_diff[(index - post_pad) * step] +=
            top_diff[(index - post_pad) * step] *
                pow(scale[(index - post_pad) * step], negative_beta) -
            cache_ratio * bottom_data[(index - post_pad) * step] * accum;
      }
      ++index;
    }

@@ -147,9 +159,18 @@ void CrossMapNormalGrad<DEVICE_TYPE_GPU>(real* inputsGrad,
  int blockSize = 1024;
  int gridSize = (imageSize + 1024 - 1) / 1024;
  KeCMRNormDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      imageSize, inputsValue, outputsValue, denoms, outputsGrad, channels,
      height, width, size, -pow, 2.0f * pow * scale, inputsGrad);
  CHECK_SYNC("CrossMapNormalGrad");
}
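KeCMRNormDiff walks the channel dimension once per spatial position and maintains a running sum over a window of `size` channels: the channel entering the window is added and the channel leaving it is subtracted, instead of re-summing the whole window for every output channel. The following sketch shows that sliding-window update in isolation; the kernel, layout, window centering, and host driver are simplifying assumptions for illustration, not the exact Paddle arithmetic.

#include <cstdio>
#include <cuda_runtime.h>

// One thread owns one spatial position and slides a window of `size`
// channels across it: add the channel that enters, subtract the one that
// leaves, and emit the running sum for the channel at the window center.
__global__ void slidingChannelSum(const float* in, float* out,
                                  int channels, int spatial, int size) {
  int pos = threadIdx.x + blockIdx.x * blockDim.x;
  if (pos >= spatial) return;
  int half = size / 2;
  float accum = 0;
  for (int c = 0; c < channels + half; ++c) {
    if (c < channels) accum += in[c * spatial + pos];        // channel enters window
    if (c >= size) accum -= in[(c - size) * spatial + pos];  // channel leaves window
    if (c >= half) out[(c - half) * spatial + pos] = accum;  // window centered at c - half
  }
}

int main() {
  const int channels = 6, spatial = 4, size = 3;
  float h_in[channels * spatial], h_out[channels * spatial];
  for (int i = 0; i < channels * spatial; ++i) h_in[i] = 1.0f;
  float *d_in, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  slidingChannelSum<<<1, 64>>>(d_in, d_out, channels, spatial, size);
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  printf("out[channel 2, pos 0] = %f\n", h_out[2 * spatial + 0]);  // window of 3 ones -> 3.0
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}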
paddle/function/DepthwiseConvOpGpu.cu

@@ -20,17 +20,25 @@ namespace paddle {
// CUDA kernel to compute the depthwise convolution forward pass
template <class T>
__global__ void ConvolutionDepthwiseForward(
    const int nthreads, const T* const inputData, const T* const filterData,
    const int batchSize, const int outputChannels, const int outputHeight,
    const int outputWidth, const int inputChannels, const int inputHeight,
    const int inputWidth, const int filterMultiplier, const int filterHeight,
    const int filterWidth, const int strideH, const int strideW,
    const int paddingH, const int paddingW, T* const outputData) {
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index < nthreads) {
    const int batch = index / outputChannels / outputHeight / outputWidth;

@@ -45,32 +53,36 @@ void ConvolutionDepthwiseForward(const int nthreads,
    const int w_in_start = -paddingW + w_out * strideW;
    const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1;
    const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1;
    if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) &&
        (w_in_end < inputWidth)) {
      for (int kh = 0; kh < filterHeight; ++kh) {
        for (int kw = 0; kw < filterWidth; ++kw) {
          const int h_in = -paddingH + h_out * strideH + kh;
          const int w_in = -paddingW + w_out * strideW + kw;
          const int offset =
              ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth + w_in;
          value += (*weight) * inputData[offset];
          ++weight;
        }
      }
    } else {
      for (int kh = 0; kh < filterHeight; ++kh) {
        for (int kw = 0; kw < filterWidth; ++kw) {
          const int h_in = -paddingH + h_out * strideH + kh;
          const int w_in = -paddingW + w_out * strideW + kw;
          if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
              (w_in < inputWidth)) {
            const int offset =
                ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth + w_in;
            value += (*weight) * inputData[offset];
          }
          ++weight;
        }
      }
    }
    outputData[index] = value;
  }

@@ -78,16 +90,25 @@ void ConvolutionDepthwiseForward(const int nthreads,
// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
template <class T>
__global__ void ConvolutionDepthwiseInputBackward(
    const int nthreads, const T* const top_diff, const T* const weight_data,
    const int num, const int outputChannels, const int outputHeight,
    const int outputWidth, const int inputChannels, const int inputHeight,
    const int inputWidth, const int filterMultiplier, const int filterHeight,
    const int filterWidth, const int strideH, const int strideW,
    const int paddingH, const int paddingW, T* const bottom_diff) {
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index < nthreads) {
    const int batch = index / inputChannels / inputHeight / inputWidth;
    const int c_in = (index / inputHeight / inputWidth) % inputChannels;

@@ -96,65 +117,80 @@ void ConvolutionDepthwiseInputBackward(const int nthreads,
    const int c_out_start = c_in * filterMultiplier;
    int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH;
    h_out_start = 0 > h_out_start ? 0 : h_out_start;
    int h_out_end = (h_in + paddingH) / strideH;
    h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end;
    int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW;
    w_out_start = 0 > w_out_start ? 0 : w_out_start;
    int w_out_end = (w_in + paddingW) / strideW;
    w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end;

    T value = 0;

    for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier; c_out++) {
      for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
        const int filter_h = h_in + paddingH - h_out * strideH;
        for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
          const int filter_w = w_in + paddingW - w_out * strideW;
          const int filter_offset = c_out * filterHeight * filterWidth +
                                    filter_h * filterWidth + filter_w;
          const int top_diff_offset =
              ((batch * outputChannels + c_out) * outputHeight + h_out) * outputWidth + w_out;
          value += top_diff[top_diff_offset] * weight_data[filter_offset];
        }
      }
    }
    bottom_diff[index] += value;
  }
}

// CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
template <class T>
__global__ void ConvolutionDepthwiseFilterBackward(
    const int num_i, const int nthreads,
    const T* const top_diff, const T* const inputData,
    const int num, const int outputChannels, const int outputHeight,
    const int outputWidth, const int inputChannels, const int inputHeight,
    const int inputWidth, const int filterMultiplier, const int filterHeight,
    const int filterWidth, const int strideH, const int strideW,
    const int paddingH, const int paddingW, T* const buffer_data) {
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index < nthreads) {
    const int h_out = (index / outputWidth) % outputHeight;
    const int w_out = index % outputWidth;
    const int kh =
        (index / filterWidth / outputHeight / outputWidth) % filterHeight;
    const int kw = (index / outputHeight / outputWidth) % filterWidth;
    const int h_in = -paddingH + h_out * strideH + kh;
    const int w_in = -paddingW + w_out * strideW + kw;
    if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
        (w_in < inputWidth)) {
      const int c_out =
          index / (filterHeight * filterWidth * outputHeight * outputWidth);
      const int c_in = c_out / filterMultiplier;
      const int batch = num_i;
      const int top_offset =
          ((batch * outputChannels + c_out) * outputHeight + h_out) * outputWidth + w_out;
      const int bottom_offset =
          ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth + w_in;
      buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset];
    } else {
      buffer_data[index] = 0;

@@ -163,170 +199,169 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
  }
}

template <class T>
class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T> {
public:
  void operator()(const T* inputData, const T* filterData,
                  int batchSize, int outputChannels, int outputHeight, int outputWidth,
                  int inputChannels, int inputHeight, int inputWidth,
                  int filterMultiplier, int filterHeight, int filterWidth,
                  int strideH, int strideW, int paddingH, int paddingW,
                  T* outputData) {
    int outputSize = batchSize * outputChannels * outputHeight * outputWidth;

    size_t blocks = (outputSize + 1024 - 1) / 1024;
    size_t blockX = 512;
    size_t blockY = (blocks + 512 - 1) / 512;
    dim3 threads(1024, 1);
    dim3 grid(blockX, blockY);

    ConvolutionDepthwiseForward<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
        outputSize, inputData, filterData, batchSize, outputChannels,
        outputHeight, outputWidth, inputChannels, inputHeight, inputWidth,
        filterMultiplier, filterHeight, filterWidth, strideH, strideW,
        paddingH, paddingW, outputData);
  }
};

template <class T>
class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T> {
public:
  void operator()(const T* outputGrad, const T* filterData,
                  int batchSize, int outputChannels, int outputHeight, int outputWidth,
                  int inputChannels, int inputHeight, int inputWidth,
                  int filterMultiplier, int filterHeight, int filterWidth,
                  int strideH, int strideW, int paddingH, int paddingW,
                  T* inputGrad) {
    int inputSize = batchSize * inputChannels * inputHeight * inputWidth;

    size_t blocks = (inputSize + 1024 - 1) / 1024;
    size_t blockX = 512;
    size_t blockY = (blocks + 512 - 1) / 512;
    dim3 threads(1024, 1);
    dim3 grid(blockX, blockY);

    ConvolutionDepthwiseInputBackward<T>
        // NOLINT_NEXT_LINE(whitespace/operators)
        <<<grid, threads, 0, STREAM_DEFAULT>>>(
            inputSize, outputGrad, filterData, batchSize, outputChannels,
            outputHeight, outputWidth, inputChannels, inputHeight, inputWidth,
            filterMultiplier, filterHeight, filterWidth, strideH, strideW,
            paddingH, paddingW, inputGrad);
  }
};

template <class T>
class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, T> {
public:
  void operator()(const T* outputGrad, const T* inputData,
                  int batchSize, int outputChannels, int outputHeight, int outputWidth,
                  int inputChannels, int inputHeight, int inputWidth,
                  int filterMultiplier, int filterHeight, int filterWidth,
                  int strideH, int strideW, int paddingH, int paddingW,
                  T* colData, T* filterGrad) {
    int colDataSize = outputChannels * filterHeight * filterWidth *
                      outputHeight * outputWidth;

    size_t blocks = (colDataSize + 1024 - 1) / 1024;
    size_t blockX = 512;
    size_t blockY = (blocks + 512 - 1) / 512;
    dim3 threads(1024, 1);
    dim3 grid(blockX, blockY);
    BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth,
                                1, filterGrad, false, true);

    for (int i = 0; i < batchSize; i++) {
      ConvolutionDepthwiseFilterBackward<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
          i, colDataSize, outputGrad, inputData, batchSize, outputChannels,
          outputHeight, outputWidth, inputChannels, inputHeight, inputWidth,
          filterMultiplier, filterHeight, filterWidth, strideH, strideW,
          paddingH, paddingW, colData);
      int K = outputHeight * outputWidth;
      int M = colDataSize / K;

      BaseMatrix colMatrix(M, K, colData, false, true);
      filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0);
    }
  }
};

#ifdef PADDLE_TYPE_DOUBLE
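The depthwise kernels index their work with `(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x`, folding a 2-D grid of blocks into one flat element id so that very large element counts are not limited by a single grid dimension. Below is a minimal sketch of that launch and indexing convention using the same sizing scheme as the functors above; the kernel name, element count, and the status printout are invented for illustration.

#include <cstdio>
#include <cuda_runtime.h>

// Flatten a 2-D grid of blocks plus a 1-D block of threads into one global
// element index, as the depthwise-convolution kernels do.
__global__ void fill(float* data, int n, float value) {
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index < n) data[index] = value;
}

int main() {
  const int n = 3000000;
  // Same sizing scheme as DepthwiseConvFunctor: 1024 threads per block,
  // and the required blocks folded into a (blockX, blockY) grid.
  size_t blocks = (n + 1024 - 1) / 1024;
  size_t blockX = 512;
  size_t blockY = (blocks + 512 - 1) / 512;
  dim3 threads(1024, 1);
  dim3 grid(blockX, blockY);

  float* d;
  cudaMalloc(&d, n * sizeof(float));
  fill<<<grid, threads>>>(d, n, 1.0f);
  cudaError_t err = cudaDeviceSynchronize();  // analogous to CHECK_SYNC
  printf("launch status: %s\n", cudaGetErrorString(err));
  cudaFree(d);
  return 0;
}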
paddle/function/Im2ColOpGpu.cu

@@ -17,16 +17,21 @@ limitations under the License. */
namespace paddle {

template <class T>
__global__ void im2col(const T* data_im, int numOuts, int height, int width,
                       int blockH, int blockW, int strideH, int strideW,
                       int paddingH, int paddingW,
                       int height_col, int width_col, T* data_col) {
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index < numOuts) {
    int w_out = index % width_col;
    index /= width_col;

@@ -39,17 +44,17 @@ void im2col(const T* data_im, int numOuts, int height, int width,
    data_col += (channel_out * height_col + h_out) * width_col + w_out;
    for (int i = 0; i < blockH; ++i) {
      for (int j = 0; j < blockW; ++j) {
        int rIdx = int(h_in + i);
        int cIdx = int(w_in + j);
        if ((rIdx - (int)paddingH) >= (int)height ||
            (rIdx - (int)paddingH) < 0 ||
            (cIdx - (int)paddingW) >= (int)width ||
            (cIdx - (int)paddingW) < 0) {
          *data_col = 0;
        } else {
          rIdx = rIdx + channel_in * height - paddingH;
          cIdx = cIdx - paddingW;
          *data_col = data_im[rIdx * width + cIdx];
        }
        data_col += height_col * width_col;
      }

@@ -82,60 +87,73 @@ public:
    int outputWidth = colShape[4];

    int numKernels = inputChannels * outputHeight * outputWidth;
    int blocks = (numKernels + 1024 - 1) / 1024;
    int blockX = 512;
    int blockY = (blocks + 512 - 1) / 512;
    dim3 threads(1024, 1);
    dim3 grid(blockX, blockY);
    im2col<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
        imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth,
        strideHeight, strideWidth, paddingHeight, paddingWidth,
        outputHeight, outputWidth, colData);
    CHECK_SYNC("Im2ColFunctor GPU failed");
  }
};

template <class T>
__global__ void col2im(size_t n, const T* data_col, size_t height, size_t width,
                       size_t channels, size_t blockH, size_t blockW,
                       size_t strideH, size_t strideW,
                       size_t paddingH, size_t paddingW,
                       size_t height_col, size_t width_col, T* data_im) {
  size_t index =
      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index < n) {
    T val = 0;
    int w = int(index % width);
    int h = int((index / width) % height);
    int c = int(index / (width * height));
    if ((w - (int)paddingW) >= 0 &&
        (w - (int)paddingW) < (width - 2 * paddingW) &&
        (h - (int)paddingH) >= 0 &&
        (h - paddingH) < (height - 2 * paddingH)) {
      // compute the start and end of the output
      int w_col_start =
          (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
      int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
      int h_col_start =
          (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
      int h_col_end = min(int(h / strideH + 1), int(height_col));
      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
          // the col location: [c * width * height + h_out, w_out]
          int c_col = int(c * blockH * blockW) +
                      (h - h_col * (int)strideH) * (int)blockW +
                      (w - w_col * (int)strideW);
          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
        }
      }
      h -= paddingH;
      w -= paddingW;
      data_im[c * ((width - 2 * paddingW) * (height - 2 * paddingH)) +
              h * (width - 2 * paddingW) + w] += val;
    }
  }
}

@@ -164,32 +182,32 @@ public:
    int outputHeight = colShape[3];
    int outputWidth = colShape[4];

    size_t numKernels = inputChannels * (inputHeight + 2 * paddingHeight) *
                        (inputWidth + 2 * paddingWidth);
    size_t blocks = (numKernels + 1024 - 1) / 1024;
    size_t blockX = 512;
    size_t blockY = (blocks + 512 - 1) / 512;
    dim3 threads(1024, 1);
    dim3 grid(blockX, blockY);

    // To avoid involving atomic operations, we will launch one kernel per
    // bottom dimension, and then in the kernel add up the top dimensions.
    col2im<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
        numKernels, colData, inputHeight + 2 * paddingHeight,
        inputWidth + 2 * paddingWidth, inputChannels, filterHeight, filterWidth,
        strideHeight, strideWidth, paddingHeight, paddingWidth,
        outputHeight, outputWidth, imData);
    CHECK_SYNC("Col2ImFunctor GPU failed");
  }
};

@@ -199,31 +217,35 @@ template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, double>;
template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, float>;
template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, double>;

template <class T>
__global__ void im2colOCF(const T* imData, T* colData,
                          int inputChannels, int inputHeight, int inputWidth,
                          int filterHeight, int filterWidth,
                          int strideHeight, int strideWidth,
                          int paddingHeight, int paddingWidth,
                          int outputHeight, int outputWidth) {
  int swId = blockIdx.x;
  int shId = blockIdx.y;
  for (int channelId = threadIdx.z; channelId < inputChannels;
       channelId += blockDim.z) {
    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
        int widthOffset = idx + swId * strideWidth - paddingWidth;
        int heightOffset = idy + shId * strideHeight - paddingHeight;
        int imOffset = widthOffset + heightOffset * inputWidth +
                       channelId * inputHeight * inputWidth;
        int colOffset = idx + idy * filterWidth +
                        channelId * filterHeight * filterWidth +
                        (shId * outputWidth + swId) *
                            (inputChannels * filterHeight * filterWidth);
        if (heightOffset >= inputHeight || heightOffset < 0 ||
            widthOffset >= inputWidth || widthOffset < 0) {

@@ -279,39 +301,52 @@ public:
    int blockDimZ = 1024 / blockDimX / blockDimY;
    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
    dim3 grid(outputWidth, outputHeight);
    im2colOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
        imData, colData, inputChannels, inputHeight, inputWidth,
        filterHeight, filterWidth, strideHeight, strideWidth,
        paddingHeight, paddingWidth, outputHeight, outputWidth);
    CHECK_SYNC("Im2ColFunctor GPU failed");
  }
};

template <class T>
__global__ void col2imOCF(T* imData, const T* colData,
                          int inputChannels, int inputHeight, int inputWidth,
                          int filterHeight, int filterWidth,
                          int strideHeight, int strideWidth,
                          int paddingHeight, int paddingWidth,
                          int outputHeight, int outputWidth) {
  int swId = blockIdx.x;
  int shId = blockIdx.y;
  for (int channelId = threadIdx.z; channelId < inputChannels;
       channelId += blockDim.z) {
    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
        int widthOffset = idx + swId * strideWidth - paddingWidth;
        int heightOffset = idy + shId * strideHeight - paddingHeight;
        int imOffset = widthOffset + heightOffset * inputWidth +
                       channelId * inputHeight * inputWidth;
        int colOffset = idx + idy * filterWidth +
                        channelId * filterHeight * filterWidth +
                        (shId * outputWidth + swId) *
                            (inputChannels * filterHeight * filterWidth);
        if (heightOffset >= 0 && heightOffset < inputHeight &&
            widthOffset >= 0 && widthOffset < inputWidth) {

@@ -365,10 +400,19 @@ public:
    int blockDimZ = 1024 / blockDimX / blockDimY;
    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
    dim3 grid(outputWidth, outputHeight);
    col2imOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
        imData, colData, inputChannels, inputHeight, inputWidth,
        filterHeight, filterWidth, strideHeight, strideWidth,
        paddingHeight, paddingWidth, outputHeight, outputWidth);
    CHECK_SYNC("Col2ImFunctor GPU failed");
  }
};
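The comment in Col2ImFunctor explains the gather strategy: one thread is launched per image ("bottom") element and each thread sums every column-buffer ("top") entry that refers to it, so no two threads write the same output and no atomics are needed even though each pixel belongs to several overlapping patches. Here is a 1-D analogue of that gather-style col2im; the kernel name, patch layout, and host driver are illustrative assumptions, not the Paddle implementation.

#include <cstdio>
#include <cuda_runtime.h>

// 1-D gather-style col2im: patch p covers image indices [p*stride, p*stride + k).
// Each thread owns one image element and sums every column entry that maps to it.
__global__ void col2im1d(const float* col, float* im,
                         int width, int k, int stride, int patches) {
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i >= width) return;
  float val = 0;
  int p_start = (i < k) ? 0 : (i - k) / stride + 1;  // first patch still covering i
  for (int p = p_start; p < patches && p * stride <= i; ++p) {
    val += col[p * k + (i - p * stride)];
  }
  im[i] = val;  // exclusive write: no atomics required
}

int main() {
  const int width = 10, k = 3, stride = 1;
  const int patches = (width - k) / stride + 1;            // 8 patches
  float h_col[patches * k], h_im[width];
  for (int i = 0; i < patches * k; ++i) h_col[i] = 1.0f;   // every entry contributes 1
  float *d_col, *d_im;
  cudaMalloc(&d_col, sizeof(h_col));
  cudaMalloc(&d_im, sizeof(h_im));
  cudaMemcpy(d_col, h_col, sizeof(h_col), cudaMemcpyHostToDevice);
  col2im1d<<<1, 32>>>(d_col, d_im, width, k, stride, patches);
  cudaMemcpy(h_im, d_im, sizeof(h_im), cudaMemcpyDeviceToHost);
  printf("im[0]=%.0f im[4]=%.0f\n", h_im[0], h_im[4]);     // edge pixel: 1, interior pixel: 3
  cudaFree(d_col);
  cudaFree(d_im);
  return 0;
}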
paddle/function/MulOpGpu.cu

@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "MulOp.h"
#include "hl_base.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/SparseMatrix.h"
paddle/function/PadOpGpu.cu
浏览文件 @
1d4fa243
...
@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "PadOp.h"
#include "hl_base.h"

namespace paddle {

__global__ void KePad(real* outputs, const real* inputs,
                      int inC, int inH, int inW,
                      int padc, int padh, int padw,
                      int outC, int outH, int outW, int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % inW;
...
@@ -50,16 +58,33 @@ void Pad<DEVICE_TYPE_GPU>(real* outputs,
  int outC = inC + cstart + cend;
  int outH = inH + hstart + hend;
  int outW = inW + wstart + wend;
  KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      outputs, inputs, inC, inH, inW, cstart, hstart, wstart,
      outC, outH, outW, nth);
  CHECK_SYNC("Pad");
}

__global__ void KePadDiff(real* inGrad, const real* outGrad,
                          int inC, int inH, int inW,
                          int padc, int padh, int padw,
                          int outC, int outH, int outW, int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % inW;
...
@@ -89,9 +114,18 @@ void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
  int outC = inC + cstart + cend;
  int outH = inH + hstart + hend;
  int outW = inW + wstart + wend;
  KePadDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      inGrad, outGrad, inC, inH, inW, cstart, hstart, wstart,
      outC, outH, outW, nth);
  CHECK_SYNC("PadGrad");
}
...
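As a rough CPU analogue of what one KePad thread computes (assumed semantics inferred from the parameter names above, simplified to a single sample; padCpu and its memory layout are illustrative, not code from this commit):

// Copy each input element (c, h, w) to (c + padc, h + padh, w + padw) in a
// zero-initialized padded output; the GPU kernel does the same work, one
// element per thread indexed by idx.
void padCpu(float* out, const float* in, int inC, int inH, int inW,
            int padc, int padh, int padw, int outC, int outH, int outW) {
  for (int c = 0; c < inC; ++c) {
    for (int h = 0; h < inH; ++h) {
      for (int w = 0; w < inW; ++w) {
        int src = (c * inH + h) * inW + w;
        int dst = ((c + padc) * outH + (h + padh)) * outW + (w + padw);
        out[dst] = in[src];
      }
    }
  }
}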
paddle/function/RowConvOpGpu.cu
...
@@ -12,16 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "RowConvOp.h"
#include "hl_base.h"

namespace paddle {

template <int BLOCK_H, int BLOCK_W>
__global__ void KeRowConv(real* y, const real* x, const real* w,
                          const int* starts, const int height, const int width,
                          const int numSeq, const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int blky = blockDim.y;
...
@@ -30,7 +34,7 @@ __global__ void KeRowConv(real* y, const real* x, const real* w,
  __shared__ real sw[BLOCK_H][BLOCK_W];

  for (int i = tidy; i < context; i += blky) {
    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
  }

  __syncthreads();
...
@@ -56,9 +60,14 @@ __global__ void KeRowConv(real* y, const real* x, const real* w,
    }
  }
}

__global__ void KeRowConv2(real* y, const real* x, const real* w,
                           const int* starts, const int height,
                           const int width, const int numSeq,
                           const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int blky = blockDim.y;
...
@@ -84,8 +93,6 @@ __global__ void KeRowConv2(real* y, const real* x, const real* w,
  }
}

template <>
void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
                              const GpuMatrix& in,
...
@@ -105,21 +112,24 @@ void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);

  if (contextLength <= 32) {
    KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
        y, x, w, starts, height, width, numSeq, contextLength);
  } else {
    KeRowConv2<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
        y, x, w, starts, height, width, numSeq, contextLength);
  }
  CHECK_SYNC("RowConv");
}

template <int BLOCK_H, int BLOCK_W, int CONTEXT>
__global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
                                  const int* starts, const int height,
                                  const int width, const int numSeq,
                                  const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int blky = blockDim.y;
...
@@ -138,21 +148,21 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
    const int start = starts[i];
    const int end = starts[i + 1];
    const int steps = end - start;
    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
    for (int j = tidy; j < size; j += BLOCK_H) {
      int xoff = gidx + tidx;
      int yoff = start + j;

      // transpose
      sh_x[tidx][tidy] =
          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
      sh_dy[tidx][tidy + context - 1] =
          (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
      __syncthreads();
      if (tidy < (context - 1)) {
        yoff = yoff - context + 1;
        sh_dy[tidx][tidy] =
            (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
      }
      __syncthreads();
...
@@ -179,11 +189,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
    }
  }
}

template <int BLOCK_H, int BLOCK_W>
__global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
                                   const int* starts, const int height,
                                   const int width, const int numSeq,
                                   const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int gidx = blockIdx.x * blockDim.x;
...
@@ -196,19 +210,21 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
    const int end = starts[i + 1];
    const int steps = end - start;
    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
    for (int j = tidy; j < size; j += BLOCK_H) {
      int xoff = gidx + tidx;
      int yoff = start + j;

      // transpose
      sh_x[tidx][tidy] =
          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
      __syncthreads();

      for (int t = 0; t < context; t++) {
        sh_dy[tidx][tidy] =
            (xoff < width && (yoff - t) >= start && yoff - t < end)
                ? dy[(yoff - t) * width + xoff]
                : 0.0;
        __syncthreads();

        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
...
@@ -222,18 +238,22 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
        __syncthreads();

        if (tidx == 0 && (gidx + tidy) < width) {
          dw[t * width + gidx + tidy] += val;
        }
      }
    }
  }
}

template <int BLOCK_H, int BLOCK_W>
__global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
                                const int* starts, const int height,
                                const int width, const int numSeq,
                                const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int blky = blockDim.y;
...
@@ -242,7 +262,7 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
  __shared__ real sw[BLOCK_H][BLOCK_W];

  for (int i = tidy; i < context; i += blky) {
    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
  }

  __syncthreads();
...
@@ -266,10 +286,14 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
  }
}

__global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy,
                                 const int* starts, const int height,
                                 const int width, const int numSeq,
                                 const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int blky = blockDim.y;
...
@@ -295,14 +319,13 @@ __global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy,
  }
}

template <>
void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
                                  const GpuMatrix& in,
                                  const GpuMatrix& filter,
                                  GpuMatrix& inG,
                                  GpuMatrix& filterG,
                                  const GpuIVector& seq) {
  const size_t numSeq = seq.getSize() - 1;
  const size_t contextLength = filter.getHeight();
  const size_t height = in.getHeight();
...
@@ -318,13 +341,11 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
  real* dw = filterG.getData();
  if (contextLength <= 32) {
    KeRowConvBwWeight<32, 32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
        dw, x, dy, starts, height, width, numSeq, contextLength);
  } else {
    KeRowConvBwWeight2<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
        dw, x, dy, starts, height, width, numSeq, contextLength);
  }
...
@@ -333,13 +354,11 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
  dim3 dimBlock2(32, 32);
  dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
  if (contextLength <= 64) {
    KeRowConvBwData<32, 64><<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
        dx, w, dy, starts, height, width, numSeq, contextLength);
  } else {
    KeRowConvBwData2<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
        dx, w, dy, starts, height, width, numSeq, contextLength);
  }
}
...
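For orientation, a CPU reference of the lookahead (row) convolution that KeRowConv and KeRowConv2 compute per sequence. This is an assumed reading of the kernels above, simplified: details such as whether the output is overwritten or accumulated may differ, and rowConvCpu is illustrative rather than code from this commit.

// y[t][d] = sum over c in [0, context) of w[c][d] * x[t + c][d], with the
// sum clipped at the end of the current sequence as given by `starts`.
void rowConvCpu(float* y, const float* x, const float* w, const int* starts,
                int numSeq, int width, int context) {
  for (int i = 0; i < numSeq; ++i) {
    for (int t = starts[i]; t < starts[i + 1]; ++t) {
      for (int d = 0; d < width; ++d) {
        float sum = 0.0f;
        for (int c = 0; c < context && t + c < starts[i + 1]; ++c) {
          sum += w[c * width + d] * x[(t + c) * width + d];
        }
        y[t * width + d] = sum;
      }
    }
  }
}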
paddle/gserver/layers/GruCompute.cu
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "GruCompute.h"
#include "hl_recurrent_apply.cuh"
...
@@ -31,8 +30,10 @@ void GruCompute::forward<1>(hl_gru_value value, int frameSize, int batchSize) {
}

template <>
void GruCompute::backward<1>(hl_gru_value value, hl_gru_grad grad,
                             int frameSize, int batchSize) {
  hl_gpu_gru_backward(hppl::backward::gru_stateGrad(),
                      hppl::backward::gru_resetGrad(),
                      value,
...
paddle/gserver/layers/LstmCompute.cu
...
@@ -12,41 +12,62 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "LstmCompute.h"
#include "hl_recurrent_apply.cuh"

namespace paddle {

template <>
void LstmCompute::forwardBatch<1>(hl_lstm_value value, int frameSize,
                                  int batchSize) {
  hl_gpu_lstm_forward(hppl::forward::lstm(), value, frameSize, batchSize,
                      activeNode_, activeGate_, activeState_);
}

template <>
void LstmCompute::backwardBatch<1>(hl_lstm_value value, hl_lstm_grad grad,
                                   int frameSize, int batchSize) {
  hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad, frameSize,
                       batchSize, activeNode_, activeGate_, activeState_);
}

template <>
void LstmCompute::forwardOneSequence<1>(hl_lstm_value value, int frameSize) {
  hl_gpu_lstm_forward(hppl::forward::lstm(), value, frameSize,
                      /* batchSize */ 1, activeNode_, activeGate_,
                      activeState_);
}

template <>
void LstmCompute::backwardOneSequence<1>(hl_lstm_value value,
                                         hl_lstm_grad grad, int frameSize) {
  hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad, frameSize,
                       /* batchSize */ 1, activeNode_, activeGate_,
                       activeState_);
}

}  // namespace paddle
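The explicit <1> in the GruCompute and LstmCompute specializations above follows a device-dispatch pattern. A minimal standalone illustration of that pattern (an assumption about the convention, in which 0 would select the CPU path and 1 the GPU path; this toy is not Paddle code):

#include <cstdio>

// The integer template parameter picks the implementation at compile time,
// mirroring how forwardBatch<1> above is the GPU-only instantiation.
template <int useGpu>
void forwardBatch(int frameSize, int batchSize);

template <>
void forwardBatch<0>(int frameSize, int batchSize) {
  std::printf("CPU path: frameSize=%d batchSize=%d\n", frameSize, batchSize);
}

template <>
void forwardBatch<1>(int frameSize, int batchSize) {
  std::printf("GPU path: frameSize=%d batchSize=%d\n", frameSize, batchSize);
}

int main() {
  forwardBatch<0>(64, 8);
  forwardBatch<1>(64, 8);
  return 0;
}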
paddle/math/BaseMatrix.cu
...
@@ -12,21 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,21 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include <cmath>
#include <string.h>
#include <paddle/utils/Logging.h>
#include <paddle/utils/Logging.h>
#include <string.h>
#include <cmath>
#include "BaseMatrix.h"
#include "BaseMatrix.h"
#include "hl_matrix_ops.cuh"
#include "hl_matrix_base.cuh"
#include "hl_matrix_apply.cuh"
#include "SIMDFunctions.h"
#include "MathFunctions.h"
#include "MathFunctions.h"
#include "SIMDFunctions.h"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_base.cuh"
#include "hl_matrix_ops.cuh"
namespace
paddle
{
namespace
paddle
{
const
char
*
SPARSE_SUPPORT_ERROR
=
"Sparse Matrix/Vector is not supported."
;
const
char
*
SPARSE_SUPPORT_ERROR
=
"Sparse Matrix/Vector is not supported."
;
template
<
class
T
>
template
<
class
T
>
template
<
class
Op
>
template
<
class
Op
>
int
BaseMatrixT
<
T
>::
applyUnary
(
Op
op
)
{
int
BaseMatrixT
<
T
>::
applyUnary
(
Op
op
)
{
MatrixOffset
offset
(
0
,
0
);
MatrixOffset
offset
(
0
,
0
);
...
@@ -34,9 +34,11 @@ int BaseMatrixT<T>::applyUnary(Op op) {
...
@@ -34,9 +34,11 @@ int BaseMatrixT<T>::applyUnary(Op op) {
return
0
;
return
0
;
}
}
template
<
class
T
>
template
<
class
T
>
template
<
class
Op
>
template
<
class
Op
>
int
BaseMatrixT
<
T
>::
applyUnary
(
Op
op
,
int
numRows
,
int
numCols
,
int
BaseMatrixT
<
T
>::
applyUnary
(
Op
op
,
int
numRows
,
int
numCols
,
MatrixOffset
&
offset
)
{
MatrixOffset
&
offset
)
{
CHECK
(
!
this
->
isSparse
())
<<
SPARSE_SUPPORT_ERROR
;
CHECK
(
!
this
->
isSparse
())
<<
SPARSE_SUPPORT_ERROR
;
int
dimM
=
numRows
;
int
dimM
=
numRows
;
...
@@ -56,7 +58,7 @@ int BaseMatrixT<T>::applyUnary(Op op, int numRows, int numCols,
...
@@ -56,7 +58,7 @@ int BaseMatrixT<T>::applyUnary(Op op, int numRows, int numCols,
return
0
;
return
0
;
}
}
template
<
class
T
>
template
<
class
T
>
template
<
class
Op
>
template
<
class
Op
>
int
BaseMatrixT
<
T
>::
applyBinary
(
Op
op
,
BaseMatrixT
&
b
)
{
int
BaseMatrixT
<
T
>::
applyBinary
(
Op
op
,
BaseMatrixT
&
b
)
{
CHECK
(
height_
==
b
.
height_
&&
width_
==
b
.
width_
)
CHECK
(
height_
==
b
.
height_
&&
width_
==
b
.
width_
)
...
@@ -67,18 +69,23 @@ int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b) {
...
@@ -67,18 +69,23 @@ int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b) {
return
0
;
return
0
;
}
}
template
<
class
T
>
template
<
class
T
>
template
<
class
Op
>
template
<
class
Op
>
int
BaseMatrixT
<
T
>::
applyBinary
(
Op
op
,
BaseMatrixT
&
b
,
int
numRows
,
int
numCols
,
int
BaseMatrixT
<
T
>::
applyBinary
(
MatrixOffset
&
offset
)
{
Op
op
,
BaseMatrixT
&
b
,
int
numRows
,
int
numCols
,
MatrixOffset
&
offset
)
{
applyBinary
(
op
,
b
,
numRows
,
numCols
,
offset
,
false_type
(),
false_type
());
applyBinary
(
op
,
b
,
numRows
,
numCols
,
offset
,
false_type
(),
false_type
());
return
0
;
return
0
;
}
}
template
<
class
T
>
template
<
class
T
>
template
<
class
Op
,
class
bAsRowVector
,
class
bAsColVector
>
template
<
class
Op
,
class
bAsRowVector
,
class
bAsColVector
>
int
BaseMatrixT
<
T
>::
applyBinary
(
Op
op
,
BaseMatrixT
&
b
,
int
numRows
,
int
numCols
,
int
BaseMatrixT
<
T
>::
applyBinary
(
Op
op
,
MatrixOffset
&
offset
,
bAsRowVector
,
bAsColVector
)
{
BaseMatrixT
&
b
,
int
numRows
,
int
numCols
,
MatrixOffset
&
offset
,
bAsRowVector
,
bAsColVector
)
{
CHECK
(
!
this
->
isSparse
())
<<
SPARSE_SUPPORT_ERROR
;
CHECK
(
!
this
->
isSparse
())
<<
SPARSE_SUPPORT_ERROR
;
CHECK
(
!
b
.
isSparse
())
<<
SPARSE_SUPPORT_ERROR
;
CHECK
(
!
b
.
isSparse
())
<<
SPARSE_SUPPORT_ERROR
;
CHECK
(
useGpu_
==
b
.
useGpu_
)
<<
"Matrix type mismatch"
;
CHECK
(
useGpu_
==
b
.
useGpu_
)
<<
"Matrix type mismatch"
;
...
@@ -91,8 +98,8 @@ int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols,
...
@@ -91,8 +98,8 @@ int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols,
T
*
A
=
data_
;
T
*
A
=
data_
;
T
*
B
=
b
.
data_
;
T
*
B
=
b
.
data_
;
CAL_MATRIX_START_ADDRESS
(
A
,
height_
,
width_
,
lda
,
offset
.
aCol_
,
offset
.
aRow_
);
CAL_MATRIX_START_ADDRESS
(
A
,
height_
,
width_
,
lda
,
offset
.
aCol_
,
offset
.
aRow_
);
CAL_MATRIX_START_ADDRESS
(
B
,
b
.
height_
,
b
.
width_
,
ldb
,
offset
.
bCol_
,
CAL_MATRIX_START_ADDRESS
(
offset
.
bRow_
);
B
,
b
.
height_
,
b
.
width_
,
ldb
,
offset
.
bCol_
,
offset
.
bRow_
);
CHECK_LE
(
dimM
+
offset
.
aRow_
,
this
->
height_
);
CHECK_LE
(
dimM
+
offset
.
aRow_
,
this
->
height_
);
CHECK_LE
(
dimN
+
offset
.
aCol_
,
this
->
width_
);
CHECK_LE
(
dimN
+
offset
.
aCol_
,
this
->
width_
);
if
(
!
bAsRowVector
::
value
&&
!
bAsColVector
::
value
)
{
if
(
!
bAsRowVector
::
value
&&
!
bAsColVector
::
value
)
{
...
@@ -115,7 +122,7 @@ int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols,
...
@@ -115,7 +122,7 @@ int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols,
return
0
;
return
0
;
}
}
template
<
class
T
>
template
<
class
T
>
template
<
class
Op
>
template
<
class
Op
>
int
BaseMatrixT
<
T
>::
applyTernary
(
Op
op
,
BaseMatrixT
&
b
,
BaseMatrixT
&
c
)
{
int
BaseMatrixT
<
T
>::
applyTernary
(
Op
op
,
BaseMatrixT
&
b
,
BaseMatrixT
&
c
)
{
CHECK_EQ
(
height_
,
b
.
height_
);
CHECK_EQ
(
height_
,
b
.
height_
);
...
@@ -129,21 +136,29 @@ int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) {
...
@@ -129,21 +136,29 @@ int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) {
return
0
;
return
0
;
}
}
template
<
class
T
>
template
<
class
T
>
template
<
class
Op
>
template
<
class
Op
>
int
BaseMatrixT
<
T
>::
applyTernary
(
Op
op
,
BaseMatrixT
&
b
,
BaseMatrixT
&
c
,
int
BaseMatrixT
<
T
>::
applyTernary
(
Op
op
,
int
numRows
,
int
numCols
,
BaseMatrixT
&
b
,
BaseMatrixT
&
c
,
int
numRows
,
int
numCols
,
MatrixOffset
&
offset
)
{
MatrixOffset
&
offset
)
{
applyTernary
(
op
,
b
,
c
,
numRows
,
numCols
,
offset
,
false_type
(),
false_type
());
applyTernary
(
op
,
b
,
c
,
numRows
,
numCols
,
offset
,
false_type
(),
false_type
());
return
0
;
return
0
;
}
}
template
<
class
T
>
template
<
class
T
>
template
<
class
Op
,
class
cAsRowVector
,
class
cAsColVector
>
template
<
class
Op
,
class
cAsRowVector
,
class
cAsColVector
>
int
BaseMatrixT
<
T
>::
applyTernary
(
Op
op
,
BaseMatrixT
&
b
,
BaseMatrixT
&
c
,
int
BaseMatrixT
<
T
>::
applyTernary
(
Op
op
,
int
numRows
,
int
numCols
,
MatrixOffset
&
offset
,
BaseMatrixT
&
b
,
cAsRowVector
,
cAsColVector
)
{
BaseMatrixT
&
c
,
int
numRows
,
int
numCols
,
MatrixOffset
&
offset
,
cAsRowVector
,
cAsColVector
)
{
CHECK
(
!
this
->
isSparse
())
<<
SPARSE_SUPPORT_ERROR
;
CHECK
(
!
this
->
isSparse
())
<<
SPARSE_SUPPORT_ERROR
;
CHECK
(
!
b
.
isSparse
())
<<
SPARSE_SUPPORT_ERROR
;
CHECK
(
!
b
.
isSparse
())
<<
SPARSE_SUPPORT_ERROR
;
CHECK
(
!
c
.
isSparse
())
<<
SPARSE_SUPPORT_ERROR
;
CHECK
(
!
c
.
isSparse
())
<<
SPARSE_SUPPORT_ERROR
;
...
@@ -160,10 +175,10 @@ int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c,
...
@@ -160,10 +175,10 @@ int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c,
T
*
B
=
b
.
data_
;
T
*
B
=
b
.
data_
;
T
*
C
=
c
.
data_
;
T
*
C
=
c
.
data_
;
CAL_MATRIX_START_ADDRESS
(
A
,
height_
,
width_
,
lda
,
offset
.
aCol_
,
offset
.
aRow_
);
CAL_MATRIX_START_ADDRESS
(
A
,
height_
,
width_
,
lda
,
offset
.
aCol_
,
offset
.
aRow_
);
CAL_MATRIX_START_ADDRESS
(
B
,
b
.
height_
,
b
.
width_
,
ldb
,
offset
.
bCol_
,
CAL_MATRIX_START_ADDRESS
(
offset
.
bRow_
);
B
,
b
.
height_
,
b
.
width_
,
ldb
,
offset
.
bCol_
,
offset
.
bRow_
);
CAL_MATRIX_START_ADDRESS
(
C
,
c
.
height_
,
c
.
width_
,
ldc
,
offset
.
cCol_
,
CAL_MATRIX_START_ADDRESS
(
offset
.
cRow_
);
C
,
c
.
height_
,
c
.
width_
,
ldc
,
offset
.
cCol_
,
offset
.
cRow_
);
CHECK_LE
(
dimM
+
offset
.
aRow_
,
this
->
height_
);
CHECK_LE
(
dimM
+
offset
.
aRow_
,
this
->
height_
);
CHECK_LE
(
dimN
+
offset
.
aCol_
,
this
->
width_
);
CHECK_LE
(
dimN
+
offset
.
aCol_
,
this
->
width_
);
...
@@ -180,21 +195,21 @@ int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c,
...
@@ -180,21 +195,21 @@ int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c,
}
}
if
(
true
==
useGpu_
)
{
if
(
true
==
useGpu_
)
{
hl_gpu_apply_ternary_op
hl_gpu_apply_ternary_op
<
T
,
Op
,
cAsRowVector
::
value
,
cAsColVector
::
value
>
(
<
T
,
Op
,
cAsRowVector
::
value
,
cAsColVector
::
value
>
(
op
,
A
,
B
,
C
,
dimM
,
dimN
,
lda
,
ldb
,
ldc
);
op
,
A
,
B
,
C
,
dimM
,
dimN
,
lda
,
ldb
,
ldc
);
}
else
{
}
else
{
hl_cpu_apply_ternary_op
hl_cpu_apply_ternary_op
<
T
,
Op
,
cAsRowVector
::
value
,
cAsColVector
::
value
>
(
<
T
,
Op
,
cAsRowVector
::
value
,
cAsColVector
::
value
>
(
op
,
A
,
B
,
C
,
dimM
,
dimN
,
lda
,
ldb
,
ldc
);
op
,
A
,
B
,
C
,
dimM
,
dimN
,
lda
,
ldb
,
ldc
);
}
}
return
0
;
return
0
;
}
}
template
<
class
T
>
template
<
class
T
>
template
<
class
Op
>
template
<
class
Op
>
int
BaseMatrixT
<
T
>::
applyQuaternary
(
Op
op
,
BaseMatrixT
&
b
,
BaseMatrixT
&
c
,
int
BaseMatrixT
<
T
>::
applyQuaternary
(
Op
op
,
BaseMatrixT
&
b
,
BaseMatrixT
&
c
,
BaseMatrixT
&
d
)
{
BaseMatrixT
&
d
)
{
CHECK_EQ
(
height_
,
b
.
height_
);
CHECK_EQ
(
height_
,
b
.
height_
);
CHECK_EQ
(
width_
,
b
.
width_
);
CHECK_EQ
(
width_
,
b
.
width_
);
...
@@ -209,10 +224,14 @@ int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
...
@@ -209,10 +224,14 @@ int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
return
0
;
return
0
;
}
}
template
<
class
T
>
template
<
class
T
>
template
<
class
Op
>
template
<
class
Op
>
int
BaseMatrixT
<
T
>::
applyQuaternary
(
Op
op
,
BaseMatrixT
&
b
,
BaseMatrixT
&
c
,
int
BaseMatrixT
<
T
>::
applyQuaternary
(
Op
op
,
BaseMatrixT
&
d
,
int
numRows
,
int
numCols
,
BaseMatrixT
&
b
,
BaseMatrixT
&
c
,
BaseMatrixT
&
d
,
int
numRows
,
int
numCols
,
MatrixOffset
&
offset
)
{
MatrixOffset
&
offset
)
{
CHECK
(
!
this
->
isSparse
())
<<
SPARSE_SUPPORT_ERROR
;
CHECK
(
!
this
->
isSparse
())
<<
SPARSE_SUPPORT_ERROR
;
CHECK
(
!
b
.
isSparse
())
<<
SPARSE_SUPPORT_ERROR
;
CHECK
(
!
b
.
isSparse
())
<<
SPARSE_SUPPORT_ERROR
;
...
@@ -234,12 +253,12 @@ int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
...
@@ -234,12 +253,12 @@ int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
T
*
C
=
c
.
data_
;
T
*
C
=
c
.
data_
;
T
*
D
=
d
.
data_
;
T
*
D
=
d
.
data_
;
CAL_MATRIX_START_ADDRESS
(
A
,
height_
,
width_
,
lda
,
offset
.
aCol_
,
offset
.
aRow_
);
CAL_MATRIX_START_ADDRESS
(
A
,
height_
,
width_
,
lda
,
offset
.
aCol_
,
offset
.
aRow_
);
CAL_MATRIX_START_ADDRESS
(
B
,
b
.
height_
,
b
.
width_
,
ldb
,
offset
.
bCol_
,
CAL_MATRIX_START_ADDRESS
(
offset
.
bRow_
);
B
,
b
.
height_
,
b
.
width_
,
ldb
,
offset
.
bCol_
,
offset
.
bRow_
);
CAL_MATRIX_START_ADDRESS
(
C
,
c
.
height_
,
c
.
width_
,
ldc
,
offset
.
cCol_
,
CAL_MATRIX_START_ADDRESS
(
offset
.
cRow_
);
C
,
c
.
height_
,
c
.
width_
,
ldc
,
offset
.
cCol_
,
offset
.
cRow_
);
CAL_MATRIX_START_ADDRESS
(
D
,
d
.
height_
,
d
.
width_
,
ldd
,
offset
.
dCol_
,
CAL_MATRIX_START_ADDRESS
(
offset
.
dRow_
);
D
,
d
.
height_
,
d
.
width_
,
ldd
,
offset
.
dCol_
,
offset
.
dRow_
);
CHECK_LE
(
dimM
+
offset
.
aRow_
,
this
->
height_
);
CHECK_LE
(
dimM
+
offset
.
aRow_
,
this
->
height_
);
CHECK_LE
(
dimN
+
offset
.
aCol_
,
this
->
width_
);
CHECK_LE
(
dimN
+
offset
.
aCol_
,
this
->
width_
);
...
@@ -250,22 +269,29 @@ int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
...
@@ -250,22 +269,29 @@ int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
CHECK_LE
(
dimM
+
offset
.
dRow_
,
d
.
height_
);
CHECK_LE
(
dimM
+
offset
.
dRow_
,
d
.
height_
);
CHECK_LE
(
dimN
+
offset
.
dCol_
,
d
.
width_
);
CHECK_LE
(
dimN
+
offset
.
dCol_
,
d
.
width_
);
if
(
true
==
useGpu_
)
{
if
(
true
==
useGpu_
)
{
hl_gpu_apply_quaternary_op
(
op
,
A
,
B
,
C
,
D
,
dimM
,
dimN
,
lda
,
ldb
,
hl_gpu_apply_quaternary_op
(
op
,
A
,
B
,
C
,
D
,
dimM
,
dimN
,
lda
,
ldb
,
ldc
,
ldd
);
ldc
,
ldd
);
}
else
{
}
else
{
hl_cpu_apply_quaternary_op
(
op
,
A
,
B
,
C
,
D
,
dimM
,
dimN
,
lda
,
ldb
,
hl_cpu_apply_quaternary_op
(
op
,
A
,
B
,
C
,
D
,
dimM
,
dimN
,
lda
,
ldb
,
ldc
,
ldd
);
ldc
,
ldd
);
}
}
return
0
;
return
0
;
}
}
template
<
class
T
>
template
<
class
T
>
template
<
class
Agg
,
class
Op
,
class
Saver
,
class
aAsRowVector
,
template
<
class
Agg
,
class
Op
,
class
Saver
,
class
aAsRowVector
,
class
aAsColVector
>
class
aAsColVector
>
int
BaseMatrixT
<
T
>::
aggregate
(
Agg
agg
,
Op
op
,
Saver
sv
,
BaseMatrixT
&
b
,
int
BaseMatrixT
<
T
>::
aggregate
(
Agg
agg
,
int
numRows
,
int
numCols
,
MatrixOffset
&
offset
,
Op
op
,
aAsRowVector
,
aAsColVector
)
{
Saver
sv
,
BaseMatrixT
&
b
,
int
numRows
,
int
numCols
,
MatrixOffset
&
offset
,
aAsRowVector
,
aAsColVector
)
{
CHECK_EQ
(
useGpu_
,
b
.
useGpu_
);
CHECK_EQ
(
useGpu_
,
b
.
useGpu_
);
int
ld
=
stride_
;
int
ld
=
stride_
;
...
@@ -273,10 +299,10 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
...
@@ -273,10 +299,10 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
T
*
dst
=
data_
;
T
*
dst
=
data_
;
T
*
B
=
b
.
data_
;
T
*
B
=
b
.
data_
;
CAL_MATRIX_START_ADDRESS
(
dst
,
height_
,
width_
,
ld
,
offset
.
aCol_
,
CAL_MATRIX_START_ADDRESS
(
offset
.
aRow_
);
dst
,
height_
,
width_
,
ld
,
offset
.
aCol_
,
offset
.
aRow_
);
CAL_MATRIX_START_ADDRESS
(
B
,
b
.
height_
,
b
.
width_
,
ldb
,
offset
.
bCol_
,
CAL_MATRIX_START_ADDRESS
(
offset
.
bRow_
);
B
,
b
.
height_
,
b
.
width_
,
ldb
,
offset
.
bCol_
,
offset
.
bRow_
);
if
(
aAsRowVector
::
value
&&
!
aAsColVector
::
value
)
{
if
(
aAsRowVector
::
value
&&
!
aAsColVector
::
value
)
{
if
(
useGpu_
)
{
if
(
useGpu_
)
{
...
@@ -297,12 +323,21 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
...
@@ -297,12 +323,21 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
return
0
;
return
0
;
}
}
template
<
class
T
>
template
<
class
T
>
template
<
class
Agg
,
class
Op
,
class
Saver
,
class
aAsRowVector
,
template
<
class
Agg
,
class
Op
,
class
Saver
,
class
aAsRowVector
,
class
aAsColVector
>
class
aAsColVector
>
int
BaseMatrixT
<
T
>::
aggregate
(
Agg
agg
,
Op
op
,
Saver
sv
,
BaseMatrixT
&
b
,
int
BaseMatrixT
<
T
>::
aggregate
(
Agg
agg
,
BaseMatrixT
&
c
,
int
numRows
,
int
numCols
,
Op
op
,
MatrixOffset
&
offset
,
aAsRowVector
,
Saver
sv
,
BaseMatrixT
&
b
,
BaseMatrixT
&
c
,
int
numRows
,
int
numCols
,
MatrixOffset
&
offset
,
aAsRowVector
,
aAsColVector
)
{
aAsColVector
)
{
CHECK_EQ
(
useGpu_
,
b
.
useGpu_
);
CHECK_EQ
(
useGpu_
,
b
.
useGpu_
);
CHECK_EQ
(
useGpu_
,
c
.
useGpu_
);
CHECK_EQ
(
useGpu_
,
c
.
useGpu_
);
...
@@ -314,28 +349,28 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
...
@@ -314,28 +349,28 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
T
*
dst
=
data_
;
T
*
dst
=
data_
;
T
*
B
=
b
.
data_
;
T
*
B
=
b
.
data_
;
T
*
C
=
c
.
data_
;
T
*
C
=
c
.
data_
;
CAL_MATRIX_START_ADDRESS
(
dst
,
height_
,
width_
,
ld
,
offset
.
aCol_
,
CAL_MATRIX_START_ADDRESS
(
offset
.
aRow_
);
dst
,
height_
,
width_
,
ld
,
offset
.
aCol_
,
offset
.
aRow_
);
CAL_MATRIX_START_ADDRESS
(
B
,
b
.
height_
,
b
.
width_
,
ldb
,
offset
.
bCol_
,
CAL_MATRIX_START_ADDRESS
(
offset
.
bRow_
);
B
,
b
.
height_
,
b
.
width_
,
ldb
,
offset
.
bCol_
,
offset
.
bRow_
);
CAL_MATRIX_START_ADDRESS
(
C
,
c
.
height_
,
c
.
width_
,
ldc
,
offset
.
cCol_
,
CAL_MATRIX_START_ADDRESS
(
offset
.
cRow_
);
C
,
c
.
height_
,
c
.
width_
,
ldc
,
offset
.
cCol_
,
offset
.
cRow_
);
if
(
aAsRowVector
::
value
&&
!
aAsColVector
::
value
)
{
if
(
aAsRowVector
::
value
&&
!
aAsColVector
::
value
)
{
if
(
useGpu_
)
{
if
(
useGpu_
)
{
hl_gpu_matrix_column_op
(
agg
,
op
,
sv
,
numRows
,
numCols
,
dst
,
B
,
hl_gpu_matrix_column_op
(
ldb
,
C
,
ldc
);
agg
,
op
,
sv
,
numRows
,
numCols
,
dst
,
B
,
ldb
,
C
,
ldc
);
}
else
{
}
else
{
hl_cpu_matrix_column_op
(
agg
,
op
,
sv
,
numRows
,
numCols
,
dst
,
B
,
hl_cpu_matrix_column_op
(
ldb
,
C
,
ldc
);
agg
,
op
,
sv
,
numRows
,
numCols
,
dst
,
B
,
ldb
,
C
,
ldc
);
}
}
}
else
if
(
!
aAsRowVector
::
value
&&
aAsColVector
::
value
)
{
}
else
if
(
!
aAsRowVector
::
value
&&
aAsColVector
::
value
)
{
if
(
useGpu_
)
{
if
(
useGpu_
)
{
hl_gpu_matrix_row_op
(
agg
,
op
,
sv
,
numRows
,
numCols
,
dst
,
ld
,
B
,
hl_gpu_matrix_row_op
(
ldb
,
C
,
ldc
);
agg
,
op
,
sv
,
numRows
,
numCols
,
dst
,
ld
,
B
,
ldb
,
C
,
ldc
);
}
else
{
}
else
{
hl_cpu_matrix_row_op
(
agg
,
op
,
sv
,
numRows
,
numCols
,
dst
,
ld
,
B
,
hl_cpu_matrix_row_op
(
ldb
,
C
,
ldc
);
agg
,
op
,
sv
,
numRows
,
numCols
,
dst
,
ld
,
B
,
ldb
,
C
,
ldc
);
}
}
}
else
{
}
else
{
LOG
(
FATAL
)
<<
"not supported"
;
LOG
(
FATAL
)
<<
"not supported"
;
...
@@ -350,15 +385,19 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
...
@@ -350,15 +385,19 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
*/
*/
DEFINE_MATRIX_UNARY_OP
(
Neg
,
a
=
-
a
);
DEFINE_MATRIX_UNARY_OP
(
Neg
,
a
=
-
a
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
neg
()
{
applyUnary
(
unary
::
Neg
<
T
>
());
}
void
BaseMatrixT
<
T
>::
neg
()
{
applyUnary
(
unary
::
Neg
<
T
>
());
}
DEFINE_MATRIX_UNARY_OP
(
Exp
,
a
=
exp
(
a
));
DEFINE_MATRIX_UNARY_OP
(
Exp
,
a
=
exp
(
a
));
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
exp2
()
{
applyUnary
(
unary
::
Exp
<
real
>
());
}
void
BaseMatrixT
<
real
>::
exp2
()
{
applyUnary
(
unary
::
Exp
<
real
>
());
}
DEFINE_MATRIX_UNARY_OP
(
Log
,
a
=
log
(
a
));
DEFINE_MATRIX_UNARY_OP
(
Log
,
a
=
log
(
a
));
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
log2
()
{
void
BaseMatrixT
<
real
>::
log2
()
{
if
(
useGpu_
)
{
if
(
useGpu_
)
{
applyUnary
(
unary
::
Log
<
real
>
());
applyUnary
(
unary
::
Log
<
real
>
());
...
@@ -368,30 +407,42 @@ void BaseMatrixT<real>::log2() {
...
@@ -368,30 +407,42 @@ void BaseMatrixT<real>::log2() {
}
}
DEFINE_MATRIX_UNARY_OP
(
Sqrt
,
a
=
sqrt
(
a
));
DEFINE_MATRIX_UNARY_OP
(
Sqrt
,
a
=
sqrt
(
a
));
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
sqrt2
()
{
applyUnary
(
unary
::
Sqrt
<
real
>
());
}
void
BaseMatrixT
<
real
>::
sqrt2
()
{
applyUnary
(
unary
::
Sqrt
<
real
>
());
}
DEFINE_MATRIX_UNARY_OP
(
Square
,
a
=
a
*
a
);
DEFINE_MATRIX_UNARY_OP
(
Square
,
a
=
a
*
a
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
square2
()
{
applyUnary
(
unary
::
Square
<
T
>
());
}
void
BaseMatrixT
<
T
>::
square2
()
{
applyUnary
(
unary
::
Square
<
T
>
());
}
DEFINE_MATRIX_UNARY_OP
(
Reciprocal
,
a
=
1.0
f
/
a
);
DEFINE_MATRIX_UNARY_OP
(
Reciprocal
,
a
=
1.0
f
/
a
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
reciprocal2
()
{
applyUnary
(
unary
::
Reciprocal
<
T
>
());
}
void
BaseMatrixT
<
T
>::
reciprocal2
()
{
applyUnary
(
unary
::
Reciprocal
<
T
>
());
}
DEFINE_MATRIX_UNARY_OP
(
Abs
,
a
=
a
>
0
?
a
:
-
a
);
DEFINE_MATRIX_UNARY_OP
(
Abs
,
a
=
a
>
0
?
a
:
-
a
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
abs2
()
{
applyUnary
(
unary
::
Abs
<
T
>
());
}
void
BaseMatrixT
<
T
>::
abs2
()
{
applyUnary
(
unary
::
Abs
<
T
>
());
}
DEFINE_MATRIX_UNARY_OP
(
Sign
,
a
=
(
a
>
0
)
-
(
a
<
0
));
DEFINE_MATRIX_UNARY_OP
(
Sign
,
a
=
(
a
>
0
)
-
(
a
<
0
));
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
sign2
()
{
applyUnary
(
unary
::
Sign
<
T
>
());
}
void
BaseMatrixT
<
T
>::
sign2
()
{
applyUnary
(
unary
::
Sign
<
T
>
());
}
DEFINE_MATRIX_UNARY_OP
(
Zero
,
a
=
0
);
DEFINE_MATRIX_UNARY_OP
(
Zero
,
a
=
0
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
zero
()
{
applyUnary
(
unary
::
Zero
<
T
>
());
}
void
BaseMatrixT
<
T
>::
zero
()
{
applyUnary
(
unary
::
Zero
<
T
>
());
}
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
zeroAtOffset
(
int64_t
columnOffset
,
int64_t
numColumns
)
{
void
BaseMatrixT
<
T
>::
zeroAtOffset
(
int64_t
columnOffset
,
int64_t
numColumns
)
{
int
numRows
=
height_
;
int
numRows
=
height_
;
int
numCols
=
numColumns
;
int
numCols
=
numColumns
;
...
@@ -400,11 +451,13 @@ void BaseMatrixT<T>::zeroAtOffset(int64_t columnOffset, int64_t numColumns) {
...
@@ -400,11 +451,13 @@ void BaseMatrixT<T>::zeroAtOffset(int64_t columnOffset, int64_t numColumns) {
}
}
DEFINE_MATRIX_UNARY_OP
(
One
,
a
=
1
);
DEFINE_MATRIX_UNARY_OP
(
One
,
a
=
1
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
one
()
{
applyUnary
(
unary
::
One
<
T
>
());
}
void
BaseMatrixT
<
T
>::
one
()
{
applyUnary
(
unary
::
One
<
T
>
());
}
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
Pow
,
ONE_PARAMETER
,
a
=
pow
(
a
,
p
));
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
Pow
,
ONE_PARAMETER
,
a
=
pow
(
a
,
p
));
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
pow2
(
real
p
)
{
void
BaseMatrixT
<
real
>::
pow2
(
real
p
)
{
if
(
useGpu_
)
{
if
(
useGpu_
)
{
applyUnary
(
unary
::
Pow
<
real
>
(
p
));
applyUnary
(
unary
::
Pow
<
real
>
(
p
));
...
@@ -414,51 +467,67 @@ void BaseMatrixT<real>::pow2(real p) {
...
@@ -414,51 +467,67 @@ void BaseMatrixT<real>::pow2(real p) {
}
}
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
SubScalar
,
ONE_PARAMETER
,
a
-=
p
);
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
SubScalar
,
ONE_PARAMETER
,
a
-=
p
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
subScalar
(
T
p
)
{
applyUnary
(
unary
::
SubScalar
<
T
>
(
p
));
}
void
BaseMatrixT
<
T
>::
subScalar
(
T
p
)
{
applyUnary
(
unary
::
SubScalar
<
T
>
(
p
));
}
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
MulScalar
,
ONE_PARAMETER
,
a
*=
p
);
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
MulScalar
,
ONE_PARAMETER
,
a
*=
p
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
mulScalar
(
T
p
)
{
applyUnary
(
unary
::
MulScalar
<
T
>
(
p
));
}
void
BaseMatrixT
<
T
>::
mulScalar
(
T
p
)
{
applyUnary
(
unary
::
MulScalar
<
T
>
(
p
));
}
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
DivScalar
,
ONE_PARAMETER
,
a
/=
p
);
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
DivScalar
,
ONE_PARAMETER
,
a
/=
p
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
divScalar
(
T
p
)
{
applyUnary
(
unary
::
DivScalar
<
T
>
(
p
));
}
void
BaseMatrixT
<
T
>::
divScalar
(
T
p
)
{
applyUnary
(
unary
::
DivScalar
<
T
>
(
p
));
}
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
Assign
,
ONE_PARAMETER
,
a
=
p
);
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
Assign
,
ONE_PARAMETER
,
a
=
p
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
assign
(
T
p
)
{
applyUnary
(
unary
::
Assign
<
T
>
(
p
));
}
void
BaseMatrixT
<
T
>::
assign
(
T
p
)
{
applyUnary
(
unary
::
Assign
<
T
>
(
p
));
}
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
Add
,
ONE_PARAMETER
,
a
+=
p
);
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
Add
,
ONE_PARAMETER
,
a
+=
p
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
add
(
T
p
)
{
applyUnary
(
unary
::
Add
<
T
>
(
p
));
}
void
BaseMatrixT
<
T
>::
add
(
T
p
)
{
applyUnary
(
unary
::
Add
<
T
>
(
p
));
}
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
Add2
,
TWO_PARAMETER
,
a
=
a
*
p1
+
p2
);
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
Add2
,
TWO_PARAMETER
,
a
=
a
*
p1
+
p2
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
add
(
T
p1
,
T
p2
)
{
applyUnary
(
unary
::
Add2
<
T
>
(
p1
,
p2
));
}
void
BaseMatrixT
<
T
>::
add
(
T
p1
,
T
p2
)
{
applyUnary
(
unary
::
Add2
<
T
>
(
p1
,
p2
));
}
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
Clip
,
TWO_PARAMETER
,
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
Clip
,
TWO_PARAMETER
,
a
=
a
<
p1
?
p1
:
(
a
>
p2
?
p2
:
a
));
a
=
a
<
p1
?
p1
:
(
a
>
p2
?
p2
:
a
));
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
clip
(
T
p1
,
T
p2
)
{
applyUnary
(
unary
::
Clip
<
T
>
(
p1
,
p2
));
}
void
BaseMatrixT
<
T
>::
clip
(
T
p1
,
T
p2
)
{
applyUnary
(
unary
::
Clip
<
T
>
(
p1
,
p2
));
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
ClipDerivative
,
TWO_PARAMETER
,
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
ClipDerivative
,
a
=
b
<
p1
?
0
:
(
b
>
p2
?
0
:
1
));
TWO_PARAMETER
,
template
<
class
T
>
a
=
b
<
p1
?
0
:
(
b
>
p2
?
0
:
1
));
template
<
class
T
>
void
BaseMatrixT
<
T
>::
clipDerivative
(
BaseMatrixT
&
b
,
T
p1
,
T
p2
)
{
void
BaseMatrixT
<
T
>::
clipDerivative
(
BaseMatrixT
&
b
,
T
p1
,
T
p2
)
{
applyBinary
(
binary
::
ClipDerivative
<
T
>
(
p1
,
p2
),
b
);
applyBinary
(
binary
::
ClipDerivative
<
T
>
(
p1
,
p2
),
b
);
}
}
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
BiggerThanScalar
,
ONE_PARAMETER
,
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
BiggerThanScalar
,
ONE_PARAMETER
,
a
=
a
>
p
?
1.0
f
:
0.0
f
);
a
=
a
>
p
?
1.0
f
:
0.0
f
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
biggerThanScalar
(
T
p
)
{
void
BaseMatrixT
<
T
>::
biggerThanScalar
(
T
p
)
{
applyUnary
(
unary
::
BiggerThanScalar
<
T
>
(
p
));
applyUnary
(
unary
::
BiggerThanScalar
<
T
>
(
p
));
}
}
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
DownClip
,
ONE_PARAMETER
,
DEFINE_MATRIX_UNARY_PARAMETER_OP
(
DownClip
,
ONE_PARAMETER
,
a
=
a
>
p
?
a
:
p
);
a
=
a
>
p
?
a
:
p
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
downClip
(
T
p
)
{
void
BaseMatrixT
<
T
>::
downClip
(
T
p
)
{
applyUnary
(
unary
::
DownClip
<
T
>
(
p
));
applyUnary
(
unary
::
DownClip
<
T
>
(
p
));
}
}
...
@@ -469,12 +538,12 @@ void BaseMatrixT<T>::downClip(T p) {
...
@@ -469,12 +538,12 @@ void BaseMatrixT<T>::downClip(T p) {
*/
*/
DEFINE_MATRIX_BINARY_OP
(
Add
,
a
+=
b
);
DEFINE_MATRIX_BINARY_OP
(
Add
,
a
+=
b
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
add
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
T
>::
add
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
Add
<
T
>
(),
b
);
applyBinary
(
binary
::
Add
<
T
>
(),
b
);
}
}
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
add
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
real
>::
add
(
BaseMatrixT
&
b
)
{
if
(
useGpu_
)
{
if
(
useGpu_
)
{
applyBinary
(
binary
::
Add
<
real
>
(),
b
);
applyBinary
(
binary
::
Add
<
real
>
(),
b
);
...
@@ -485,7 +554,7 @@ void BaseMatrixT<real>::add(BaseMatrixT& b) {
...
@@ -485,7 +554,7 @@ void BaseMatrixT<real>::add(BaseMatrixT& b) {
}
}
}
}
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
addAtOffset
(
BaseMatrixT
&
b
,
int64_t
columnOffset
)
{
void
BaseMatrixT
<
T
>::
addAtOffset
(
BaseMatrixT
&
b
,
int64_t
columnOffset
)
{
if
(
columnOffset
+
b
.
width_
<=
width_
)
{
if
(
columnOffset
+
b
.
width_
<=
width_
)
{
int
numRows
=
height_
;
int
numRows
=
height_
;
...
@@ -504,43 +573,53 @@ void BaseMatrixT<T>::addAtOffset(BaseMatrixT& b, int64_t columnOffset) {
...
@@ -504,43 +573,53 @@ void BaseMatrixT<T>::addAtOffset(BaseMatrixT& b, int64_t columnOffset) {
}
}
}
}
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
addP2P
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
T
>::
addP2P
(
BaseMatrixT
&
b
)
{
T
*
A
=
data_
;
T
*
A
=
data_
;
T
*
B
=
b
.
data_
;
T
*
B
=
b
.
data_
;
int
dimM
=
height_
;
int
dimM
=
height_
;
int
dimN
=
width_
;
int
dimN
=
width_
;
hl_gpu_apply_binary_op
<
T
,
binary
::
Add
<
T
>
,
0
,
0
>
hl_gpu_apply_binary_op
<
T
,
binary
::
Add
<
T
>
,
0
,
0
>
(
(
binary
::
Add
<
T
>
(),
A
,
B
,
dimM
,
dimN
,
dimN
,
dimN
);
binary
::
Add
<
T
>
(),
A
,
B
,
dimM
,
dimN
,
dimN
,
dimN
);
}
}
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
addColVector
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
T
>::
addColVector
(
BaseMatrixT
&
b
)
{
MatrixOffset
offset
(
0
,
0
,
0
,
0
);
MatrixOffset
offset
(
0
,
0
,
0
,
0
);
int
numRows
=
height_
;
int
numRows
=
height_
;
int
numCols
=
width_
;
int
numCols
=
width_
;
applyBinary
(
binary
::
Add
<
T
>
(),
b
,
numRows
,
numCols
,
offset
,
false_type
(),
applyBinary
(
binary
::
Add
<
T
>
(),
b
,
numRows
,
numCols
,
offset
,
false_type
(),
true_type
()
/* bAsColVector */
);
true_type
()
/* bAsColVector */
);
}
}
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
addRowVector
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
T
>::
addRowVector
(
BaseMatrixT
&
b
)
{
MatrixOffset
offset
(
0
,
0
,
0
,
0
);
MatrixOffset
offset
(
0
,
0
,
0
,
0
);
int
numRows
=
height_
;
int
numRows
=
height_
;
int
numCols
=
width_
;
int
numCols
=
width_
;
applyBinary
(
binary
::
Add
<
T
>
(),
b
,
numRows
,
numCols
,
offset
,
applyBinary
(
binary
::
Add
<
T
>
(),
true_type
()
/* bAsRowVector */
,
false_type
());
b
,
numRows
,
numCols
,
offset
,
true_type
()
/* bAsRowVector */
,
false_type
());
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
Add1
,
ONE_PARAMETER
,
a
+=
b
*
p
);
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
Add1
,
ONE_PARAMETER
,
a
+=
b
*
p
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
add
(
BaseMatrixT
&
b
,
T
p
)
{
void
BaseMatrixT
<
T
>::
add
(
BaseMatrixT
&
b
,
T
p
)
{
applyBinary
(
binary
::
Add1
<
T
>
(
p
),
b
);
applyBinary
(
binary
::
Add1
<
T
>
(
p
),
b
);
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
Pow
,
ONE_PARAMETER
,
a
=
pow
(
b
,
p
));
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
Pow
,
ONE_PARAMETER
,
a
=
pow
(
b
,
p
));
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
pow2
(
BaseMatrixT
&
b
,
real
p
)
{
void
BaseMatrixT
<
real
>::
pow2
(
BaseMatrixT
&
b
,
real
p
)
{
if
(
useGpu_
)
{
if
(
useGpu_
)
{
applyBinary
(
binary
::
Pow
<
real
>
(
p
),
b
);
applyBinary
(
binary
::
Pow
<
real
>
(
p
),
b
);
...
@@ -550,36 +629,45 @@ void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) {
...
@@ -550,36 +629,45 @@ void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) {
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
Add2
,
TWO_PARAMETER
,
a
=
p1
*
a
+
p2
*
b
);
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
Add2
,
TWO_PARAMETER
,
a
=
p1
*
a
+
p2
*
b
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
add
(
BaseMatrixT
&
b
,
T
p1
,
T
p2
)
{
void
BaseMatrixT
<
T
>::
add
(
BaseMatrixT
&
b
,
T
p1
,
T
p2
)
{
applyBinary
(
binary
::
Add2
<
T
>
(
p1
,
p2
),
b
);
applyBinary
(
binary
::
Add2
<
T
>
(
p1
,
p2
),
b
);
}
}
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
addBias
(
BaseMatrixT
&
b
,
T
scale
)
{
void
BaseMatrixT
<
T
>::
addBias
(
BaseMatrixT
&
b
,
T
scale
)
{
MatrixOffset
offset
(
0
,
0
,
0
,
0
);
MatrixOffset
offset
(
0
,
0
,
0
,
0
);
int
numRows
=
height_
;
int
numRows
=
height_
;
int
numCols
=
width_
;
int
numCols
=
width_
;
applyBinary
(
binary
::
Add1
<
T
>
(
scale
),
b
,
numRows
,
numCols
,
offset
,
applyBinary
(
binary
::
Add1
<
T
>
(
scale
),
true_type
()
/* bAsRowVector */
,
false_type
());
b
,
numRows
,
numCols
,
offset
,
true_type
()
/* bAsRowVector */
,
false_type
());
}
}
DEFINE_MATRIX_BINARY_OP
(
Sub
,
a
-=
b
);
DEFINE_MATRIX_BINARY_OP
(
Sub
,
a
-=
b
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
sub
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
Sub
<
T
>
(),
b
);
}
void
BaseMatrixT
<
T
>::
sub
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
Sub
<
T
>
(),
b
);
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
Sub1
,
ONE_PARAMETER
,
a
-=
b
*
p
);
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
Sub1
,
ONE_PARAMETER
,
a
-=
b
*
p
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
sub
(
BaseMatrixT
&
b
,
T
p
)
{
void
BaseMatrixT
<
T
>::
sub
(
BaseMatrixT
&
b
,
T
p
)
{
applyBinary
(
binary
::
Sub1
<
T
>
(
p
),
b
);
applyBinary
(
binary
::
Sub1
<
T
>
(
p
),
b
);
}
}
DEFINE_MATRIX_BINARY_OP
(
Relu
,
b
=
a
>
0.0
f
?
a
:
0.0
f
);
DEFINE_MATRIX_BINARY_OP
(
Relu
,
b
=
a
>
0.0
f
?
a
:
0.0
f
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
relu
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
Relu
<
T
>
(),
b
);
}
void
BaseMatrixT
<
T
>::
relu
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
Relu
<
T
>
(),
b
);
}
DEFINE_MATRIX_BINARY_OP
(
ReluDerivative
,
a
*=
(
b
>
0.0
f
?
1.0
f
:
0.0
f
));
DEFINE_MATRIX_BINARY_OP
(
ReluDerivative
,
a
*=
(
b
>
0.0
f
?
1.0
f
:
0.0
f
));
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
reluDerivative
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
T
>::
reluDerivative
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
ReluDerivative
<
T
>
(),
b
);
applyBinary
(
binary
::
ReluDerivative
<
T
>
(),
b
);
}
}
...
@@ -589,7 +677,7 @@ DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0;
...
@@ -589,7 +677,7 @@ DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0;
?
THRESHOLD
?
THRESHOLD
:
((
a
<
-
THRESHOLD
)
?
(
-
THRESHOLD
)
:
((
a
<
-
THRESHOLD
)
?
(
-
THRESHOLD
)
:
a
))));
:
a
))));
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
softrelu
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
real
>::
softrelu
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
Softrelu
<
real
>
(),
b
);
applyBinary
(
binary
::
Softrelu
<
real
>
(),
b
);
}
}
...
@@ -599,97 +687,100 @@ DEFINE_MATRIX_BINARY_OP(
...
@@ -599,97 +687,100 @@ DEFINE_MATRIX_BINARY_OP(
a
*=
(
1.0
-
exp
(
-
1.0
*
((
b
>
THRESHOLD
)
a
*=
(
1.0
-
exp
(
-
1.0
*
((
b
>
THRESHOLD
)
?
THRESHOLD
?
THRESHOLD
:
((
b
<
-
THRESHOLD
)
?
(
-
THRESHOLD
)
:
b
)))));
:
((
b
<
-
THRESHOLD
)
?
(
-
THRESHOLD
)
:
b
)))));
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
softreluDerivative
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
real
>::
softreluDerivative
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
SoftreluDerivative
<
real
>
(),
b
);
applyBinary
(
binary
::
SoftreluDerivative
<
real
>
(),
b
);
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
Brelu
,
TWO_PARAMETER
,
b
=
a
>
p1
?
a
:
p1
;
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
Brelu
,
TWO_PARAMETER
,
b
=
a
>
p1
?
a
:
p1
;
b
=
b
<
p2
?
b
:
p2
);
b
=
b
<
p2
?
b
:
p2
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
brelu
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
T
>::
brelu
(
BaseMatrixT
&
b
)
{
int
p1
=
0
,
p2
=
24
;
//! TODO(yuyang18): Make p1,p2 configuable.
int
p1
=
0
,
p2
=
24
;
//! TODO(yuyang18): Make p1,p2 configuable.
applyBinary
(
binary
::
Brelu
<
T
>
(
p1
,
p2
),
b
);
applyBinary
(
binary
::
Brelu
<
T
>
(
p1
,
p2
),
b
);
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
BreluDerivative
,
TWO_PARAMETER
,
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
BreluDerivative
,
TWO_PARAMETER
,
a
*=
(
b
>
p1
&&
b
<
p2
)
?
1.0
:
0.0
);
a
*=
(
b
>
p1
&&
b
<
p2
)
?
1.0
:
0.0
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
breluDerivative
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
T
>::
breluDerivative
(
BaseMatrixT
&
b
)
{
int
p1
=
0
,
p2
=
24
;
int
p1
=
0
,
p2
=
24
;
applyBinary
(
binary
::
BreluDerivative
<
T
>
(
p1
,
p2
),
b
);
applyBinary
(
binary
::
BreluDerivative
<
T
>
(
p1
,
p2
),
b
);
}
}
DEFINE_MATRIX_BINARY_OP
(
Square
,
b
=
a
*
a
);
DEFINE_MATRIX_BINARY_OP
(
Square
,
b
=
a
*
a
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
square2
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
T
>::
square2
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
Square
<
T
>
(),
b
);
applyBinary
(
binary
::
Square
<
T
>
(),
b
);
}
}
DEFINE_MATRIX_BINARY_OP
(
SquareDerivative
,
a
*=
2.0
*
b
);
DEFINE_MATRIX_BINARY_OP
(
SquareDerivative
,
a
*=
2.0
*
b
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
squareDerivative
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
T
>::
squareDerivative
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
SquareDerivative
<
T
>
(),
b
);
applyBinary
(
binary
::
SquareDerivative
<
T
>
(),
b
);
}
}
DEFINE_MATRIX_BINARY_OP
(
Tanh
,
DEFINE_MATRIX_BINARY_OP
(
Tanh
,
T
tmp
=
-
2.0
*
a
;
T
tmp
=
-
2.0
*
a
;
tmp
=
(
tmp
>
EXP_MAX_INPUT
)
?
EXP_MAX_INPUT
:
tmp
;
tmp
=
(
tmp
>
EXP_MAX_INPUT
)
?
EXP_MAX_INPUT
:
tmp
;
b
=
2.0
/
(
1.0
+
std
::
exp
(
tmp
))
-
1.0
);
b
=
2.0
/
(
1.0
+
std
::
exp
(
tmp
))
-
1.0
);
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
tanh
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
real
>::
tanh
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
Tanh
<
real
>
(),
b
);
applyBinary
(
binary
::
Tanh
<
real
>
(),
b
);
}
}
DEFINE_MATRIX_BINARY_OP
(
TanhDerivative
,
a
*=
1
-
b
*
b
);
DEFINE_MATRIX_BINARY_OP
(
TanhDerivative
,
a
*=
1
-
b
*
b
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
tanhDerivative
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
T
>::
tanhDerivative
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
TanhDerivative
<
T
>
(),
b
);
applyBinary
(
binary
::
TanhDerivative
<
T
>
(),
b
);
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
ScaledTanh
,
TWO_PARAMETER
,
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
b
=
p1
*
ScaledTanh
,
TWO_PARAMETER
,
b
=
p1
*
(
2.0
/
(
1.0
+
exp
(
-
2
*
p2
*
a
))
-
1.0
));
(
2.0
/
(
1.0
+
exp
(
-
2
*
p2
*
a
))
-
1.0
));
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
scaledTanh
(
BaseMatrixT
&
b
,
real
p1
,
real
p2
)
{
void
BaseMatrixT
<
real
>::
scaledTanh
(
BaseMatrixT
&
b
,
real
p1
,
real
p2
)
{
applyBinary
(
binary
::
ScaledTanh
<
real
>
(
p1
,
p2
),
b
);
applyBinary
(
binary
::
ScaledTanh
<
real
>
(
p1
,
p2
),
b
);
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
ScaledTanhDerivative
,
TWO_PARAMETER
,
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
ScaledTanhDerivative
,
TWO_PARAMETER
,
a
*=
p2
*
(
p1
-
b
*
b
));
a
*=
p2
*
(
p1
-
b
*
b
));
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
scaledTanhDerivative
(
BaseMatrixT
&
b
,
T
p1
,
T
p2
)
{
void
BaseMatrixT
<
T
>::
scaledTanhDerivative
(
BaseMatrixT
&
b
,
T
p1
,
T
p2
)
{
applyBinary
(
binary
::
ScaledTanhDerivative
<
T
>
(
p1
*
p1
,
p2
/
p1
),
b
);
applyBinary
(
binary
::
ScaledTanhDerivative
<
T
>
(
p1
*
p1
,
p2
/
p1
),
b
);
}
}
DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a);
template <class T>
void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b) {
  applyBinary(binary::Reciprocal<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b);
template <class T>
void BaseMatrixT<T>::reciprocalDerivative(BaseMatrixT& b) {
  applyBinary(binary::ReciprocalDerivative<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a);
template <class T>
void BaseMatrixT<T>::abs2(BaseMatrixT& b) {
  applyBinary(binary::Abs<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0);
template <class T>
void BaseMatrixT<T>::absDerivative(BaseMatrixT& b) {
  applyBinary(binary::AbsDerivative<T>(), b);
}
DEFINE_MATRIX_BINARY_OP(Sigmoid, const T THRESHOLD_MIN = -40.0;
                        const T THRESHOLD_MAX = 13.0;
                        T tmp = (a < THRESHOLD_MIN)
                                    ? THRESHOLD_MIN
                                    : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a);
                        b = 1.0f / (1.0f + exp(-tmp)));
template <>
void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) {
  if (useGpu_) {
    applyBinary(binary::Sigmoid<real>(), b);
...
@@ -723,31 +814,31 @@ void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) {
}
DEFINE_MATRIX_BINARY_OP(SigmoidDerivative, a *= b * (1 - b));
template <class T>
void BaseMatrixT<T>::sigmoidDerivative(BaseMatrixT& b) {
  applyBinary(binary::SigmoidDerivative<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(ExpDerivative, a *= b);
template <class T>
void BaseMatrixT<T>::expDerivative(BaseMatrixT& b) {
  applyBinary(binary::ExpDerivative<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f);
template <class T>
void BaseMatrixT<T>::sign2(BaseMatrixT& b) {
  applyBinary(binary::Sign<T>(), b);
}
DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b));
template <>
void BaseMatrixT<real>::exp2(BaseMatrixT& b) {
  applyBinary(binary::Exp<real>(), b);
}

DEFINE_MATRIX_BINARY_OP(Log, a = log(b));
template <>
void BaseMatrixT<real>::log2(BaseMatrixT& b) {
  if (useGpu_) {
    applyBinary(binary::Log<real>(), b);
...
@@ -757,13 +848,13 @@ void BaseMatrixT<real>::log2(BaseMatrixT& b) {
}

DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b));
template <>
void BaseMatrixT<real>::sqrt2(BaseMatrixT& b) {
  applyBinary(binary::Sqrt<real>(), b);
}
DEFINE_MATRIX_BINARY_OP(InvSqrt, a = 1.0f / sqrt(b));
template <>
void BaseMatrixT<real>::invSqrt(BaseMatrixT& b) {
  if (useGpu_) {
    applyBinary(binary::InvSqrt<real>(), b);
...
@@ -775,37 +866,37 @@ void BaseMatrixT<real>::invSqrt(BaseMatrixT& b) {
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(IsEqual, ONE_PARAMETER, a = (b == p));
template <class T>
void BaseMatrixT<T>::isEqualTo(BaseMatrixT& b, T value) {
  applyBinary(binary::IsEqual<T>(value), b);
}
DEFINE_MATRIX_BINARY_PARAMETER_OP(AddScalar, ONE_PARAMETER, a = b + p);
template <class T>
void BaseMatrixT<T>::addScalar(BaseMatrixT& b, T p) {
  applyBinary(binary::AddScalar<T>(p), b);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a = b - p);
template <class T>
void BaseMatrixT<T>::subScalar(BaseMatrixT& b, T p) {
  applyBinary(binary::SubScalar<T>(p), b);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a = b * p);
template <class T>
void BaseMatrixT<T>::mulScalar(BaseMatrixT& b, T p) {
  applyBinary(binary::MulScalar<T>(p), b);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a = b / p);
template <class T>
void BaseMatrixT<T>::divScalar(BaseMatrixT& b, T p) {
  applyBinary(binary::DivScalar<T>(p), b);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(ScalarDiv, ONE_PARAMETER, a = p / b);
template <class T>
void BaseMatrixT<T>::scalarDiv(BaseMatrixT& b, T p) {
  applyBinary(binary::ScalarDiv<T>(p), b);
}
...
@@ -817,20 +908,20 @@ void BaseMatrixT<T>::scalarDiv(BaseMatrixT& b, T p) {
DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy,
                         a = -c * log(b) - (1 - c) * log(1 - b));
template <>
void BaseMatrixT<real>::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::SoftCrossEntropy<real>(), b, c);
}

DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b)));
template <class T>
void BaseMatrixT<T>::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::SoftCrossEntropyBp<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy,
                         a = c > 0.5 ? -log(b) : -log(1.0 - b));
template <>
void BaseMatrixT<real>::binaryLabelCrossEntropy(BaseMatrixT& b,
                                                BaseMatrixT& c) {
  if (useGpu_) {
...
@@ -858,70 +949,73 @@ void BaseMatrixT<real>::binaryLabelCrossEntropy(BaseMatrixT& b,

DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp,
                         a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b));
template <class T>
void BaseMatrixT<T>::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::BinaryCrossEntropyBp<T>(), b, c);
}
DEFINE_MATRIX_TERNARY_OP(Add, a = b + c);
template <class T>
void BaseMatrixT<T>::add(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::Add<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c);
template <class T>
void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
  applyTernary(ternary::Add1<T>(p1, p2), b, c);
}

DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c);
template <class T>
void BaseMatrixT<T>::sub(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::Sub<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c);
template <class T>
void BaseMatrixT<T>::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
  applyTernary(ternary::Sub1<T>(p1, p2), b, c);
}

DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c);
template <class T>
void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::Add2<T>(), b, c);
}
DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, THREE_PARAMETER,
                                   a = p1 * a + p2 * b + p3 * c);
template <class T>
void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
  applyTernary(ternary::Add3<T>(p1, p2, p3), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER,
                                   c = p2 * c - p1 * (b + p3 * a);
                                   a = a + c);
template <class T>
void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b,  // grad
                               BaseMatrixT& c,  // mom
                               T p1,            // learningRate,
                               T p2,            // momentum,
                               T p3) {          // decayRate
  applyTernary(ternary::SgdUpdate<T>(p1, p2, p3), b, c);
}

DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER,
                                      c = p2 * c - p1 * d * (b + p3 * a);
                                      a += c);
template <class T>
void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b,  // grad,
                               BaseMatrixT& c,  // mom,
                               BaseMatrixT& d,  // lr,
                               T p1,            // learningRate,
                               T p2,            // momentum,
                               T p3) {          // decayRate
  applyQuaternary(quaternary::SgdUpdate<T>(p1, p2, p3), b, c, d);
}
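For reference, the element-wise update encoded by the SgdUpdate functor above (with p1/p2/p3 matching the learningRate/momentum/decayRate comments in sgdUpdate) can be written for a single scalar weight as the following standalone sketch; the numeric values are arbitrary and only illustrate the arithmetic:

#include <cstdio>

int main() {
  float value = 1.0f, grad = 0.5f, mom = 0.0f;  // a, b, c in the functor
  const float learningRate = 0.1f;              // p1
  const float momentum = 0.9f;                  // p2
  const float decayRate = 0.01f;                // p3 (weight decay)

  // c = p2 * c - p1 * (b + p3 * a);  a = a + c;
  mom = momentum * mom - learningRate * (grad + decayRate * value);
  value = value + mom;

  std::printf("value=%f mom=%f\n", value, mom);
  return 0;
}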
...
@@ -929,19 +1023,22 @@ DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b;
                                  a = (a > lambda)
                                          ? (a - lambda)
                                          : (a < -lambda) ? (a + lambda) : 0);
template <class T>
void BaseMatrixT<T>::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) {
  applyBinary(binary::ApplyL1<T>(learningRate * decayRate), lr);
}

template <>
void BaseMatrixT<real>::applyL1(BaseMatrixT& lr,
                                real learningRate,
                                real decayRate) {
  if (useGpu_) {
    applyBinary(binary::ApplyL1<real>(learningRate * decayRate), lr);
  } else {
    simd::decayL1(this->data_, this->data_, lr.data_, learningRate * decayRate,
                  height_ * width_);
  }
}
...
@@ -950,24 +1047,25 @@ DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p;
                                 a = (a > lambda)
                                         ? (a - lambda)
                                         : (a < -lambda) ? (a + lambda) : 0);
template <class T>
void BaseMatrixT<T>::applyL1(T learningRate, T decayRate) {
  applyUnary(unary::ApplyL1<T>(learningRate * decayRate));
}

template <>
void BaseMatrixT<real>::applyL1(real learningRate, real decayRate) {
  if (useGpu_) {
    applyUnary(unary::ApplyL1<real>(learningRate * decayRate));
  } else {
    simd::decayL1(this->data_, this->data_, learningRate * decayRate,
                  height_ * width_);
  }
}
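The ApplyL1 operators above implement soft-thresholding with lambda = learningRate * decayRate (optionally scaled per element by lr). A scalar sketch of that rule, written only to make the branch structure explicit; the values are arbitrary:

#include <cstdio>

// Same branch structure as the ApplyL1 functor: shrink toward zero by lambda.
float applyL1Scalar(float a, float lambda) {
  return (a > lambda) ? (a - lambda) : (a < -lambda) ? (a + lambda) : 0.0f;
}

int main() {
  const float lambda = 0.1f * 0.5f;  // learningRate * decayRate
  std::printf("%f %f %f\n", applyL1Scalar(0.8f, lambda),
              applyL1Scalar(-0.8f, lambda), applyL1Scalar(0.02f, lambda));
  return 0;
}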
DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, ONE_PARAMETER,
                                  a *= (1.0f / (1.0f + p * b)));
template <class T>
void BaseMatrixT<T>::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) {
  if (useGpu_) {
    applyBinary(binary::ApplyL2<T>(learningRate * decayRate), lr);
...
@@ -980,32 +1078,33 @@ void BaseMatrixT<T>::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) {
  }
}

template <class T>
void BaseMatrixT<T>::applyL2(T learningRate, T decayRate) {
  BaseMatrixT<T>::mulScalar(1.0f / (1.0f + learningRate * decayRate));
}

DEFINE_MATRIX_BINARY_OP(DotMul, a *= b);
template <class T>
void BaseMatrixT<T>::dotMul(BaseMatrixT& b) {
  applyBinary(binary::DotMul<T>(), b);
}

DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c);
template <class T>
void BaseMatrixT<T>::dotMul(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::DotMul<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c);
template <class T>
void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::DotDiv<T>(), b, c);
}
DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, TWO_PARAMETER,
                                   a = (b + p1) / (c + p2));
template <class T>
void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
  applyTernary(ternary::DotDiv2P<T>(p1, p2), b, c);
}
...
@@ -1015,7 +1114,7 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c;
                            ? THRESHOLD
                            : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
                            a = log(1 + exp(a)) - a * d);
template <>
void BaseMatrixT<real>::rankLoss(BaseMatrixT& b,
                                 BaseMatrixT& c,
                                 BaseMatrixT& d) {
...
@@ -1026,8 +1125,9 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c;
                            a = (a > THRESHOLD)
                                    ? THRESHOLD
                                    : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
                            a = exp(a);
                            a = (a / (1 + a) - d));
template <>
void BaseMatrixT<real>::rankLossBp(BaseMatrixT& b,
                                   BaseMatrixT& c,
                                   BaseMatrixT& d) {
...
@@ -1040,7 +1140,7 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0;
                         ? -THRESHOLD
                         : b;
                         a = log(1 + exp(x)) - c * x);
template <>
void BaseMatrixT<real>::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::LogisticRegressionLoss<real>(), b, c);
}
...
@@ -1050,22 +1150,23 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0;
                         T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
                                                                 ? -THRESHOLD
                                                                 : b;
                         x = exp(x);
                         a = x / (1 + x) - c);
template <>
void BaseMatrixT<real>::logisticRegressionLossBp(BaseMatrixT& b,
                                                 BaseMatrixT& c) {
  applyTernary(ternary::LogisticRegressionLossBp<real>(), b, c);
}
DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f);
template <class T>
void BaseMatrixT<T>::biggerThan(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::BiggerThan<T>(), b, c);
}

DEFINE_MATRIX_QUATERNARY_OP(
    BiggerThan,
    a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
template <class T>
void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
                                BaseMatrixT& c,
                                BaseMatrixT& d) {
...
@@ -1073,25 +1174,34 @@ void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
}

DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c);
template <class T>
void BaseMatrixT<T>::max2(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::Max<T>(), b, c);
}
DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, ONE_PARAMETER,
                                   c += ((a > p) == (b > p)) ? 0.0f : 1.0f);
template <class T>
void BaseMatrixT<T>::binaryClassificationError2(size_t destCol,
                                                BaseMatrixT& b,
                                                BaseMatrixT& c,
                                                T p) {
  CHECK(!useGpu_) << "do not support gpu";
  MatrixOffset offset(0, 0, 0, 0, destCol, 0);
  int numRows = b.height_;
  int numCols = b.width_;
  b.applyTernary(ternary::BinaryClassificationError<T>(p), c, *this, numRows,
                 numCols, offset, false_type(), true_type() /*cAsColVector*/);
}

template <>
void BaseMatrixT<real>::binaryClassificationError(size_t destCol,
                                                  BaseMatrixT& b,
                                                  BaseMatrixT& c,
...
@@ -1099,127 +1209,148 @@ void BaseMatrixT<real>::binaryClassificationError(size_t destCol,
  MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
  int numRows = b.height_;
  int numCols = b.width_;
  aggregate(aggregate::sum(), base::binary::classificationError(p),
            base::binary::add(), b, c, numRows, numCols, offset, false_type(),
            true_type() /*aAsColVector*/);
}

DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, THREE_PARAMETER,
                                      a = p1 * b + p2 * c + p3 * d);
template <class T>
void BaseMatrixT<T>::add3(
    BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3) {
  applyQuaternary(quaternary::Add3<T>(p1, p2, p3), b, c, d);
}
DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c);
template <class T>
void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::DotMulSquare<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c);
template <class T>
void BaseMatrixT<T>::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::DotSquareSquare<T>(), b, c);
}

DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b);
template <class T>
void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b) {
  applyBinary(binary::DotMulSquare<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b);
template <class T>
void BaseMatrixT<T>::dotSquareMul(BaseMatrixT& b) {
  applyBinary(binary::DotSquareMul<T>(), b);
}

DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, THREE_PARAMETER,
                                      T tmp = p1 * b + p2 * c + p3 * d;
                                      a += tmp * tmp);
template <class T>
void BaseMatrixT<T>::addSquareSum(
    BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) {
  applyQuaternary(quaternary::AddSquareSum<T>(p1, p2, p3), b, c, d);
}
DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b);
template <class T>
void BaseMatrixT<T>::addSquare(BaseMatrixT& b, T p) {
  applyBinary(binary::AddSquare<T>(p), b);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, TWO_PARAMETER,
                                  a = p1 * a + p2 * b * b);
template <class T>
void BaseMatrixT<T>::decayAddSquare(BaseMatrixT& b, T p1, T p2) {
  applyBinary(binary::DecayAddSquare<T>(p1, p2), b);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, TWO_PARAMETER,
                                   a = p1 * a + p2 * b * b * c * c);
template <class T>
void BaseMatrixT<T>::decayAddSquareMul(BaseMatrixT& b,
                                       BaseMatrixT& c,
                                       T p1,
                                       T p2) {
  applyTernary(ternary::DecayAddSquareMul<T>(p1, p2), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, THREE_PARAMETER,
                                   a = 1 / (p1 * b + p2 * c + p3));
template <class T>
void BaseMatrixT<T>::reciprocalSum(
    BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
  applyTernary(ternary::ReciprocalSum<T>(p1, p2, p3), b, c);
}
DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER,
                                  a = 1 / (p1 * b + p2));
template <class T>
void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b, T p1, T p2) {
  applyBinary(binary::Reciprocal2<T>(p1, p2), b);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, TWO_PARAMETER,
                                   T tmp = p1 * b + p2 * c;
                                   a *= tmp * tmp);
template <class T>
void BaseMatrixT<T>::dotMulSquareSum(BaseMatrixT& b,
                                     BaseMatrixT& c,
                                     T p1,
                                     T p2) {
  applyTernary(ternary::DotMulSquareSum<T>(p1, p2), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, TWO_PARAMETER,
                                   T tmp = p1 * b + p2 * c;
                                   a = tmp * tmp);
template <class T>
void BaseMatrixT<T>::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
  applyTernary(ternary::DotSquareSum<T>(p1, p2), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, TWO_PARAMETER,
                                   a *= p1 * b + p2 * c);
template <class T>
void BaseMatrixT<T>::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
  applyTernary(ternary::DotMulSum<T>(p1, p2), b, c);
}
DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0);
template <class T>
void BaseMatrixT<T>::copyAndClear(BaseMatrixT& b) {
  applyBinary(binary::CopyAndClear<T>(), b);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, TWO_PARAMETER,
                                   a = p1 * a + p2 * b * c);
template <class T>
void BaseMatrixT<T>::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
  applyTernary(ternary::AddDotMul<T>(p1, p2), b, c);
}

DEFINE_MATRIX_BINARY_OP(Assign, a = b;);
template <class T>
void BaseMatrixT<T>::assign(BaseMatrixT& b) {
  if (useGpu_) {
    applyBinary(binary::Assign<T>(), b);
...
@@ -1230,7 +1361,7 @@ void BaseMatrixT<T>::assign(BaseMatrixT& b) {
  }
}

template <class T>
void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) {
  if (columnOffset + b.width_ <= width_) {
    int numRows = height_;
...
@@ -1250,24 +1381,31 @@ void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) {
}
DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp);
template <class T>
void BaseMatrixT<T>::deepSwap(BaseMatrixT& b) {
  applyBinary(binary::DeepSwap<T>(), b);
}

template <>
void BaseMatrixT<real>::rowDotMul(size_t destCol,
                                  BaseMatrixT& b,
                                  BaseMatrixT& c) {
  int numRows = b.height_;
  int numCols = b.width_;
  MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
  aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c,
            numRows, numCols, offset, false_type(),
            true_type() /*aAsColVector*/);
}

template <class T>
void BaseMatrixT<T>::rowDotMul2(size_t destCol,
                                BaseMatrixT& b,
                                BaseMatrixT& c) {
...
@@ -1290,17 +1428,24 @@ void BaseMatrixT<T>::rowDotMul2(size_t destCol,
  }
}
template <>
void BaseMatrixT<real>::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
  int numRows = b.height_;
  int numCols = b.width_;
  aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c,
            numRows, numCols, offset, true_type() /*aAsRowVector*/,
            false_type());
}

template <class T>
void BaseMatrixT<T>::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) {
  CHECK(!useGpu_) << "do not support gpu";
...
@@ -1321,16 +1466,22 @@ void BaseMatrixT<T>::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) {
}

DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c);
template <class T>
void BaseMatrixT<T>::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::addDotMulMMV<T>(), b, c, numRows, numCols, offset,
               true_type() /*cAsRowVector*/, false_type());
}

template <class T>
void BaseMatrixT<T>::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) {
  CHECK(!useGpu_) << "do not support gpu";
...
@@ -1350,16 +1501,22 @@ void BaseMatrixT<T>::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) {
  }
}
template <class T>
void BaseMatrixT<T>::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::DotMul<T>(), b, c, numRows, numCols, offset,
               false_type(), true_type() /*cAsColVector*/);
}

template <class T>
void BaseMatrixT<T>::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  CHECK(!useGpu_) << "do not support gpu";
...
@@ -1379,52 +1536,82 @@ void BaseMatrixT<T>::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  }
}

template <class T>
void BaseMatrixT<T>::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, 0, cRow);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::DotMul<T>(), b, c, numRows, numCols, offset,
               true_type() /* cAsRowVector */, false_type() /* cAsColVector */);
}

template <class T>
void BaseMatrixT<T>::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, 0, cRow);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::addDotMulMMV<T>(), b, c, numRows, numCols, offset,
               true_type() /* cAsRowVector */, false_type() /* cAsColVector */);
}

template <class T>
void BaseMatrixT<T>::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::addDotMulMMV<T>(), b, c, numRows, numCols, offset,
               false_type(), true_type() /*cAsColVector*/);
}
DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c);
template <class T>
void BaseMatrixT<T>::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) {
  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::RowAdd<T>(p), b, c, numRows, numCols, offset,
               false_type(), true_type() /*cAsColVector*/);
}

DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c));
template <>
void BaseMatrixT<real>::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  if (useGpu_) {
    MatrixOffset offset(0, 0, 0, 0, cCol, 0);
    int numRows = height_;
    int numCols = width_;
    applyTernary(ternary::RowPow<real>(), b, c, numRows, numCols, offset,
                 false_type(), true_type() /*cAsColVector*/);
  } else {
    size_t height = this->height_;
    size_t width = this->width_;
...
@@ -1441,44 +1628,64 @@ void BaseMatrixT<real>::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  }
}
template <class T>
void BaseMatrixT<T>::mulRowVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::DotMul<T>(), b, numRows, numCols, offset,
              true_type() /* bAsRowVector */, false_type());
}

DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b);
template <class T>
void BaseMatrixT<T>::divRowVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::DotDiv<T>(), b, numRows, numCols, offset,
              true_type() /* bAsRowVector */, false_type());
}

template <class T>
void BaseMatrixT<T>::mulColVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::DotMul<T>(), b, numRows, numCols, offset,
              false_type(), true_type() /* bAsColVector */);
}

template <class T>
void BaseMatrixT<T>::divColVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::DotDiv<T>(), b, numRows, numCols, offset,
              false_type(), true_type() /* bAsColVector */);
}
template <>
template <class Agg>
int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
...
@@ -1486,13 +1693,20 @@ int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
  size_t numCols = b.width_;
  CHECK_EQ(height_, numRows);
  CHECK_EQ(width_, 1UL);
  aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows,
            numCols, offset, false_type(), true_type() /*aAsColVector*/);
  return 0;
}

template <>
template <class Agg, class Saver>
int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
...
@@ -1500,16 +1714,25 @@ int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
  size_t numCols = b.width_;
  CHECK_EQ(height_, numRows);
  CHECK_EQ(width_, 1UL);
  aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset,
            false_type(), true_type() /*aAsColVector*/);
  return 0;
}

template <>
template <class Agg>
int BaseMatrixT<real>::applyRow(
    Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) {
  if (scaleDest != 0) {
    applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b);
  } else {
...
@@ -1521,10 +1744,10 @@ int BaseMatrixT<real>::applyRow(
  return 0;
}
template <>
template <class Agg, class Op, class Saver>
int BaseMatrixT<real>::applyRow(
    Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
  size_t numRows = b.height_;
  size_t numCols = b.width_;
...
@@ -1532,16 +1755,27 @@ int BaseMatrixT<real>::applyRow(Agg agg, Op op, Saver sv,
  CHECK_EQ(width_, 1UL);
  CHECK_EQ(c.height_, numRows);
  CHECK_EQ(c.width_, numCols);
  aggregate(agg, op, sv, b, c, numRows, numCols, offset, false_type(),
            true_type() /*aAsColVector*/);
  return 0;
}

template <>
template <class Agg, class Op>
int BaseMatrixT<real>::applyRow(Agg agg,
                                Op op,
                                real scaleDest,
                                real scaleAgg,
                                BaseMatrixT& b,
                                BaseMatrixT& c) {
  if (scaleDest != 0) {
    applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c);
  } else {
...
@@ -1553,7 +1787,7 @@ int BaseMatrixT<real>::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg,
  return 0;
}
template <>
template <class Agg>
int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
...
@@ -1561,13 +1795,20 @@ int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
  size_t numCols = b.width_;
  CHECK_EQ(width_, numCols);
  CHECK_EQ(height_, 1UL);
  aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows,
            numCols, offset, true_type() /*aAsRowVector*/, false_type());
  return 0;
}

template <>
template <class Agg, class Saver>
int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
...
@@ -1575,16 +1816,25 @@ int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
  size_t numCols = b.width_;
  CHECK_EQ(width_, numCols);
  CHECK_EQ(height_, 1UL);
  aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset,
            true_type() /*aAsRowVector*/, false_type());
  return 0;
}

template <>
template <class Agg>
int BaseMatrixT<real>::applyCol(
    Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) {
  if (scaleDest != 0) {
    applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b);
  } else {
...
@@ -1596,48 +1846,51 @@ int BaseMatrixT<real>::applyCol(
  return 0;
}
template <>
void BaseMatrixT<real>::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) {
  applyRow(aggregate::sum(), scaleDest, scaleSum, b);
}

template <>
void BaseMatrixT<real>::maxRows(BaseMatrixT& b) {
  applyRow(aggregate::max(), b);
}

template <>
void BaseMatrixT<real>::minRows(BaseMatrixT& b) {
  applyRow(aggregate::min(), b);
}

template <>
void BaseMatrixT<real>::maxCols(BaseMatrixT& b) {
  applyCol(aggregate::max(), b);
}

template <>
void BaseMatrixT<real>::minCols(BaseMatrixT& b) {
  applyCol(aggregate::min(), b);
}

template <>
void BaseMatrixT<real>::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) {
  applyCol(aggregate::sum(), scaleDest, scaleSum, b);
}

template <>
void BaseMatrixT<real>::sumOfSquaredDiffs(BaseMatrixT& b,
                                          BaseMatrixT& c,
                                          real scaleSum,
                                          real scaleDest) {
  applyRow(aggregate::sum(), base::binary::squaredDiff(), scaleDest, scaleSum,
           b, c);
}

template <>
void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b,
                                      BaseMatrixT& c,
                                      real scaleSum,
                                      real scaleDest) {
  applyRow(aggregate::sum(), base::binary::mul(), scaleDest, scaleSum, b, c);
}

template class BaseMatrixT<real>;
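The row/column reductions above (sumRows, maxRows, sumCols, ...) are all built on the applyRow/applyCol aggregates. As an assumption-level sketch of what sumRows(b, scaleSum, scaleDest) computes, taking base::binary::add2(scaleDest, scaleAgg) to mean dest = scaleDest * dest + scaleAgg * aggregate, and with plain arrays standing in for BaseMatrixT (the destination being a column vector of row sums):

#include <vector>

// dest[r] = scaleDest * dest[r] + scaleSum * sum over columns c of b(r, c)
void sumRowsSketch(std::vector<float>& dest, const std::vector<float>& b,
                   int rows, int cols, float scaleSum, float scaleDest) {
  for (int r = 0; r < rows; ++r) {
    float s = 0.0f;
    for (int c = 0; c < cols; ++c) s += b[r * cols + c];
    dest[r] = scaleDest * dest[r] + scaleSum * s;
  }
}

int main() {
  std::vector<float> b = {1, 2, 3, 4, 5, 6};  // 2 x 3, row major
  std::vector<float> dest(2, 0.0f);
  sumRowsSketch(dest, b, 2, 3, /*scaleSum=*/1.0f, /*scaleDest=*/0.0f);
  return 0;  // dest == {6, 15}
}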
...

paddle/math/TrainingAlgorithmOp.cu  (View file @ 1d4fa243)
...
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "BaseMatrix.h"
#include "TrainingAlgorithmOp.h"
#include "paddle/utils/Logging.h"
#if __cplusplus > 199711L
...
@@ -32,10 +32,10 @@ void sparseMomentumApply(BaseMatrix& value,
real
tau
,
real
tau
,
real
learningRate
)
{
real
learningRate
)
{
auto
expr1
=
momU
.
lazyAssign
(
momU
-
(
alpha
*
gamma
*
learningRate
)
*
grad
);
auto
expr1
=
momU
.
lazyAssign
(
momU
-
(
alpha
*
gamma
*
learningRate
)
*
grad
);
auto
expr2
=
momV
.
lazyAssign
(
auto
expr2
=
momV
+
(
tau
*
alpha
*
gamma
*
learningRate
)
*
grad
);
momV
.
lazyAssign
(
momV
+
(
tau
*
alpha
*
gamma
*
learningRate
)
*
grad
);
auto
expr3
=
value
.
lazyAssign
(
auto
expr3
=
value
.
lazyAssign
(
(
tau
/
beta
+
(
real
)
1
/
alpha
)
*
momU
+
(
tau
/
beta
+
(
real
)
1
/
alpha
)
*
momU
+
((
real
)
1
/
beta
)
*
momV
);
((
real
)
1
/
beta
)
*
momV
);
AssignEvaluate
(
expr1
,
expr2
,
expr3
);
AssignEvaluate
(
expr1
,
expr2
,
expr3
);
}
}
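The pattern above recurs throughout this file: lazyAssign only records an expression object, and AssignEvaluate later evaluates the recorded expressions, so several element-wise updates can be applied in one pass. A minimal sketch of the same pattern (the matrix names here are placeholders, not the real call sites):

// Sketch of the lazyAssign / AssignEvaluate idiom used in this file.
void lazyAssignSketch(BaseMatrix& mom, BaseMatrix& value, BaseMatrix& grad, real lr) {
  auto expr1 = mom.lazyAssign(mom - lr * grad);  // recorded, not yet computed
  auto expr2 = value.lazyAssign(value + mom);    // recorded, not yet computed
  AssignEvaluate(expr1, expr2);                  // both updates evaluated here
}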
...
@@ -52,12 +52,12 @@ void adadeltaApply(BaseMatrix& value,
                   real momentum,
                   real decayRate) {
  auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square());
  auto expr2 =
      lr.lazyAssign(((accum_update + epsilon) / (accum + epsilon)).sqrt());
  auto expr3 = accum_update.lazyAssign(rou * accum_update +
                                       ((real)1 - rou) * (grad * lr).square());
  auto expr4 = mom.lazyAssign(mom * momentum -
                              learningRate * lr * (grad + value * decayRate));
  auto expr5 = value.lazyAssign(value + mom);
  AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
...
@@ -74,10 +74,10 @@ void adagradApply(BaseMatrix& value,
                  real momentum,
                  real decayRate) {
  auto expr1 = accum.lazyAssign(accum + grad.square());
  auto expr2 =
      lr.lazyAssign((accum_buffer + accum + epsilon).sqrt().reciprocal());
  auto expr3 = mom.lazyAssign(mom * momentum -
                              learningRate * lr * (grad + value * decayRate));
  auto expr4 = value.lazyAssign(value + mom);
  AssignEvaluate(expr1, expr2, expr3, expr4);
...
@@ -98,8 +98,8 @@ void rmspropApply(BaseMatrix& value,
                  bool firstTime) {
  auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad);
  auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal());
  auto expr4 = mom.lazyAssign(mom * momentum -
                              learningRate * lr * (grad + value * decayRate));
  auto expr5 = value.lazyAssign(value + mom);

  if (firstTime) {
...
@@ -107,8 +107,8 @@ void rmspropApply(BaseMatrix& value,
    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
  } else {
    auto expr1 =
        g.lazyAssign(accumulatedRou * g + ((real)1 - rou) * grad.square());
    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
  }
...
@@ -127,8 +127,8 @@ void decayedAdagradApply(BaseMatrix& value,
                         real decayRate,
                         bool firstTime) {
  auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal());
  auto expr3 = mom.lazyAssign(mom * momentum -
                              learningRate * lr * (grad + value * decayRate));
  auto expr4 = value.lazyAssign(value + mom);

  if (firstTime) {
...
@@ -136,8 +136,8 @@ void decayedAdagradApply(BaseMatrix& value,
    AssignEvaluate(expr1, expr2, expr3, expr4);
  } else {
    auto expr1 = accum.lazyAssign(accumulatedRou * accum +
                                  ((real)1 - rou) * grad.square());
    AssignEvaluate(expr1, expr2, expr3, expr4);
  }
...
@@ -153,13 +153,12 @@ void adamApply(BaseMatrix& value,
               real beta2_power,
               real epsilon,
               real learningRate) {
  real alpha =
      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);

  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
  auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square());
  auto expr3 = value.lazyAssign(value - (mom * alpha) / (v.sqrt() + epsilon));
  AssignEvaluate(expr1, expr2, expr3);
}
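For reference, the expressions in adamApply correspond to the standard Adam update, written with the same symbols as the code (background note, not introduced by this commit):

\alpha = \eta \,\frac{\sqrt{1-\beta_2^{t}}}{1-\beta_1^{t}}, \qquad
m_t = \beta_1 m_{t-1} + (1-\beta_1)\, g_t, \qquad
v_t = \beta_2 v_{t-1} + (1-\beta_2)\, g_t^2, \qquad
\theta_t = \theta_{t-1} - \frac{\alpha\, m_t}{\sqrt{v_t} + \epsilon}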
...
@@ -173,10 +172,10 @@ void adamaxApply(BaseMatrix& value,
                 int64_t step,
                 real alpha) {
  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
  auto expr2 =
      u.lazyAssign((beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()));
  auto expr3 = value.lazyAssign(
      value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u));
  AssignEvaluate(expr1, expr2, expr3);
}
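Similarly, adamaxApply implements the AdaMax (infinity-norm) variant, where the second-moment estimate is replaced by a running maximum:

m_t = \beta_1 m_{t-1} + (1-\beta_1)\, g_t, \qquad
u_t = \max(\beta_2 u_{t-1},\, |g_t|), \qquad
\theta_t = \theta_{t-1} - \frac{\alpha}{1-\beta_1^{t}}\cdot\frac{m_t}{u_t}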
...
@@ -322,8 +321,8 @@ void adamApply(BaseMatrix& value,
               real beta2_power,
               real epsilon,
               real learningRate) {
  real alpha =
      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);

  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
  mom = beta1 * mom + ((real)1 - beta1) * grad;
...
@@ -331,7 +330,7 @@ void adamApply(BaseMatrix& value,
  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
  v = beta2 * v + ((real)1 - beta2) * grad.square();

  value -= (mom * alpha) / (v.sqrt() + epsilon);
}

void adamaxApply(BaseMatrix& value,
...
paddle/math/tests/test_Tensor.cu
View file @ 1d4fa243
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include <gtest/gtest.h>
#include "TensorCheck.h"
#include "paddle/math/Matrix.h"

using paddle::Matrix;
using paddle::CpuMatrix;
...
@@ -26,25 +26,25 @@ using paddle::GpuIVector;
using autotest::TensorCheckEqual;
using autotest::TensorCheckErr;

#define INIT_UNARY(A1, A2)  \
  Tensor A1(height, width); \
  Tensor A2(height, width); \
  A1.randomizeUniform();    \
  A2.copyFrom(A1)
#define INIT_BINARY(A1, A2, B) \
  INIT_UNARY(A1, A2);          \
  Tensor B(height, width);     \
  B.randomizeUniform()
#define INIT_TERNARY(A1, A2, B, C) \
  INIT_BINARY(A1, A2, B);          \
  Tensor C(height, width);         \
  C.randomizeUniform()
#define INIT_QUATERNARY(A1, A2, B, C, D) \
  INIT_TERNARY(A1, A2, B, C);            \
  Tensor D(height, width);               \
  D.randomizeUniform()
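To make the macros concrete, a typical test body uses them like the sketch below (it mirrors testTensorAddScalar further down; runOneScalarCase is a hypothetical name, not a function in this file):

template <typename Tensor>
void runOneScalarCase(int height, int width) {
  INIT_UNARY(A1, A2);        // Tensor A1, A2 with identical random contents
  A1.add(2.5);               // BaseMatrix API: a += p
  A2 += 2.5;                 // the equivalent tensor expression
  TensorCheckEqual(A1, A2);  // both code paths must agree element-wise
}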
template <typename Tensor>
struct TestUnaryMatrix {
  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
...
@@ -59,7 +59,7 @@ struct TestUnaryMatrix {
  }
};

template <typename Tensor>
struct TestBinaryMatrix {
  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B)> BinaryFunc;
...
@@ -74,10 +74,10 @@ struct TestBinaryMatrix {
  }
};

template <typename Tensor>
struct TestTernaryMatrix {
  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C)>
      TernaryFunc;

  explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) {
    for (auto height : {1, 11, 73, 128, 200, 330}) {
...
@@ -90,10 +90,11 @@ struct TestTernaryMatrix {
  }
};

template <typename Tensor>
struct TestQuaternaryMatrix {
  typedef std::function<void(
      Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)>
      QuaternaryFunc;

  explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) {
    for (auto height : {1, 11, 73, 128, 200, 330}) {
...
@@ -106,7 +107,7 @@ struct TestQuaternaryMatrix {
  }
};

template <typename Tensor, class T>
struct TestUnaryVectorT {
  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
...
@@ -142,11 +143,11 @@ void SetTensorValue(Matrix& matrix, real value) {
  }
}

template <typename Tensor>
void testTensorAddScalar(Tensor& A1, Tensor& A2) {
  real p1 = 2.5;
  real p2 = 3.0;
  A1.add(p1);  // a += p
  A2 += p1;
  TensorCheckEqual(A1, A2);
...
@@ -155,7 +156,7 @@ void testTensorAddScalar(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSubScalar(Tensor& A1, Tensor& A2) {
  real p = 2.5;
  A1.subScalar(p);  // a -= p
...
@@ -163,7 +164,7 @@ void testTensorSubScalar(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorMulScalar(Tensor& A1, Tensor& A2) {
  real p = 2.5;
  A1.mulScalar(p);  // a *= p
...
@@ -177,7 +178,7 @@ void testTensorMulScalar(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorDivScalar(Tensor& A1, Tensor& A2) {
  real p = 2.5;
  A1.divScalar(p);  // a /= p
...
@@ -185,44 +186,44 @@ void testTensorDivScalar(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorNeg(Tensor& A1, Tensor& A2) {
  A1.neg();  // a = -a
  A2 = -A2;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorAbs(Tensor& A1, Tensor& A2) {
  A1.abs2();  // a = a > 0 ? a : -a
  A2 = A2.abs();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSquare(Tensor& A1, Tensor& A2) {
  A1.square2();  // a = a * a
  A2 = A2.square();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorReciprocal(Tensor& A1, Tensor& A2) {
  A1.reciprocal2();  // a = 1.0f / a
  A2 = A2.reciprocal();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSign(Tensor& A1, Tensor& A2) {
  A1.sign2();  // a = (a > 0) - (a < 0)
  A2 = A2.sign();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorAssign(Tensor& A1, Tensor& A2) {
  A1.assign(1.5);  // a = p
  A2 = A2.constant(1.5);
  TensorCheckEqual(A1, A2);
...
@@ -235,7 +236,7 @@ void testTensorAssign(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testUnaryBaseOp(Tensor& A1, Tensor& A2) {
  testTensorAddScalar(A1, A2);
  testTensorSubScalar(A1, A2);
...
@@ -249,9 +250,9 @@ void testUnaryBaseOp(Tensor& A1, Tensor& A2) {
  testTensorAssign(A1, A2);
}
template <typename Tensor>
void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) {
  A1.add(2);  // a += p
  A2 += 2;
  TensorCheckEqual(A1, A2);
...
@@ -266,46 +267,46 @@ void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) {
TEST(Unary, BaseOp) {
  TestUnaryMatrix<CpuMatrix> testCpuMatrix(testUnaryBaseOp<CpuMatrix>);
  TestUnaryVectorT<CpuVector, real> testCpuVector(testUnaryBaseOp<CpuVector>);
  TestUnaryVectorT<CpuIVector, int> testCpuIVector(
      testUnaryBaseOpInt<CpuIVector>);

#ifndef PADDLE_ONLY_CPU
  TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>);
  TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>);
  TestUnaryVectorT<GpuIVector, int> testGpuIVector(
      testUnaryBaseOpInt<GpuIVector>);
#endif
}
template <typename Tensor>
void testTensorExp(Tensor& A1, Tensor& A2) {
  A1.exp2();  // a = exp(a)
  A2 = A2.exp();
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorLog(Tensor& A1, Tensor& A2) {
  A1.log2();  // a = log(a)
  A2 = A2.log();
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSqrt(Tensor& A1, Tensor& A2) {
  A1.sqrt2();  // a = sqrt(a)
  A2 = A2.sqrt();
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorPow(Tensor& A1, Tensor& A2) {
  A1.pow2(3.2);  // a = pow(a, p)
  A2 = A2.pow(3.2);
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testUnayrMathOp(Tensor& A1, Tensor& A2) {
  testTensorExp(A1, A2);
  testTensorLog(A1, A2);
...
@@ -321,7 +322,7 @@ TEST(Unary, MathOp) {
#endif
}

template <typename Tensor>
void testTensorClip(Tensor& A1, Tensor& A2) {
  real p1 = 0.003f;
  real p2 = 0.877f;
...
@@ -331,7 +332,7 @@ void testTensorClip(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) {
  real p = 0.5f;
  A1.biggerThanScalar(p);  // a = a > p ? 1.0f : 0.0f
...
@@ -339,7 +340,7 @@ void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorapplyL1(Tensor& A1, Tensor& A2) {
  /**
   * T lambda = p;
...
@@ -351,14 +352,15 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2) {
  real learningRate = 0.7f;
  real decayRate = 0.6f;
  A1.applyL1(learningRate, decayRate);
  A2 = (A2 > (learningRate * decayRate))
           .condition((A2 - (learningRate * decayRate)),
                      (A2 < -(learningRate * decayRate))
                          .condition((A2 + (learningRate * decayRate)),
                                     (real)0.0));
  TensorCheckEqual(A1, A2);
}
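The expected value built for applyL1 above is the usual soft-thresholding (shrinkage) operator with lambda = learningRate * decayRate, matching the condition chain in the code:

a \leftarrow \operatorname{sign}(a)\,\max(|a|-\lambda,\,0)
  = \begin{cases} a-\lambda, & a > \lambda \\ a+\lambda, & a < -\lambda \\ 0, & \text{otherwise.} \end{cases}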
template <typename Tensor>
void testUnayrCompareOp(Tensor& A1, Tensor& A2) {
  testTensorClip(A1, A2);
  testTensorBiggerThanScalar(A1, A2);
...
@@ -377,7 +379,7 @@ TEST(Unary, CompareOp) {
#endif
}

template <typename Tensor>
void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) {
  real p1 = 2.5;
  real p2 = 3.2;
...
@@ -406,7 +408,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) {
  real p = 2.5;
  A1.sub(B);  // a -= b
...
@@ -422,7 +424,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) {
  real p = 2.5;
  A1.mulScalar(B, p);  // a = b * p
...
@@ -442,7 +444,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) {
  real p = 2.5;
  A1.divScalar(B, p);  // a = b / p
...
@@ -454,28 +456,28 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.assign(B);  // a = b
  A2 = B;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) {
  B.square2(A1);  // b = a * a
  A2 = B.square();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.squareDerivative(B);  // a *= 2.0 * b
  A2 = A2 * (real)2.0 * B;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) {
  B.reciprocal2(A1);  // b = 1.0f / a
  A2 = B.reciprocal();
...
@@ -490,33 +492,33 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) {
  real learningRate = 0.7f;
  real decayRate = 1.2f;
  A1.applyL2(B, learningRate, decayRate);  // a *= (1.0f / (1.0f + p * b))
  A2 *= (B.constant(1.0f) + B.constant(learningRate * decayRate) * B)
            .reciprocal();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.reciprocalDerivative(B);  // a *= -b * b
  A2 *= (-B) * B;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) {
  B.sign2(A1);  // b = a > 0.0f ? 1.0f : -1.0f
  A2 = B.sign();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) {
  B.abs2(A1);  // b = a > 0.0f ? a : -a
  A2 = B.abs();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) {
  testTensorAdd(A1, A2, B);
  testTensorSub(A1, A2, B);
...
@@ -539,7 +541,7 @@ TEST(Binary, BaseOp) {
#endif
}

template <typename Tensor>
void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) {
  // a = exp(b)
  A1.exp2(B);
...
@@ -547,14 +549,14 @@ void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.expDerivative(B);  // a *= b
  A2 *= B;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) {
  // a = log(b)
  A1.log2(B);
...
@@ -562,7 +564,7 @@ void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
  // a = sqrt(b)
  A1.sqrt2(B);
...
@@ -570,7 +572,7 @@ void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
  // a = 1.0f / sqrt(b)
  A1.invSqrt(B);
...
@@ -578,14 +580,14 @@ void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.pow2(B, 2.5f);  // a = pow(b, p)
  A2 = B.pow(2.5f);
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) {
  /*
   * const T THRESHOLD = 40.0;
...
@@ -597,12 +599,14 @@ void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) {
  real THRESHOLD = 40.0;
  A2 = (B.constant(1.0f) +
        (B > THRESHOLD)
            .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))
            .exp())
           .log();
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  /*
   * const T THRESHOLD = 40.0;
...
@@ -612,14 +616,16 @@ void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
   */
  A1.softreluDerivative(B);
  real THRESHOLD = 40.0;
  A2 = A2 * (B.constant(1.0f) -
             (B.constant(-1.0f) *
              (B > THRESHOLD)
                  .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)))
                 .exp());
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) {
  /*
  const T THRESHOLD_MIN = -40.0;
...
@@ -632,46 +638,47 @@ void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) {
  const real THRESHOLD_MIN = -40.0;
  const real THRESHOLD_MAX = 13.0;
  auto tmp = (B < THRESHOLD_MIN)
                 .condition(THRESHOLD_MIN,
                            (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B));
  A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal();
  TensorCheckErr(A1, A2);
}
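The reference expression for testTensorSigmoid is a numerically clipped logistic function: the input is clamped to [THRESHOLD_MIN, THRESHOLD_MAX] = [-40, 13] before exponentiation, i.e.

\sigma(b) = \frac{1}{1 + e^{-\operatorname{clip}(b,\,-40,\,13)}}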
template <typename Tensor>
void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.sigmoidDerivative(B);  // a *= b * (1 - b)
  A2 *= B * (B.constant(1.0f) - B);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) {
  B.tanh(A1);  // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
  A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f;
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.tanhDerivative(B);  // a *= 1 - b * b
  A2 *= B.constant(1.0f) - B * B;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) {
  real p1 = 2.5;
  real p2 = 3.1;
  // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)
  B.scaledTanh(A1, p1, p2);
  A2 = B.constant(p1) *
       (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) -
        (real)1.0);
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  real p1 = 2.5;
  real p2 = 3.1;
...
@@ -681,7 +688,7 @@ void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) {
  testTensorTanhDerivative(A1, A2, B);
  testTensorScaledTanhDerivative(A1, A2, B);
...
@@ -708,21 +715,21 @@ TEST(Binary, MathOp) {
#endif
}

template <typename Tensor>
void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) {
  B.relu(A1);  // b = a > 0.0f ? a : 0.0f
  A2 = (B > (real)0.0f).condition(B, (real)0.0f);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.reluDerivative(B);  // a *= (b > 0.0f ? 1.0f : 0.0f)
  A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) {
  /*
   * b = a > p1 ? a : p1
...
@@ -736,7 +743,7 @@ void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  SetTensorValue(B, 32.0f);
  /*
...
@@ -748,15 +755,15 @@ void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.absDerivative(B);  // a = (b > 0) ? a : (b < 0) ? -a : 0
  A2 = (B > (real)0.0f)
           .condition(A2, (B < (real)0.0f).condition(-A2, (real)0.0f));
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) {
  real p = 0.613;
  SetTensorValue(B, p);
...
@@ -765,7 +772,7 @@ void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) {
  /**
   * T lambda = p * b;
...
@@ -778,12 +785,13 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) {
  real decayRate = 0.6f;
  A1.applyL1(B, learningRate, decayRate);
  auto lambda = B.constant(learningRate * decayRate) * B;
  A2 = (A2 > lambda)
           .condition((A2 - lambda),
                      (A2 < -lambda).condition((A2 + lambda), (real)0.0f));
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) {
  B.subScalar(0.5f);
  SetTensorValue(B, 0.0f);
...
@@ -807,7 +815,7 @@ TEST(Binary, CompareOp) {
#endif
}

template <typename Tensor>
void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.add(B, C);  // a = b + c
  A2 = B + C;
...
@@ -833,7 +841,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.sub(B, C);  // a = b - c
  A2 = B - C;
...
@@ -846,7 +854,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.dotMul(B, C);  // a = b * c
  A2 = B * C;
...
@@ -892,7 +900,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.dotDiv(B, C);  // a = (b == 0.0) ? 0.0 : b / c
  A2 = (B == (real)0.0).condition((real)0.0, B / C);
...
@@ -905,7 +913,7 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  real p1 = 1.5;
  real p2 = 2.5;
...
@@ -915,14 +923,14 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.softCrossEntropy(B, C);  // a = -c * log(b) - (1 - c) * log(1 - b)
  A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log();
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSoftCrossEntropyBp(Tensor& A1,
                                  Tensor& A2,
                                  Tensor& B,
...
@@ -932,7 +940,7 @@ void testTensorSoftCrossEntropyBp(Tensor& A1,
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  testTensorAdd(A1, A2, B, C);
  testTensorSub(A1, A2, B, C);
...
@@ -952,30 +960,30 @@ TEST(Ternary, BaseOp) {
#endif
}

template <typename Tensor>
void testTensorBinaryLabelCrossEntropy(Tensor& A1,
                                       Tensor& A2,
                                       Tensor& B,
                                       Tensor& C) {
  A1.binaryLabelCrossEntropy(B, C);  // a = c > 0.5 ? -log(b) : -log(1.0 - b)
  A2 = (C > (real)0.5).condition(-(B.log()),
                                 -((B.constant(1.0f) - B).log()));
  TensorCheckErr(A1, A2);
}
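The reference built for binaryLabelCrossEntropy matches the comment above, with c acting as a hard 0/1 label:

a = \begin{cases} -\log b, & c > 0.5 \\ -\log(1-b), & c \le 0.5. \end{cases}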
template <typename Tensor>
void testTensorBinaryLabelCrossEntropyBp(Tensor& A1,
                                         Tensor& A2,
                                         Tensor& B,
                                         Tensor& C) {
  // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b)
  A1.binaryLabelCrossEntropyBp(B, C);
  A2 += (C > (real)0.5)
            .condition((B.constant(-1.0f) / B),
                       (B.constant(1.0f) - B).reciprocal());
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorLogisticRegressionLoss(Tensor& A1,
                                      Tensor& A2,
                                      Tensor& B,
...
@@ -991,13 +999,14 @@ void testTensorLogisticRegressionLoss(Tensor& A1,
   */
  A1.logisticRegressionLoss(B, C);
  real THRESHOLD = 40.0;
  auto tmp =
      (B > THRESHOLD)
          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
  A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp;
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorLogisticRegressionLossBp(Tensor& A1,
                                        Tensor& A2,
                                        Tensor& B,
...
@@ -1013,28 +1022,29 @@ void testTensorLogisticRegressionLossBp(Tensor& A1,
   */
  A1.logisticRegressionLossBp(B, C);
  real THRESHOLD = 40.0;
  auto tmp =
      (B > THRESHOLD)
          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
  auto tmp2 = tmp.exp();
  A2 = tmp2 / (C.constant(1.0) + tmp2) - C;
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.biggerThan(B, C);  // a = (b > c) ? 1.0f : 0.0f
  A2 = (B > C).condition((real)1.0f, (real)0.0f);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.max2(B, C);  // a = (b > c) ? b : c
  A2 = (B > C).condition(B, C);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C);
  testTensorBinaryLabelCrossEntropy(A1, A2, B, C);
...
@@ -1053,12 +1063,9 @@ TEST(Ternary, CompareOp) {
#endif
}

template <typename Tensor>
void testQuaternaryAdd(
    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
  // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f);  // a = p1 * b + p2 * c + p3 * d
  // A2 = B * 1.5f + C * 2.5f + D * 3.5f;
  // TensorCheckEqual(A1, A2);
...
@@ -1084,25 +1091,19 @@ TEST(Quaternary, BaseOp) {
#endif
}

template <typename Tensor>
void testTensorBiggerThan(
    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
  // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
  A1.biggerThan(B, C, D);
  A2 = ((B > C && D > (real)0.5) || (B < C && D < (real)0.5))
           .condition((real)1.0, (real)0.0);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorRankLoss(
    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
  /**
   * const T THRESHOLD = 40.0; a = b - c;
   * a = (a > THRESHOLD)
...
@@ -1114,19 +1115,17 @@ void testTensorRankLoss(Tensor& A1,
  real THRESHOLD = 40.0;
  auto tmp = B - C;
  auto tmp2 =
      (tmp > THRESHOLD)
          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
  A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D;

  TensorCheckErr(A1, A2);
}
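With t = clip(b - c, -40, 40), the rank-loss reference above is the pairwise logistic loss; its derivative, exercised by testTensorRankLossBp just below, is e^t / (1 + e^t) - d:

a = \log\bigl(1 + e^{t}\bigr) - t\, d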
template <typename Tensor>
void testTensorRankLossBp(
    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
  /**
   * const T THRESHOLD = 40.0; a = b - c;
   * a = (a > THRESHOLD)
...
@@ -1137,20 +1136,18 @@ void testTensorRankLossBp(Tensor& A1,
  A1.rankLossBp(B, C, D);
  real THRESHOLD = 40.0;
  auto tmp = B - C;
  auto tmp2 =
      (tmp > THRESHOLD)
          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
  auto tmp3 = tmp2.exp();
  A2 = tmp3 / (D.constant(1.0f) + tmp3) - D;
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testQuaternaryCompareOp(
    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
  testTensorBiggerThan(A1, A2, B, C, D);
  testTensorRankLoss(A1, A2, B, C, D);
  testTensorRankLossBp(A1, A2, B, C, D);
...
paddle/math/tests/test_lazyAssign.cu
View file @ 1d4fa243
...
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
...
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include <gtest/gtest.h>
#include <gtest/gtest.h>
#include "PerfUtils.h"
#include "TensorCheck.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/TensorAssign.h"
#include "paddle/math/TensorAssign.h"
#include "TensorCheck.h"
#include "PerfUtils.h"
using paddle::BaseMatrix;
using paddle::CpuMatrix;
...
@@ -27,14 +27,28 @@ using autotest::TensorCheckErr;

typedef std::function<void(int height, int width)> testMatrixFunc;

void testMatrixCase(testMatrixFunc matrixFunc) {
  for (auto height : {1}) {
    for (auto width : {1,
                       32,
                       64,
                       128,
                       512,
                       1024,
                       4096,
                       32768,
                       65536,
                       131072,
                       262144,
                       524288,
                       1048576,
                       2097152,
                       4194304,
                       8388608}) {
      matrixFunc(height, width);
    }
  }
}
template <typename Tensor>
void testLazyAssign(int height, int width) {
  Tensor A1(height, width);
  Tensor A2(height, width);
...
@@ -49,40 +63,39 @@ void testLazyAssign(int height, int width) {
  EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;);

  EXPRESSION_PERFORMANCE(auto expr1 = A2.lazyAssign(B + C);
                         auto expr2 = A2.lazyAssign(A2 * D);
                         AssignEvaluate(expr1, expr2););

  TensorCheckErr(A1, A2);
}

TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign<CpuMatrix>); }
#ifndef PADDLE_ONLY_CPU
TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign<GpuMatrix>); }
#endif
template <typename Tensor>
void sgdUpdateTensor(
    Tensor& A, Tensor& B, Tensor& C, Tensor& D, real p1, real p2, real p3) {
  C = C * p2 - D * (B + A * p3) * p1;
  A += C;
}
void sgdUpdateLazyAssign(BaseMatrix& A,
                         BaseMatrix& B,
                         BaseMatrix& C,
                         BaseMatrix& D,
                         real p1,
                         real p2,
                         real p3) {
  auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1);
  auto expr2 = A.lazyAssign(A + C);
  AssignEvaluate(expr1, expr2);
}
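For orientation, the update both helpers implement can be read as a momentum SGD step. One plausible reading, not stated in the diff, is that A is the parameter value, B the gradient, C the momentum buffer, D a per-element learning-rate scale, and p1/p2/p3 the learning rate, momentum and decay; under that reading the sketch is:

c \leftarrow p_2\, c \;-\; p_1\, d \odot \bigl(b + p_3\, a\bigr), \qquad a \leftarrow a + c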
template <typename Tensor>
void testSgdUpdate(int height, int width) {
  Tensor A1(height, width);
  Tensor A2(height, width);
...
@@ -113,16 +126,13 @@ void testSgdUpdate(int height, int width) {
   * a = a + c;
   */
  // BaseMatrix API
  EXPRESSION_PERFORMANCE(A1.sgdUpdate(B, C1, D, p1, p2, p3););

  // Tensor expression
  EXPRESSION_PERFORMANCE(sgdUpdateTensor(A2, B, C2, D, p1, p2, p3));

  // lazyAssign
  EXPRESSION_PERFORMANCE(sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3));

  TensorCheckErr(A1, A2);
  TensorCheckErr(A1, A3);
...
@@ -130,12 +140,8 @@ void testSgdUpdate(int height, int width) {
  TensorCheckErr(C1, C3);
}
TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate<CpuMatrix>); }

#ifndef PADDLE_ONLY_CPU
TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate<GpuMatrix>); }
#endif
paddle/operators/softmax_op.cu
...
@@ -3,4 +3,5 @@
#include "paddle/operators/softmax_op.h"

REGISTER_OP_GPU_KERNEL(softmax, ops::SoftmaxKernel<ops::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(softmax_grad,
                       ops::SoftmaxGradKernel<ops::GPUPlace, float>);
paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto
(This file type cannot be previewed.)
proto/DataConfig.proto
...
@@ -15,14 +15,13 @@ syntax = "proto2";
package paddle;

message FileGroupConf {
  optional uint32 queue_capacity = 1 [ default = 1 ];
  // how many files to load for a load file thread
  optional int32 load_file_count = 2 [ default = 1 ];
  // how many threads to load files
  // Setting to be 5~10 is appropriate when loading files by hadoop vfs
  optional int32 load_thread_num = 3 [ default = 1 ];
};

message DataConfig {
...
@@ -32,26 +31,28 @@ message DataConfig {
  // name of a text file which contains a list of file names at each line
  optional string files = 3;

  optional int32 feat_dim = 4;         // feature dimension of one frame
  repeated int32 slot_dims = 5;        // feature slot dims
  optional int32 context_len = 6;      // max neibour frame numbers
  optional uint64 buffer_capacity = 7; // the number of samples

  // part of data used in training
  // if not -1, part of train data is used in training
  optional int64 train_sample_num = 8 [ default = -1 ];

  // The number of documents processed once
  optional int32 file_load_num = 9 [ default = -1 ];
  optional bool async_load_data = 12 [ default = false ];
  /// Note the field number 10, 11 and 13 have been deprecated.
  optional bool for_test = 14 [ default = false ]; // whether this data is for test
  optional FileGroupConf file_group_conf = 15;
  repeated int32 float_slot_dims = 16;

  /// Note the field number 17, 18 and 19 have been deprecated.
  // a list of values which will be used to create additional one dimensional
  // float
  // values slots. These one dimensional slots can be used as the weight input
  // for cost layers.
  // Currently this is only supported by ProtoDataProvider.
...
@@ -65,21 +66,21 @@ message DataConfig {
  // for MultiDataProvider
  repeated DataConfig sub_data_configs = 24; // sub dataproviders
  /*
   * the ratio of each sub dataproviders:
   * e.g. sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100,
   * then each mini-batch is combined by 10 instance from A and 90 instances
   * from B.
   */
  optional int32 data_ratio = 25;
  /*
   * if one of the sub dataproviders is running out of data, then
   * (1) it is "main data", then finish current pass.
   * (2) it is not "main data", then reset it, and try getNextBatch again.
   */
  optional bool is_main_data = 26 [ default = true ];

  // the usage ratio of instances. Setting to 1.0 means the use of all
  // instances.
  optional double usage_ratio = 27 [ default = 1.0 ];
};
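As a rough illustration of how these fields fit together, here is a hypothetical DataConfig fragment in protobuf text format. The field names come from the message above; every value (including the list file name) is made up for the example, and fields elided from this diff are simply omitted.

# Hypothetical DataConfig fragment (text format); values are illustrative only.
files: "train.list"        # one data file name per line
buffer_capacity: 10000     # number of samples to buffer
async_load_data: true
train_sample_num: -1       # -1: use all training data
for_test: false
usage_ratio: 1.0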
proto/DataFormat.proto
...
@@ -17,27 +17,32 @@ package paddle;
/*
 If values is not empty and ids is empty, this is a dense vector.
 If values is not empty and ids is not empty, this is a sparse vector. The
 position of each value is specified by ids.
 If values is empty and ids is not empty, this is a sparse vector whose non-zero
 values are 1. The position of each 1 is specified by ids.
*/
message VectorSlot {
  repeated float values = 1 [ packed = true ];
  repeated uint32 ids = 2 [ packed = true ];
  /* For multidimensional data, for example "image width height depth" */
  repeated uint32 dims = 3 [ packed = true ];
  repeated string strs = 4;
};

/*
 SubseqSlot use to record whether VectorSlot or any other slot in future has
 subseq.
 If not all VectorSlot have subseq, we only store the one who has subseq, and
 use *slot_id* to record it.
 One vector_slots has one sequence, and it may have N subseq, thus the number of
 *lens* will be N too.
*/
message SubseqSlot {
  required uint32 slot_id = 1; // the id of slot who has subseq
  repeated uint32 lens = 2;    // lengths of sub-sequence in the slot
};

message SlotDef {
...
@@ -45,13 +50,14 @@ message SlotDef {
    VECTOR_DENSE = 0;
    VECTOR_SPARSE_NON_VALUE = 1;
    VECTOR_SPARSE_VALUE = 2;
    INDEX = 3; // This can be used as label, or word id, etc.
    VAR_MDIM_DENSE = 4;
    VAR_MDIM_INDEX = 5;
    STRING = 6;
  }
  required SlotType type = 1;
  required uint32 dim = 2; // For INDEX slots, this means the maximal index plus 1.
};

message DataHeader {
...
@@ -60,11 +66,11 @@ message DataHeader {
};

message DataSample {
  optional bool is_beginning = 1 [ default = true ]; // is the beginning of a sequence
  repeated VectorSlot vector_slots = 2;
  repeated uint32 id_slots = 3 [ packed = true ];
  /* use ids of VectorSlot */
  repeated VectorSlot var_id_slots = 4;
  repeated SubseqSlot subseq_slots = 5;
};
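To make the dense/sparse distinction described in the comment above concrete, here is a hypothetical DataSample fragment in protobuf text format showing the three VectorSlot shapes; all numbers are invented for the example.

# Hypothetical DataSample fragments (text format); values are illustrative only.
# Dense vector: values set, ids empty.
vector_slots { values: 0.5 values: 1.25 values: -3.0 }
# Sparse vector with explicit values: value i sits at position ids[i].
vector_slots { values: 0.5 values: 2.0 ids: 3 ids: 17 }
# Sparse non-value vector: implicit 1s at the listed positions.
vector_slots { ids: 4 ids: 9 ids: 21 }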
proto/ModelConfig.proto
...
@@ -21,7 +21,6 @@ package paddle;
 * Various structs for the configuration of a neural network
 */

message ExternalConfig {
  repeated string layer_names = 1;
  repeated string input_layer_names = 2;
...
@@ -68,7 +67,7 @@ message ConvConfig {
  required uint32 img_size = 8;

  // caffe mode for output size coherence
  required bool caffe_mode = 9 [ default = true ];

  // if filter_size_y is set , this convolutional layer will use
  // filters of size filter_size * filter_size_y pixels.
...
@@ -99,7 +98,7 @@ message PoolConfig {
  optional uint32 start = 4;

  // Defines the stride size between successive pooling squares.
  required uint32 stride = 5 [ default = 1 ];

  // The size of output feature map.
  required uint32 output_x = 6;
...
@@ -109,7 +108,7 @@ message PoolConfig {
  // padding = 4, instructs the net to implicitly
  // pad the images with a 4-pixel border of zeros.
  optional uint32 padding = 8 [ default = 0 ];

  // if not set, use size_x
  optional uint32 size_y = 9;
...
@@ -194,9 +193,7 @@ message MaxOutConfig {
  required uint32 groups = 2;
}

message RowConvConfig { required uint32 context_length = 1; }

message SliceConfig {
  required uint32 start = 1;
...
@@ -212,14 +209,14 @@ message ProjectionConfig {
  // For ShiftProjection
  optional int32 context_start = 5;
  optional int32 context_length = 6;
  optional bool trainable_padding = 7 [ default = false ];

  // For convolution
  optional ConvConfig conv_conf = 8;
  optional int32 num_filters = 9;

  // For IdentityOffsetProjection
  optional uint64 offset = 11 [ default = 0 ];

  // For pool
  optional PoolConfig pool_conf = 12;
...
@@ -236,7 +233,7 @@ message OperatorConfig {
  required uint64 output_size = 4;

  // For DotMulOperator
  optional double dotmul_scale = 5 [ default = 1.0 ];

  // For ConvOperator
  optional ConvConfig conv_conf = 6;
...
@@ -282,8 +279,8 @@ message MultiBoxLossConfig {
  required float neg_overlap = 4;
  required uint32 background_id = 5;
  required uint32 input_num = 6;
  optional uint32 height = 7 [ default = 1 ];
  optional uint32 width = 8 [ default = 1 ];
}

message DetectionOutputConfig {
...
@@ -294,8 +291,8 @@ message DetectionOutputConfig {
  required uint32 input_num = 5;
  required uint32 keep_top_k = 6;
  required float confidence_threshold = 7;
  optional uint32 height = 8 [ default = 1 ];
  optional uint32 width = 9 [ default = 1 ];
}

message ClipConfig {
...
@@ -331,7 +328,7 @@ message LayerConfig {
  required string name = 1;
  required string type = 2;
  optional uint64 size = 3;
  // optional ActivationConfig activation = 4;
  optional string active_type = 4;
  repeated LayerInputConfig inputs = 5;
  optional string bias_parameter_name = 6;
...
@@ -344,7 +341,7 @@ message LayerConfig {
  // (which is how convnets are usually trained). Setting this to
  // false will untie the biases, yielding a separate bias for
  // every location at which the filter is applied.
  optional bool shared_biases = 8 [ default = false ];

  // Valid values are ones that divide the area of the output
  // grid in this convolutional layer. For example if this layer
...
@@ -362,33 +359,35 @@ message LayerConfig {
  // the gpu device which the Layer's data in.
  // Only used by ParallelNeuralNetork. Ignored otherwise.
  optional int32 device = 12 [ default = -1 ];

  // for recurrent layer. If true, the recurrence runs from the end to the
  // beginning.
  optional bool reversed = 13 [ default = false ];

  // for lstmemory layer. Different types of nodes have different activation
  // type.
  optional string active_gate_type = 14;
  optional string active_state_type = 15;

  // For NCELayer
  // The number of random negative labels for each sample
  optional int32 num_neg_samples = 16 [ default = 10 ];

  // For NCELayer
  // The distribution for generating the random negative labels.
  // A uniform distribution will be used if not provided
  repeated double neg_sampling_dist = 17 [ packed = true ];

  // For MaxLayer
  // default: output VALUE of MaxLayer. set this flag to true for output INDEX
  // INDEX will be put in Argument::value as double values.
  optional bool output_max_index = 19 [ default = false ];

  /// The filed number 20 have been deprecated.

  // For self-normalized estimation
  optional double softmax_selfnorm_alpha = 21 [ default = 0.1 ];

  /// The filed numbers 22 and 23 have been deprecated.
...
@@ -399,14 +398,14 @@ message LayerConfig {
  optional bool norm_by_times = 25;

  // for CostLayers
  optional double coeff = 26 [ default = 1.0 ];

  // for AverageLayer
  // can be set to: 'average', 'sum' or 'squarerootn'
  optional string average_strategy = 27;

  // for error clipping
  optional double error_clipping_threshold = 28 [ default = 0.0 ];

  // for operators used by mixed layer
  repeated OperatorConfig operator_confs = 29;
...
@@ -434,43 +433,44 @@ message LayerConfig {
  optional uint32 beam_size = 39;

  // for seqlastins layer, whether select first instead last
  optional bool select_first = 40 [ default = false ];

  // for seqlastins layer, AverageLayer, MaxLayer and ExpandLayer
  // can be set to: 'non-seq','seq'
  optional string trans_type = 41 [ default = 'non-seq' ];

  // to indicate whether selective_fc layer
  // is used in sequence generation or not
  optional bool selective_fc_pass_generation = 42 [ default = false ];

  // to indicate whether selective_fc layer take its last input to
  // selected several columns and only compute the multiplications
  // between the input matrices and the selected columns of
  // the parameter matrices of this layer.
  // if set false, selective_fc degrades into fc.
  optional bool has_selected_colums = 43 [ default = true ];

  // this parameter is for speed consideration.
  // if number of the selected columns is less than
  // sample number * selective_fc output size * selective_fc_mull_mull_ratio
  // sparse multiplication is used, otherwise, using full multiplication.
  optional double selective_fc_full_mul_ratio = 44 [ default = 0.02 ];

  // to indicate how many threads selective_fc use to to accelate
  // the plain_mul period
  // leave empty or set to 0 to disable multi-thread accleleration
  optional uint32 selective_fc_parallel_plain_mul_thread_num = 45
      [ default = 0 ];

  // for batch normalization layer
  // if set use_global_stats true, will use the loaded mean and variance.
  optional bool use_global_stats = 46;

  // use to compute moving mean and variance.
  optional double moving_average_fraction = 47 [ default = 0.9 ];

  // bias size
  optional uint32 bias_size = 48 [ default = 0 ];

  // this parameter can be used as a user-defined parameter when necessary,
  // without changing the proto file.
...
@@ -485,18 +485,17 @@ message LayerConfig {
  optional uint64 width = 51;

  // blank label used in ctc loss
  optional uint32 blank = 52 [ default = 0 ];

  // stride parameter for seqlastins layer, AverageLayer, MaxLayer, which
  // controls the scope of pooling operation. can be set > 0.
  // leave empty or set to -1 to disable this stride pooling.
  optional int32 seq_pool_stride = 53 [ default = -1 ];

  // for crop layer
  optional int32 axis = 54 [ default = 2 ];
  repeated uint32 offset = 55;
  repeated uint32 shape = 56;
}
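To show roughly what a populated layer description looks like, here is a hypothetical LayerConfig fragment in protobuf text format. Only fields that appear in the diff above are used; the layer name, type string and numbers are invented for the example and are not taken from this commit.

# Hypothetical LayerConfig fragment (text format); values are illustrative only.
name: "hidden1"
type: "fc"
size: 128
active_type: "tanh"
bias_parameter_name: "hidden1.bias"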
message EvaluatorConfig {
...
@@ -512,9 +511,9 @@ message EvaluatorConfig {
  // Used by PrecisionRecallEvaluator and ClassificationErrorEvaluator
  // For multi binary labels: true if output > classification_threshold
  optional double classification_threshold = 6 [ default = 0.5 ];
  // The positive label. -1 means average precision and recall
  optional int32 positive_label = 7 [ default = -1 ];

  // load dict from this file
  optional string dict_file = 8;
...
@@ -523,10 +522,10 @@ message EvaluatorConfig {
  optional string result_file = 9;

  // top # results for max id printer
  optional int32 num_results = 10 [ default = 1 ];

  // whether to delimit the sequence in the seq_text_printer
  optional bool delimited = 11 [ default = true ];

  // Used by ChunkEvaluator
  // chunk of these types are not counted
...
@@ -534,23 +533,23 @@ message EvaluatorConfig {
  // Used by ClassificationErrorEvaluator
  // top # classification error
  optional int32 top_k = 13 [ default = 1 ];

  // Used by DetectionMAPEvaluator
  optional double overlap_threshold = 14 [ default = 0.5 ];
  optional int32 background_id = 15 [ default = 0 ];
  optional bool evaluate_difficult = 16 [ default = false ];
  optional string ap_type = 17 [ default = "11point" ];
}

message LinkConfig {
  required string layer_name = 1;
  required string link_name = 2;

  // If true, this link has sub-sequence
  optional bool has_subseq = 3 [ default = false ];
}

message MemoryConfig {
...
@@ -563,18 +562,18 @@ message MemoryConfig {
  optional uint32 boot_with_const_id = 7;

  // memory is a sequence, initailized by a sequence boot layer
  optional bool is_sequence = 6 [ default = false ];
}

message GeneratorConfig {
  required uint32 max_num_frames = 1;
  required string eos_layer_name = 2;
  optional int32 num_results_per_sample = 3 [ default = 1 ];

  // for beam search
  optional int32 beam_size = 4 [ default = 1 ];

  optional bool log_prob = 5 [ default = true ];
}

message SubModelConfig {
...
@@ -584,10 +583,10 @@ message SubModelConfig {
  repeated string output_layer_names = 4;
  repeated string evaluator_names = 5;

  optional bool is_recurrent_layer_group = 6 [ default = false ];

  // If true, the recurrence runs from the end to the beginning.
  optional bool reversed = 7 [ default = false ];

  // name and link name of memory
  repeated MemoryConfig memories = 8;
...
@@ -601,14 +600,15 @@ message SubModelConfig {
  optional GeneratorConfig generator = 11;

  // the id of inlink which share info with outlinks, used in recurrent layer
  // group
  optional int32 target_inlinkid = 12;
}

message ModelConfig {
  // type of the model.
  // Currently, "nn", "recurrent_nn" and "recursive_nn" are supported
  required string type = 1 [ default = "nn" ];

  // layers should be ordered in such a way that the forward propagation
  // can be correctly executed by going from the first layer to the last layer
...
proto/OptimizerConfig.proto
syntax = "proto2";

option optimize_for = LITE_RUNTIME;

package paddle;
...
@@ -9,13 +9,11 @@ message SGDConfig {
  // momentum: float >= 0. Parameter updates momentum.
  // decay: float >= 0. Learning rate decay over each update.
  // nesterov: boolean. Whether to apply Nesterov momentum.
  optional double momentum = 21 [ default = 0.0 ];
  optional double decay = 23 [ default = 0.0 ];
  optional bool nesterov = 24 [ default = false ];
}

message AdadeltaConfig {
  // Adadelta
  // It is recommended to leave it at the default value.
...
@@ -23,21 +21,23 @@ message AdadeltaConfig {
  // epsilon: float >= 0. Fuzz factor.
  // decay: float >= 0. Learning rate decay over each update.
  // reference : [Adadelta - an adaptive learning rate
  // method](http://arxiv.org/abs/1212.5701)
  optional double rho = 33 [ default = 0.90 ];
  optional double epsilon = 31 [ default = 1e-5 ];
  optional double decay = 32 [ default = 0.0 ];
}

message AdagradConfig {
  // Adagrad
  // epsilon: float >= 0.
  // decay: float >= 0. Learning rate decay over each update.
  // reference : [Adaptive Subgradient Methods for Online Learning and
  // Stochastic
  // Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
  optional double epsilon = 41 [ default = 1e-5 ];
  optional double decay = 42 [ default = 0.0 ];
}

message AdamConfig {
...
@@ -46,7 +46,8 @@ message AdamConfig {
  // beta_2: float, 0 < beta < 1. Generally close to 1.
  // epsilon: float >= 0. Fuzz factor.
  // decay: float >= 0. Learning rate decay over each update.
  // reference : [Adam - A Method for Stochastic
  // Optimization](http://arxiv.org/abs/1412.6980v8)
  optional double beta_1 = 41;
  optional double beta_2 = 42;
  optional double epsilon = 43;
...
@@ -55,32 +56,32 @@ message AdamConfig {
message ConstLrConfig {
  // learninRate Policy
  optional double learning_rate = 1 [ default = 1.0 ];
}

message LinearLrConfig {
  // learninRate Policy
  optional double learning_rate = 1 [ default = 1.0 ];
  optional double lr_decay_a = 2;
  optional double lr_decay_b = 3;
}

message TensorProto {
  enum DataType {
    PADDLE_ELEMENT_TYPE_INT32 = 0;
    PADDLE_ELEMENT_TYPE_UINT32 = 1;
    PADDLE_ELEMENT_TYPE_INT64 = 2;
    PADDLE_ELEMENT_TYPE_UINT64 = 3;
    PADDLE_ELEMENT_TYPE_FLOAT32 = 4;
    PADDLE_ELEMENT_TYPE_FLOAT64 = 5;
  }
  optional DataType data_type = 1;
  repeated bytes content = 2;
}

message LrPolicyState {
  // learninRate Policy
  optional double learning_rate = 1 [ default = 1.0 ];
  optional double lr_decay_a = 2;
  optional double lr_decay_b = 3;
}
...
@@ -104,7 +105,6 @@ message AdadeltaOptimizerState {
  optional TensorProto update_delta = 4;
}

message AdagradOptimizerState {
  optional LrPolicyState lr_state = 101;
  optional double num_sample_passed = 104;
...
@@ -124,10 +124,10 @@ message AdamOptimizerState {
message OptimizerConfig {
  enum Optimizer {
    SGD = 1;
    Adadelta = 2;
    Adagrad = 3;
    Adam = 4;
  }
  optional Optimizer optimizer = 1;
  optional SGDConfig sgd = 3;
...
@@ -136,8 +136,8 @@ message OptimizerConfig {
  optional AdamConfig adam = 6;

  enum LrPolicy {
    Const = 0;
    Linear = 1;
  }
  optional LrPolicy lr_policy = 11;
  optional ConstLrConfig const_lr = 12;
...
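To tie the pieces together, here is a hypothetical OptimizerConfig in protobuf text format selecting the Adam optimizer and a constant learning-rate policy. The enum values and field names come from the messages above; all numeric values are made-up placeholders, not recommendations from this commit.

# Hypothetical OptimizerConfig (text format); numbers are illustrative only.
optimizer: Adam
adam {
  beta_1: 0.9
  beta_2: 0.999
  epsilon: 1e-8
}
lr_policy: Const
const_lr { learning_rate: 0.001 }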
proto/ParameterConfig.proto
...
@@ -27,56 +27,57 @@ enum ParameterInitStrategy {
message ParameterUpdaterHookConfig {
  // hook type such as 'pruning'
  required string type = 1;
  // this represents the ratio of zero element to be set by the Parameter
  optional double sparsity_ratio = 2 [ default = 0.6 ];
}

message ParameterConfig {
  required string name = 1;
  required uint64 size = 2;
  optional double learning_rate = 3 [ default = 1.0 ];
  optional double momentum = 4 [ default = 0.0 ];
  optional double initial_mean = 5 [ default = 0.0 ];
  optional double initial_std = 6 [ default = 0.01 ];
  // use L2-regularization if decay_rate set and decay_rate_l1 not set
  optional double decay_rate = 7 [ default = 0.0 ];
  // use L1-regularization if decay_rate_l1 set
  optional double decay_rate_l1 = 8 [ default = 0.0 ];
  // dims of Parameter, e.g. dims[0] as height, dims[1] as width..
  repeated uint64 dims = 9;
  // the gpu device which the parameter in.
  // Only used by ParallelNeuralNetork. Ignored otherwise.
  optional int32 device = 10 [ default = -1 ];
  // how to init the parameter: 0 -> normal, 1 -> uniform
  // 0: treat initial_mean as mean, intial_std as standard deviation
  // 1: range is (initial_mean - initial_std) to (initial_mean + initial_std)
  optional int32 initial_strategy = 11 [ default = 0 ];
  // define the variance when init the parameter, by height of the Matrix
  optional bool initial_smart = 12 [ default = false ];
  // apply regularization every # batches
  optional int32 num_batches_regularization = 13 [ default = 1 ];
  // if is_sparse is true, para is sparse, else para is dense
  optional bool is_sparse = 14 [ default = false ];
  // if para is sparse, format should be "csc" or "csr", empty means is not
  // sparse
  optional string format = 15 [ default = "" ];
  // sparse remote update or not
  optional bool sparse_remote_update = 16 [ default = false ];
  // gradient clipping threshold, no clipping by default
  optional double gradient_clipping_threshold = 17 [ default = 0.0 ];
  // static parameters are fixed when training
  optional bool is_static = 18 [ default = false ];
  // para_id should NOT be set by config_parser. It is for
  // internal use.
  optional uint64 para_id = 19;

  repeated ParameterUpdaterHookConfig update_hooks = 20;
  // setup load mat -> csr
  optional bool need_compact = 21 [ default = false ];
  // whether to do sparse update for this parameter
  optional bool sparse_update = 22 [ default = false ];

  // whether this parameter is shared or not.
  optional bool is_shared = 23 [ default = false ];
  // parameter block size
  optional uint64 parameter_block_size = 24 [ default = 0 ];
}
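For reference, a hypothetical ParameterConfig in protobuf text format might look like the fragment below; the parameter name and all numbers are invented for the example, using only fields shown in the message above.

# Hypothetical ParameterConfig fragment (text format); values are illustrative only.
name: "fc_layer.w0"
size: 25088           # e.g. 196 x 128
dims: 196
dims: 128
learning_rate: 1.0
initial_strategy: 0   # 0: normal(initial_mean, initial_std)
initial_mean: 0.0
initial_std: 0.01
decay_rate: 0.0005    # L2 regularization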
proto/ParameterServerConfig.proto
...
@@ -15,13 +15,10 @@ syntax = "proto2";
package paddle;

/**
 * Configuration structure for ParameterClient2.
 */
message ParameterClientConfig { required int32 trainer_id = 1; }

/**
 * Configuration structure for ParameterServer2.
...
@@ -30,24 +27,24 @@ message ParameterServerConfig {
  // Number of ports for sending dense parameter,
  // following ports on parameter server will be visited
  // for sending dense parameter: [port, port+ports_num-1]
  required int32 ports_num = 1 [ default = 1 ];
  // Number of ports for sending sparse parameter,
  // following ports on parameter server will be visited
  // for sending sparse parameter:
  // [port+ports_num, port+ports_num+ports_num_for_sparse-1]
  required int32 ports_num_for_sparse = 2 [ default = 0 ];
  // network device name for pservers
  required string nics = 3 [ default = "xgbe0,xgbe1" ];
  required string rdma_tcp = 4 [ default = "tcp" ];
  // Listening port for pserver
  required int32 port = 5 [ default = 20134 ];
  // number of gradient servers
  required int32 num_gradient_servers = 6 [ default = 1 ];
  // number of threads for sync op exec
  required int32 pserver_num_threads = 7 [ default = 1 ];
  // control config_.async_lagged_grad_discard_ratio() min value
  required double async_lagged_ratio_min = 8 [ default = 1.0 ];
  // if async_lagged_grad_discard_ratio is not set in trainer_config.conf
  // use it as defalut value
  required double async_lagged_ratio_default = 9 [ default = 1.5 ];
}
\ No newline at end of file
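Since every field in ParameterServerConfig is required, a usable message has to set them all. A hypothetical example in protobuf text format, with host-specific values such as the NIC name, port and server count invented for illustration:

# Hypothetical ParameterServerConfig (text format); values are illustrative only.
ports_num: 1
ports_num_for_sparse: 1
nics: "eth0"
rdma_tcp: "tcp"
port: 20134
num_gradient_servers: 4
pserver_num_threads: 1
async_lagged_ratio_min: 1.0
async_lagged_ratio_default: 1.5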
proto/ParameterService.proto
...
@@ -23,8 +23,8 @@ package paddle;
 */
enum ParameterUpdateMode {
  // Set parameter
  PSERVER_UPDATE_MODE_SET_PARAM = 0;      // use local param
  PSERVER_UPDATE_MODE_SET_PARAM_ZERO = 1; // set zero param

  // Update parameter once a gradient is received
  PSERVER_UPDATE_MODE_ASYNC_SGD = 2;
...
@@ -37,7 +37,7 @@ enum ParameterUpdateMode {
  // No update. Only get parameters back.
  PSERVER_UPDATE_MODE_GET_PARAM = 5;
  PSERVER_UPDATE_MODE_GET_PARAM_SPARSE = 6; // only get sparse rows
};

message ParameterBlock {
...
@@ -80,42 +80,34 @@ message SendParameterRequest {
  optional int32 trainer_id = 7;

  // send back parameter type on pserver, PARAMETER_VALUE by default
  optional int32 send_back_parameter_type = 8 [ default = 0 ];

  // forwardbackward time in usec
  optional uint64 forwardbackward_time = 9;
}

message WaitPassStartRequest {}

message WaitPassStartResponse {}

message WaitPassFinishRequest {}

message WaitPassFinishResponse {}

enum SyncObject {
  SYNC_DEFAULT = 0; // wait for the synchronizeBarrier_
  SYNC_DATA = 1;    // wait for the synchronizeDataBarrier_
}

message SynchronizeRequest {
  required SyncObject sync_object_id = 1 [ default = SYNC_DEFAULT ];

  optional int32 trainer_id = 2;
}

message SynchronizeResponse {}

message SendParameterResponse { repeated ParameterBlock blocks = 1; }

message SetConfigRequest {
  repeated ParameterConfig param_configs = 1;
...
@@ -125,26 +117,18 @@ message SetConfigRequest {
  required bool is_sparse_server = 6;
}

message SetConfigResponse {}

message GetStatusRequest {}

message GetStatusResponse { required PServerStatus status = 1; }

message SetStatusRequest { required PServerStatus status = 1; }

message SetStatusResponse {}

// create a column vector. The size is the dimension of parameter
message CreateVectorRequest {}

message CreateVectorResponse {
  // error message. Empty if success
...
@@ -153,9 +137,7 @@ message CreateVectorResponse {
  required int64 handle = 2;
}

message ReleaseVectorRequest { required int64 handle = 1; }

message ReleaseVectorResponse {
  // error message. Empty if success
...
@@ -164,9 +146,7 @@ message ReleaseVectorResponse {
// Create a column major matrix. The number of rows is the dimension
// of parameter. The number of columns is specifed by num_cols
message CreateMatrixRequest { required int32 num_cols = 1; }

message CreateMatrixResponse {
  // error message. Empty if success
...
@@ -175,16 +155,13 @@ message CreateMatrixResponse {
  required int64 handle = 2;
}

message ReleaseMatrixRequest { required int64 handle = 1; }

message ReleaseMatrixResponse {
  // error message. Empty if success
  optional string return_message = 1;
}

/**
 * The operations are defined using the variables commented at Operation
 * and OperationResult
...
@@ -245,36 +222,36 @@ enum MatrixVectorOperation {
message ProtoVector {
  required int64 dim = 1;
  repeated double values = 2 [ packed = true ];
}

message ProtoMatrix {
  required int64 num_rows = 1;
  required int64 num_cols = 2;
  repeated double values = 3 [ packed = true ];
}

message Operation {
  required MatrixVectorOperation operation = 1;

  // vector handles created on the pserver
  repeated int64 pvectors = 2; // u, v, w

  // matrix handles created on the pserver
  repeated int64 pmatrices = 3; // A, B, C

  repeated double scalars = 4;       // a, b, c
  repeated ProtoVector vectors = 5;  // x, y, z
  repeated ProtoMatrix matrices = 6; // X, Y, Z
}

message OperationResult {
  // error message. Empty if success
  optional string return_message = 1;
  //
  repeated double scalars = 2;       // d, e, f
  repeated ProtoVector vectors = 3;  // p, q, r
  repeated ProtoMatrix matrices = 4; // P, Q, R
}

message DoOperationRequest {
...
@@ -301,18 +278,14 @@ message DoOperationResponse {
  required bool pass_finish = 3;
}

message LoadValueRequest { required string dir_name = 1; }

message LoadValueResponse {
  // error message. Empty if success
  optional string return_message = 1;
}

message SaveValueRequest { required string dir_name = 1; }

message SaveValueResponse {
  // error message. Empty if success
...
@@ -331,11 +304,11 @@ enum DataUpdateMode {
  // Client send it's own ref label to pserver
  DATA_UPDATE_MODE_SET_REF_LABEL = 4;
  // Client get all ref labels from all pservers
  DATA_UPDATE_MODE_GET_REF_LABEL = 5;
  // Client send it's own ref grad to pserver
  DATA_UPDATE_MODE_SET_REF_GRAD = 6;
  // Client get all ref grad from all pservers
  DATA_UPDATE_MODE_GET_REF_GRAD = 7;
}

enum SendDataType {
...
@@ -360,7 +333,7 @@ message DataBlock {
  // byte size of one data type
  required int32 data_size = 2;
  // data_type
  optional TransDataType data_type = 3 [ default = TRANS_DOUBLE ];
}

message SendDataRequest {
...
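As a quick illustration of how literal vectors and matrices would be embedded in an Operation, here is a hypothetical fragment in protobuf text format using only the ProtoVector and ProtoMatrix fields shown above; the numbers are placeholders and the flattening order of the matrix values is not specified by this diff.

# Hypothetical Operation fragment (text format); values are illustrative only.
vectors {            # a 3-dimensional literal vector
  dim: 3
  values: 1.0 values: 2.0 values: 3.0
}
matrices {           # a 2 x 2 literal matrix, four flattened values
  num_rows: 2
  num_cols: 2
  values: 1.0 values: 0.0 values: 0.0 values: 1.0
}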
proto/TrainerConfig.proto
...
@@ -20,14 +20,14 @@ package paddle;
message OptimizationConfig {
  required int32 batch_size = 3;
  required string algorithm = 4 [ default = "async_sgd" ];
  optional int32 num_batches_per_send_parameter = 5 [ default = 1 ];
  optional int32 num_batches_per_get_parameter = 6 [ default = 1 ];

  required double learning_rate = 7;
  optional double learning_rate_decay_a = 8 [ default = 0 ];
  optional double learning_rate_decay_b = 9 [ default = 0 ];
  optional string learning_rate_schedule = 27 [ default = "constant" ];
  // learning rate will be scaled according to learning_rate_schedule
  // 1), constant:
  // lr = learning_rate
...
@@ -49,88 +49,92 @@ message OptimizationConfig {
  // owlqn related
  // L1-regularization
  optional double l1weight = 10 [ default = 0.1 ];
  // L2-regularization
  optional double l2weight = 11 [ default = 0 ];
  // "c1" in wolfe condition: if (newobj <= oldobj + c1 * origDirDeriv * step)
  // then accept the step
  optional double c1 = 12 [ default = 0.0001 ];
  // multiply the step by "backoff" when the wolfe condition is not satisfied
  optional double backoff = 13 [ default = 0.5 ];
  // how many "s"s and "y"s are kept in owlqn
  optional int32 owlqn_steps = 14 [ default = 10 ];
  // accept the step after "reduce the step" has been applied "max_backoff" times
  optional int32 max_backoff = 15 [ default = 5 ];
  // L2-regularization coefficient is reduced linearly from iteration 0 to
  // "l2weight_zero_iter", and set to 0 after "l2weight_zero_iter"
  // iterations. Set "l2weight_zero_iter" to 0 to disable this strategy.
  optional int32 l2weight_zero_iter = 17 [ default = 0 ];

  // averaged sgd
  // About average_window * numBatchProcessed parameters are used
  // for averaging. To be accurate, between average_window * numBatchProcessed
  // and 2 * average_window * numBatchProcessed parameters are used for
  // averaging.
  optional double average_window = 18 [ default = 0 ];
  optional int64 max_average_window = 19 [ default = 0x7fffffffffffffff ];

  //////////////////////////
  // Options Adaptive SGD //
  //////////////////////////

  // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta",
  // "rmsprop".
  // The default learning method ("momentum") uses a globally decayed learning
  // rate with momentum.
  // "adagrad", "adadelta" and "rmsprop" can set momentum too.
  optional string learning_method = 23 [ default = "momentum" ];
  optional double ada_epsilon = 24 [ default = 1e-6 ];
  optional double ada_rou = 26 [ default = 0.95 ];

  // Force to do average in cpu in order to save gpu memory usage
  optional bool do_average_in_cpu = 25 [ default = false ];

  // delta add rate in pserver, used while num_batches_per_send_parameter > 1;
  // will be divided by #machines automatically.
  optional double delta_add_rate = 28 [ default = 1.0 ];

  // We split a large size into smaller mini-batches, whose sizes are
  // determined by mini_batch_size. It only takes effect when there is
  // an ExternalMachine.
  optional int32 mini_batch_size = 29 [ default = 128 ];

  // automatically set if any one of the parameters sets the sparse remote
  // update flag
  optional bool use_sparse_remote_updater = 30 [ default = false ];

  // how to update the center parameter and feed back to the local parameter
  // when using local sgd update in cluster training.
  // An option is elastic_average, proposed by the paper: Deep learning with
  // elastic averaging SGD.
  // If the elastic_average method is used, every trainer node should sample
  // from the whole data set.
  optional string center_parameter_update_method = 31 [ default = "average" ];

  // shrink sparse parameter value
  // only works if the parameter uses remote sparse update and has an L1 decay
  // rate
  optional double shrink_parameter_value = 32 [ default = 0 ];

  ////////////////////////////
  // Options Adam Optimizer //
  ////////////////////////////
  optional double adam_beta1 = 33 [ default = 0.9 ];
  optional double adam_beta2 = 34 [ default = 0.999 ];
  optional double adam_epsilon = 35 [ default = 1e-8 ];

  // arguments for the learning rate scheduler
  // Format: num1:rate1,num2:rate2,...,numK:rateK
  // For learning_rate_schedule="manual", num is the number of samples;
  // for learning_rate_schedule="pass_manual",
  // num is the number of passes (starting from 0)
  optional string learning_rate_args = 36 [ default = "" ];

  // for async sgd gradient commit control.
  // When async_lagged_grad_discard_ratio * num_gradient_servers commits have
  // passed, the current async gradient will be discarded silently.
  optional double async_lagged_grad_discard_ratio = 37 [ default = 1.5 ];

  // global threshold for gradient clipping
  optional double gradient_clipping_threshold = 38 [ default = 0.0 ];
};
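A minimal OptimizationConfig written in protobuf text format might look like the sketch below. The values are illustrative rather than recommendations; learning_rate_args follows the num:rate format described above, here pairing pass numbers with rate multipliers under learning_rate_schedule="pass_manual".

  # Illustrative values only, not tuned recommendations.
  batch_size: 128
  algorithm: "async_sgd"
  learning_rate: 0.01
  learning_rate_schedule: "pass_manual"
  learning_rate_args: "0:1.0,10:0.5"    # pass-number:rate pairs, as documented above
  learning_method: "momentum"
  gradient_clipping_threshold: 10.0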
message TrainerConfig {
...
@@ -141,7 +145,7 @@ message TrainerConfig {
  repeated string config_files = 5;
  // the directory to save/load model files for each training path
  optional string save_dir = 6 [ default = "./output/model" ];
  // Path of the initial model parameters.
  // If it was set, start_pass will be ignored.
...
@@ -149,7 +153,7 @@ message TrainerConfig {
  // Start training from this pass.
  // Will load parameter from the previous pass.
  optional int32 start_pass = 8 [ default = 0 ];
  // file path to the trainer config file
  optional string config_file = 9;
...
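Likewise, the TrainerConfig fields visible in this hunk could be filled in text format as below; the file paths are placeholders, not values taken from the repository (apart from save_dir, whose default is shown above).

  # Hypothetical trainer settings; file paths are placeholders.
  config_files: "trainer_config.py"
  save_dir: "./output/model"
  start_pass: 0
  config_file: "trainer_config.py"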