Commit 1d4fa243: ClangFormat for proto and cuda
Authored on Aug 04, 2017 by liaogang
Parent commit: 6512893b

Showing 39 changed files with 3,661 additions and 2,920 deletions.
Changed files:

.pre-commit-config.yaml                      +1    -1
paddle/cuda/src/hl_batch_transpose.cu        +7    -9
paddle/cuda/src/hl_cuda_aggregate.cu         +61   -101
paddle/cuda/src/hl_cuda_cnn.cu               +275  -134
paddle/cuda/src/hl_cuda_lstm.cu              +331  -159
paddle/cuda/src/hl_cuda_matrix.cu            +147  -196
paddle/cuda/src/hl_cuda_sequence.cu          +96   -88
paddle/cuda/src/hl_cuda_sparse.cu            +475  -509
paddle/cuda/src/hl_perturbation_util.cu      +104  -45
paddle/cuda/src/hl_table_apply.cu            +35   -33
paddle/cuda/src/hl_top_k.cu                  +127  -114
paddle/framework/attr_type.proto             +7    -7
paddle/framework/op_desc.proto               +17   -17
paddle/framework/op_proto.proto              +72   -70
paddle/function/ContextProjectionOpGpu.cu    +70   -56
paddle/function/CosSimOpGpu.cu               +34   -26
paddle/function/CropOpGpu.cu                 +59   -25
paddle/function/CrossMapNormalOpGpu.cu       +46   -25
paddle/function/DepthwiseConvOpGpu.cu        +253  -218
paddle/function/Im2ColOpGpu.cu               +150  -106
paddle/function/MulOpGpu.cu                  +1    -1
paddle/function/PadOpGpu.cu                  +49   -15
paddle/function/RowConvOpGpu.cu              +87   -68
paddle/gserver/layers/GruCompute.cu          +4    -3
paddle/gserver/layers/LstmCompute.cu         +38   -17
paddle/math/BaseMatrix.cu                    +619  -366
paddle/math/TrainingAlgorithmOp.cu           +32   -33
paddle/math/tests/test_Tensor.cu             +167  -170
paddle/math/tests/test_lazyAssign.cu         +40   -34
paddle/operators/softmax_op.cu               +2    -1
paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto  +0  -0
proto/DataConfig.proto                       +27   -26
proto/DataFormat.proto                       +22   -16
proto/ModelConfig.proto                      +57   -57
proto/OptimizerConfig.proto                  +36   -36
proto/ParameterConfig.proto                  +23   -22
proto/ParameterServerConfig.proto            +10   -13
proto/ParameterService.proto                 +37   -64
proto/TrainerConfig.proto                    +43   -39
.pre-commit-config.yaml

@@ -24,7 +24,7 @@
         description: Format files with ClangFormat.
         entry: clang-format -i
         language: system
-        files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
     sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
     hooks:
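This one-line pattern change is what drives the rest of the commit: the clang-format pre-commit hook previously matched only C/C++ sources, and adding `cu` and `proto` to the `files` regex pulls every CUDA kernel and protobuf schema in the repository under the formatter. That is where the remaining 38 files and the +3661/-2920 churn come from; the diffs below are formatting-only.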
paddle/cuda/src/hl_batch_transpose.cu

@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "hl_batch_transpose.h"
 #include "hl_base.h"
+#include "hl_batch_transpose.h"
 
 const int TILE_DIM = 64;
 const int BLOCK_ROWS = 16;
 
 // No bank-conflict transpose for a batch of data.
-__global__ void batchTransposeNoBankConflicts(real* odata,
-                                              const real* idata,
-                                              int numSamples,
-                                              int width, int height) {
+__global__ void batchTransposeNoBankConflicts(
+    real* odata, const real* idata, int numSamples, int width, int height) {
   __shared__ float tile[TILE_DIM][TILE_DIM + 1];
 
   const int x = blockIdx.x * TILE_DIM + threadIdx.x;
@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
           newX] = tile[threadIdx.x][j];
 }
 
-void batchTranspose(const real* input, real* output, int width, int height,
-                    int batchSize) {
+void batchTranspose(
+    const real* input, real* output, int width, int height, int batchSize) {
   dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
   dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
-  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
-      (output, input, batchSize, width, height);
+  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+      output, input, batchSize, width, height);
   CHECK_SYNC("batchTranspose failed!");
 }
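The tile declared as `tile[TILE_DIM][TILE_DIM + 1]` above is the heart of this kernel: the extra padding column keeps column-wise shared-memory reads from all landing in one bank. A minimal, self-contained sketch of the same pattern (not the commit's code; the names, plain `float` data, and the 32x8 block shape are assumptions for illustration):

```cuda
#include <cstdio>
#include <cuda_runtime.h>

constexpr int TILE = 32;
constexpr int ROWS = 8;

__global__ void transposeNoBankConflicts(float* out, const float* in,
                                         int width, int height) {
  // The +1 pads each row by one bank, so the column-wise reads in the
  // second loop no longer map onto a single shared-memory bank.
  __shared__ float tile[TILE][TILE + 1];
  int x = blockIdx.x * TILE + threadIdx.x;
  int y = blockIdx.y * TILE + threadIdx.y;
  for (int j = 0; j < TILE; j += ROWS)  // each thread loads TILE/ROWS rows
    if (x < width && y + j < height)
      tile[threadIdx.y + j][threadIdx.x] = in[(y + j) * width + x];
  __syncthreads();
  x = blockIdx.y * TILE + threadIdx.x;  // swap block coordinates
  y = blockIdx.x * TILE + threadIdx.y;
  for (int j = 0; j < TILE; j += ROWS)
    if (x < height && y + j < width)
      out[(y + j) * height + x] = tile[threadIdx.x][threadIdx.y + j];
}

int main() {
  const int W = 64, H = 64;
  float *in, *out;
  cudaMallocManaged(&in, W * H * sizeof(float));
  cudaMallocManaged(&out, W * H * sizeof(float));
  for (int i = 0; i < W * H; ++i) in[i] = static_cast<float>(i);
  dim3 block(TILE, ROWS), grid(W / TILE, H / TILE);
  transposeNoBankConflicts<<<grid, block>>>(out, in, W, H);
  cudaDeviceSynchronize();
  printf("out[1] = %.0f (expect %d)\n", out[1], W);  // transposed element
  cudaFree(in);
  cudaFree(out);
  return 0;
}
```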
paddle/cuda/src/hl_cuda_aggregate.cu

@@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "hl_aggregate.h"
 #include "hl_base.h"
 #include "hl_cuda.h"
 #include "hl_cuda.ph"
-#include "hl_aggregate.h"
-#include "hl_thread.ph"
 #include "hl_matrix_base.cuh"
+#include "hl_thread.ph"
 #include "paddle/utils/Logging.h"
 
 /**
  * @brief matrix row operator.
  */
 template <class Agg, int blockSize>
-__global__ void KeMatrixRowOp(Agg agg,
-                              real* E,
-                              real* Sum,
-                              int dimN) {
+__global__ void KeMatrixRowOp(Agg agg, real* E, real* Sum, int dimN) {
   __shared__ real sum_s[blockSize];
   int cnt = (dimN + blockSize - 1) / blockSize;
   int rowId = blockIdx.x + blockIdx.y * gridDim.x;
   int index = rowId * dimN;
   int tid = threadIdx.x;
   int lmt = tid;
@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg,
   sum_s[tid] = tmp;
   __syncthreads();
 
-  for (int stride = blockSize/2; stride > 0; stride = stride/2) {
+  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
     }
@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg,
 }
 
 template <class Agg>
-void hl_matrix_row_op(Agg agg,
-                      real *A_d,
-                      real *C_d,
-                      int dimM,
-                      int dimN) {
+void hl_matrix_row_op(Agg agg, real* A_d, real* C_d, int dimM, int dimN) {
   int blocksX = dimM;
   int blocksY = 1;
   dim3 threads(128, 1);
   dim3 grid(blocksX, blocksY);
 
-  KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (agg, A_d, C_d, dimN);
+  KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      agg, A_d, C_d, dimN);
 }
 
-void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
+void hl_matrix_row_sum(real* A_d, real* C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
 
-  hl_matrix_row_op(aggregate::sum(),
-                   A_d,
-                   C_d,
-                   dimM,
-                   dimN);
+  hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_sum failed");
 }
 
@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
 
-  hl_matrix_row_op(aggregate::max(),
-                   A_d,
-                   C_d,
-                   dimM,
-                   dimN);
+  hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_max failed");
 }
 
@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
 
-  hl_matrix_row_op(aggregate::min(),
-                   A_d,
-                   C_d,
-                   dimM,
-                   dimN);
+  hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_min failed");
 }
 
 /**
  * @brief matrix column operator.
  */
 template <class Agg>
-__global__ void KeMatrixColumnOp(Agg agg,
-                                 real* E,
-                                 real* Sum,
-                                 int dimM,
-                                 int dimN) {
+__global__ void KeMatrixColumnOp(
+    Agg agg, real* E, real* Sum, int dimM, int dimN) {
   int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   real tmp = agg.init();
   if (rowIdx < dimN) {
@@ -127,13 +104,10 @@ __global__ void KeMatrixColumnOp(Agg agg,
   }
 }
 
 template <class Agg, int blockDimX, int blockDimY>
-__global__ void KeMatrixColumnOp_S(Agg agg,
-                                   real* E,
-                                   real* Sum,
-                                   int dimM,
-                                   int dimN) {
+__global__ void KeMatrixColumnOp_S(
+    Agg agg, real* E, real* Sum, int dimM, int dimN) {
   __shared__ real _sum[blockDimX * blockDimY];
   int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   int index = threadIdx.y;
@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
       index += blockDimY;
     }
   }
   _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
   __syncthreads();
 
   if (rowIdx < dimN) {
     if (threadIdx.y == 0) {
       real tmp = agg.init();
       for (int i = 0; i < blockDimY; i++) {
         tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
       }
       Sum[rowIdx] = tmp;
     }
@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
 }
 
 template <class Agg>
-void hl_matrix_column_op(Agg agg,
-                         real *A_d,
-                         real *C_d,
-                         int dimM,
-                         int dimN) {
+void hl_matrix_column_op(Agg agg, real* A_d, real* C_d, int dimM, int dimN) {
   if (dimN >= 8192) {
     int blocksX = (dimN + 128 - 1) / 128;
     int blocksY = 1;
     dim3 threads(128, 1);
     dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>
-        (agg, A_d, C_d, dimM, dimN);
+    KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        agg, A_d, C_d, dimM, dimN);
   } else {
     int blocksX = (dimN + 32 - 1) / 32;
     int blocksY = 1;
     dim3 threads(32, 32);
     dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>
-        (agg, A_d, C_d, dimM, dimN);
+    KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        agg, A_d, C_d, dimM, dimN);
   }
 
   return;
@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
 
-  hl_matrix_column_op(aggregate::sum(),
-                      A_d,
-                      C_d,
-                      dimM,
-                      dimN);
+  hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_column_sum failed");
 }
 
@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
 
-  hl_matrix_column_op(aggregate::max(),
-                      A_d,
-                      C_d,
-                      dimM,
-                      dimN);
+  hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_column_max failed");
 }
 
@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
 
-  hl_matrix_column_op(aggregate::min(),
-                      A_d,
-                      C_d,
-                      dimM,
-                      dimN);
+  hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_column_min failed");
 }
 
@@ -226,16 +184,16 @@ template <int blockSize>
 __global__ void KeVectorSum(real* E, real* Sum, int dimM) {
   __shared__ double sum_s[blockSize];
   int tid = threadIdx.x;
   int index = blockIdx.y * blockDim.x + threadIdx.x;
 
   sum_s[tid] = 0.0f;
   while (index < dimM) {
     sum_s[tid] += E[index];
     index += blockDim.x * gridDim.y;
   }
   __syncthreads();
 
-  for (int stride = blockSize/2; stride > 0; stride = stride/2) {
+  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] += sum_s[tid + stride];
     }
@@ -261,36 +219,37 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
   struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
   hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {}
+  while (!hl_cuda_event_is_ready(hl_event)) {
+  }
 
-  KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (A_d, t_resource.gpu_mem, dimM);
-  KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>
-      (t_resource.gpu_mem, t_resource.cpu_mem, 128);
+  KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      A_d, t_resource.gpu_mem, dimM);
+  KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
+      t_resource.gpu_mem, t_resource.cpu_mem, 128);
 
   hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
   hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
 
   hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   cudaError_t err = (cudaError_t)hl_get_device_last_error();
   CHECK_EQ(cudaSuccess, err) << "CUDA error: "
                              << hl_get_device_error_string((size_t)err);
 }
 
 template <int blockSize>
 __global__ void KeVectorAbsSum(real* E, real* Sum, int dimM) {
   __shared__ double sum_s[blockSize];
   int tid = threadIdx.x;
   int index = blockIdx.y * blockDim.x + threadIdx.x;
 
   sum_s[tid] = 0.0f;
   while (index < dimM) {
     sum_s[tid] += abs(E[index]);
     index += blockDim.x * gridDim.y;
   }
   __syncthreads();
 
-  for (int stride = blockSize/2; stride > 0; stride = stride/2) {
+  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] += sum_s[tid + stride];
     }
@@ -316,18 +275,19 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
   struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
   hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {}
+  while (!hl_cuda_event_is_ready(hl_event)) {
+  }
 
-  KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (A_d, t_resource.gpu_mem, dimM);
-  KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>
-      (t_resource.gpu_mem, t_resource.cpu_mem, 128);
+  KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      A_d, t_resource.gpu_mem, dimM);
+  KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
+      t_resource.gpu_mem, t_resource.cpu_mem, 128);
 
   hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
   hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
 
   hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   cudaError_t err = (cudaError_t)hl_get_device_last_error();
   CHECK_EQ(cudaSuccess, err) << "CUDA error: "
                              << hl_get_device_error_string((size_t)err);
 }
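KeMatrixRowOp, KeVectorSum, and KeVectorAbsSum all share one shape: each thread accumulates a strided slice of the input into shared memory, then the `for (int stride = blockSize / 2; ...)` loop folds the block's partial results in half until element 0 holds the answer. A condensed, runnable sketch of that reduction (hypothetical names; a plain `float` sum stands in for the commit's `Agg` functor):

```cuda
#include <cstdio>
#include <cuda_runtime.h>

template <int blockSize>
__global__ void rowSum(const float* A, float* C, int dimN) {
  __shared__ float sum_s[blockSize];
  const float* row = A + blockIdx.x * dimN;  // one block per matrix row
  int tid = threadIdx.x;

  float tmp = 0.0f;
  for (int i = tid; i < dimN; i += blockSize)  // strided accumulation
    tmp += row[i];
  sum_s[tid] = tmp;
  __syncthreads();

  // Tree reduction: same loop shape as the commit's stride loop.
  for (int stride = blockSize / 2; stride > 0; stride /= 2) {
    if (tid < stride) sum_s[tid] += sum_s[tid + stride];
    __syncthreads();
  }
  if (tid == 0) C[blockIdx.x] = sum_s[0];
}

int main() {
  const int dimM = 4, dimN = 1000;
  float *A, *C;
  cudaMallocManaged(&A, dimM * dimN * sizeof(float));
  cudaMallocManaged(&C, dimM * sizeof(float));
  for (int i = 0; i < dimM * dimN; ++i) A[i] = 1.0f;
  rowSum<128><<<dimM, 128>>>(A, C, dimN);
  cudaDeviceSynchronize();
  printf("row 0 sum = %.0f (expect %d)\n", C[0], dimN);
  cudaFree(A);
  cudaFree(C);
  return 0;
}
```

The same skeleton serves max and min by swapping `+=` for a comparison, which is exactly what the `Agg` template parameter abstracts in the commit.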
paddle/cuda/src/hl_cuda_cnn.cu

(This diff is collapsed.)
paddle/cuda/src/hl_cuda_lstm.cu

(This diff is collapsed.)
paddle/cuda/src/hl_cuda_matrix.cu

@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "hl_base.h"
+#include "hl_device_functions.cuh"
+#include "hl_gpu_matrix_kernel.cuh"
 #include "hl_matrix.h"
-#include "hl_matrix_ops.cuh"
 #include "hl_matrix_apply.cuh"
+#include "hl_matrix_ops.cuh"
 #include "hl_sequence.h"
 #include "hl_sparse.ph"
 #include "paddle/utils/Logging.h"
-#include "hl_device_functions.cuh"
-#include "hl_gpu_matrix_kernel.cuh"
 
 DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
 DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b);
 
 void hl_matrix_add(real* A_d,
                    real* B_d,
                    real* C_d,
                    int dimM,
                    int dimN,
                    real alpha,
@@ -36,8 +35,8 @@ void hl_matrix_add(real *A_d,
   CHECK_NOTNULL(B_d);
   CHECK_NOTNULL(C_d);
 
-  hl_gpu_apply_ternary_op
-      <real, ternary::_add<real>, 0, 0>(ternary::_add<real>(alpha, beta),
-                                        A_d,
-                                        B_d,
-                                        C_d,
+  hl_gpu_apply_ternary_op<real, ternary::_add<real>, 0, 0>(
+      ternary::_add<real>(alpha, beta),
+      A_d,
+      B_d,
+      C_d,
@@ -50,12 +49,11 @@ void hl_matrix_add(real *A_d,
 }
 
 #ifdef PADDLE_TYPE_DOUBLE
 #define THRESHOLD 128
 #else
 #define THRESHOLD 64
 #endif
-__device__ __forceinline__
-void findMax(real* I,
-             real* dfMax_s,
-             int blockSize,
-             int base,
+__device__ __forceinline__ void findMax(real* I,
+                                        real* dfMax_s,
+                                        int blockSize,
+                                        int base,
@@ -89,8 +87,7 @@ void findMax(real* I,
   __syncthreads();
 }
 
-__device__ __forceinline__
-void subMaxAndExp(real* I,
-                  real* O,
-                  int curIdx,
-                  int nextIdx,
+__device__ __forceinline__ void subMaxAndExp(real* I,
+                                             real* O,
+                                             int curIdx,
+                                             int nextIdx,
@@ -115,8 +112,7 @@ void subMaxAndExp(real* I,
   __syncthreads();
 }
 
-__device__ __forceinline__
-void valueSum(real* O,
-              real* dfMax_s,
-              int blockSize,
-              int base,
+__device__ __forceinline__ void valueSum(real* O,
+                                         real* dfMax_s,
+                                         int blockSize,
+                                         int base,
@@ -141,13 +137,8 @@ void valueSum(real* O,
   __syncthreads();
 }
 
-__device__ __forceinline__
-void divSum(real* O,
-            real sum,
-            int curIdx,
-            int nextIdx,
-            int blockSize,
-            int dimN) {
+__device__ __forceinline__ void divSum(
+    real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) {
   while (curIdx < dimN) {
     O[nextIdx] /= sum;
     nextIdx += blockSize;
@@ -155,8 +146,7 @@ void divSum(real* O,
   }
 }
 
-__device__ __forceinline__
-void softmax(real* I,
-             real* O,
-             real* dfMax_s,
-             int blockSize,
+__device__ __forceinline__ void softmax(real* I,
+                                        real* O,
+                                        real* dfMax_s,
+                                        int blockSize,
@@ -167,8 +157,7 @@ void softmax(real* I,
   __shared__ real max;
 
   // find the max number
-  findMax(I, dfMax_s, blockSize, base, curIdx,
-          nextIdx, dimN, &max);
+  findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max);
 
   // sub max Value and do Exp operation
   subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max);
@@ -181,8 +170,8 @@ void softmax(real* I,
   divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN);
 }
 
 template <int blockSize>
 __global__ void KeMatrixSoftMax(real* O, real* I, int dimN) {
   int base = threadIdx.x;
   __shared__ real dfMax_s[blockSize];
   int nextIdx = blockIdx.x * dimN + base;
@@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
   softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
 }
 
 void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
 
   dim3 block(512, 1);
   dim3 grid(dimM, 1);
-  KeMatrixSoftMax<512>
-      <<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
+  KeMatrixSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
   CHECK_SYNC("hl_matrix_softmax failed");
 }
 
 template <int blockSize>
 __global__ void KeSequenceSoftMax(real* O, real* I, const int* index) {
   int base = threadIdx.x;
   int bid = blockIdx.x;
   __shared__ real dfMax_s[blockSize];
@@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) {
   softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
 }
 
-void hl_sequence_softmax_forward(real* A_d,
-                                 real* C_d,
-                                 const int* index,
-                                 int numSequence) {
+void hl_sequence_softmax_forward(
+    real* A_d, real* C_d, const int* index, int numSequence) {
   CHECK_NOTNULL(A_d);
@@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d,
   dim3 block(512, 1);
   dim3 grid(numSequence, 1);
-  KeSequenceSoftMax<512>
-      <<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
+  KeSequenceSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
   CHECK_SYNC("hl_sequence_softmax_forward failed");
 }
 
-__global__ void KeMatrixDerivative(real* grad_d,
-                                   real* output_d,
-                                   real* sftmaxSum_d,
-                                   int dimM,
-                                   int dimN) {
+__global__ void KeMatrixDerivative(
+    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
   int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
   int index;
 
   if (rowIdx < dimM && colIdx < dimN) {
     index = rowIdx * dimN + colIdx;
     grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]);
   }
 }
 
-void hl_matrix_softmax_derivative(real* grad_d,
-                                  real* output_d,
-                                  real* sftmaxSum_d,
-                                  int dimM,
-                                  int dimN) {
+void hl_matrix_softmax_derivative(
+    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
   CHECK_NOTNULL(grad_d);
   CHECK_NOTNULL(output_d);
   CHECK_NOTNULL(sftmaxSum_d);
 
   int blocksX = (dimM + 0) / 1;
   int blocksY = (dimN + 1024 - 1) / 1024;
   dim3 threads(1, 1024);
   dim3 grid(blocksX, blocksY);
 
-  KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>
-      (grad_d, output_d, sftmaxSum_d, dimM, dimN);
+  KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_d, output_d, sftmaxSum_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_softmax_derivative failed");
 }
 
-__global__ void KeMatrixMultiBinaryCrossEntropy(real* output,
-                                                real* entropy,
-                                                int* row,
-                                                int* col,
-                                                int dimM,
-                                                int dimN) {
+__global__ void KeMatrixMultiBinaryCrossEntropy(
+    real* output, real* entropy, int* row, int* col, int dimM, int dimN) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < dimM) {
     for (int i = 0; i < dimN; i++) {
       entropy[index] -= log(1 - output[index * dimN + i]);
     }
     int* row_col = col + row[index];
     int col_num = row[index + 1] - row[index];
     for (int i = 0; i < col_num; i++) {
       real o = output[index * dimN + row_col[i]];
       entropy[index] -= log(o / (1 - o));
     }
@@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output,
   dim3 threads(n_threads);
   dim3 grid(blocks);
   hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
-  KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>
-      (output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
+  KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
   CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed");
 }
 
-__global__ void KeMatrixMultiBinaryCrossEntropyBp(real* output,
-                                                  real* grad,
-                                                  int* row,
-                                                  int* col,
-                                                  int dimM,
-                                                  int dimN) {
+__global__ void KeMatrixMultiBinaryCrossEntropyBp(
+    real* output, real* grad, int* row, int* col, int dimM, int dimN) {
   int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (row_idx < dimM) {
     for (int i = 0; i < dimN; i++) {
       int index = row_idx * dimN + i;
       grad[index] += 1.0 / (1 - output[index]);
     }
     int col_num = row[row_idx + 1] - row[row_idx];
     int* row_col = col + row[row_idx];
     for (int i = 0; i < col_num; i++) {
       int index = row_idx * dimN + row_col[i];
       grad[index] -= 1.0 / (output[index] * (1 - output[index]));
     }
   }
 }
 
-void hl_matrix_multi_binary_cross_entropy_bp(real* output,
-                                             real* grad,
-                                             hl_sparse_matrix_s csr_mat,
-                                             int dimM,
-                                             int dimN) {
+void hl_matrix_multi_binary_cross_entropy_bp(
+    real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) {
   CHECK_NOTNULL(output);
   CHECK_NOTNULL(grad);
   CHECK_NOTNULL(csr_mat);
@@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output,
   dim3 threads(n_threads);
   dim3 grid(blocks);
   hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
-  KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>
-      (output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
+  KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
   CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed");
 }
 
-__global__ void KeMatrixCrossEntropy(real* O,
-                                     real* E,
-                                     int* label,
-                                     int dimM,
-                                     int dimN) {
+__global__ void KeMatrixCrossEntropy(
+    real* O, real* E, int* label, int dimM, int dimN) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int newBase;
   if (index < dimM) {
@@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O,
   }
 }
 
-void hl_matrix_cross_entropy(real* A_d,
-                             real* C_d,
-                             int* label_d,
-                             int dimM,
-                             int dimN) {
+void hl_matrix_cross_entropy(
+    real* A_d, real* C_d, int* label_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
 
   int blocks = (dimM + 1024 - 1) / 1024;
   dim3 threads(1024, 1);
   dim3 grid(blocks, 1);
-  KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>
-      (A_d, C_d, label_d, dimM, dimN);
+  KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      A_d, C_d, label_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_cross_entropy failed");
 }
 
-__global__ void KeMatrixCrossEntropyBp(real* grad_d,
-                                       real* output_d,
-                                       int* label_d,
-                                       int dimM,
-                                       int dimN) {
+__global__ void KeMatrixCrossEntropyBp(
+    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
   int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
   int index;
   if (rowIdx < dimM && colIdx < dimN) {
     index = rowIdx * dimN + colIdx;
     if (label_d[rowIdx] == colIdx) {
       grad_d[index] -= 1.0f / output_d[index];
     }
   }
 }
 
-void hl_matrix_cross_entropy_bp(real* grad_d,
-                                real* output_d,
-                                int* label_d,
-                                int dimM,
-                                int dimN) {
+void hl_matrix_cross_entropy_bp(
+    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
   CHECK_NOTNULL(grad_d);
   CHECK_NOTNULL(output_d);
   CHECK_NOTNULL(label_d);
 
   int blocksX = (dimM + 0) / 1;
   int blocksY = (dimN + 1024 - 1) / 1024;
   dim3 threads(1, 1024);
   dim3 grid(blocksX, blocksY);
-  KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>
-      (grad_d, output_d, label_d, dimM, dimN);
+  KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_d, output_d, label_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_cross_entropy_bp failed");
 }
 
 void hl_matrix_zero_mem(real* data, int num) {
   hl_gpu_apply_unary_op(
       unary::Zero<real>(), data, 1, num, num);
 }
 
 __global__ void KeParamReluForward(real* output,
@@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output,
   int ty = blockIdx.y * blockDim.y + threadIdx.y;
   if (tx < width && ty < height) {
     int index = ty * width + tx;
-    output[index] = input[index] > 0 ? input[index] :
-        input[index] * w[tx / partial_sum];
+    output[index] =
+        input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum];
   }
 }
 
@@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output,
   CHECK_NOTNULL(w);
   dim3 threads(16, 16);
   int blockX = (width + 16 - 1) / 16;
   int blockY = (height + 16 - 1) / 16;
   dim3 grid(blockX, blockY);
-  KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>
-      (output, input, w, width, height, partial_sum);
+  KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, input, w, width, height, partial_sum);
   CHECK_SYNC("hl_param_relu_forward failed");
 }
 
 template <int blockSize>
 __global__ void KeParamReluBackWardW(real* grad_w,
                                      real* grad_o,
                                      real* input,
@@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w,
   int grid_num = width / partial_sum;
   dim3 threads(blockSize, 1);
   dim3 grid(grid_num, 1);
-  KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (grad_w, grad_o, input, width, height, partial_sum);
+  KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_w, grad_o, input, width, height, partial_sum);
   CHECK_SYNC("hl_param_relu_backward_w failed");
 }
 
@@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o,
   CHECK_NOTNULL(diff);
   dim3 threads(16, 16);
   int blockX = (width + 16 - 1) / 16;
   int blockY = (height + 16 - 1) / 16;
   dim3 grid(blockX, blockY);
-  KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>
-      (grad_o, data, w, diff, width, height, partial_sum);
+  KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_o, data, w, diff, width, height, partial_sum);
   CHECK_SYNC("hl_param_relu_backward_diff failed");
 }
 
-__global__ void KeMatrixAddSharedBias(real* A,
-                                      real* B,
-                                      const int channel,
-                                      const int M,
-                                      const int N,
-                                      real scale) {
+__global__ void KeMatrixAddSharedBias(
+    real* A, real* B, const int channel, const int M, const int N, real scale) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int dim = N / channel;
   if (index < M * N) {
@@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d,
                                real scale) {
   const int blocks = 512;
   const int grids = DIVUP(dimM * dimN, blocks);
-  KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, channel, dimM, dimN, scale);
+  KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>(
+      A_d, B_d, channel, dimM, dimN, scale);
   CHECK_SYNC("hl_matrix_add_shared_bias failed");
 }
 
 template <int blockSize>
 __global__ void KeMatrixCollectSharedBias(real* B,
                                           real* A,
                                           const int channel,
                                           const int M,
                                           const int N,
@@ -611,14 +563,13 @@ void hl_matrix_collect_shared_bias(real* B_d,
   const int limit = 64;
   int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;
 
-  KeMatrixCollectSharedBias<blocks>
-      <<<grids, blocks, 0, STREAM_DEFAULT>>>
-      (B_d, A_d, channel, dimM, dimN, dim, limit, scale);
+  KeMatrixCollectSharedBias<blocks><<<grids, blocks, 0, STREAM_DEFAULT>>>(
+      B_d, A_d, channel, dimM, dimN, dim, limit, scale);
   CHECK_SYNC("hl_matrix_collect_shared_bias failed");
 }
 
-__global__ void keMatrixRotate(real* mat, real* matRot,
-                               int dimM, int dimN, bool clockWise) {
+__global__ void keMatrixRotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < dimM * dimN) {
     int i = idx / dimN;
@@ -631,13 +582,13 @@ __global__ void keMatrixRotate(real* mat, real* matRot,
   }
 }
 
-void hl_matrix_rotate(real* mat, real* matRot,
-                      int dimM, int dimN, bool clockWise) {
+void hl_matrix_rotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
   CHECK_NOTNULL(mat);
   CHECK_NOTNULL(matRot);
   const int threads = 512;
   const int blocks = DIVUP(dimM * dimN, threads);
-  keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>
-      (mat, matRot, dimM, dimN, clockWise);
+  keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>(
+      mat, matRot, dimM, dimN, clockWise);
   CHECK_SYNC("hl_matrix_rotate failed");
 }
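Earlier in this file's diff, hl_matrix_softmax chains four device helpers: findMax, subMaxAndExp, valueSum, and divSum. Subtracting the row max before exponentiating is what keeps exp() from overflowing. A compact sketch of the same pipeline collapsed into one kernel (assumed: one block per row, a power-of-two block size, `float` data; an illustration, not the commit's code):

```cuda
#include <cmath>
#include <cstdio>
#include <cuda_runtime.h>

template <int blockSize>
__global__ void rowSoftmax(float* O, const float* I, int dimN) {
  __shared__ float buf[blockSize];
  const float* in = I + blockIdx.x * dimN;
  float* out = O + blockIdx.x * dimN;
  int tid = threadIdx.x;

  // 1) findMax: block-wide max of the row.
  float m = -1e30f;
  for (int i = tid; i < dimN; i += blockSize) m = fmaxf(m, in[i]);
  buf[tid] = m;
  __syncthreads();
  for (int s = blockSize / 2; s > 0; s /= 2) {
    if (tid < s) buf[tid] = fmaxf(buf[tid], buf[tid + s]);
    __syncthreads();
  }
  float rowMax = buf[0];
  __syncthreads();

  // 2) subMaxAndExp and 3) valueSum in one pass.
  float sum = 0.0f;
  for (int i = tid; i < dimN; i += blockSize) {
    out[i] = expf(in[i] - rowMax);  // max-shift for numerical stability
    sum += out[i];
  }
  buf[tid] = sum;
  __syncthreads();
  for (int s = blockSize / 2; s > 0; s /= 2) {
    if (tid < s) buf[tid] += buf[tid + s];
    __syncthreads();
  }
  float rowSum = buf[0];

  // 4) divSum: normalize.
  for (int i = tid; i < dimN; i += blockSize) out[i] /= rowSum;
}

int main() {
  const int dimM = 2, dimN = 8;
  float *I, *O;
  cudaMallocManaged(&I, dimM * dimN * sizeof(float));
  cudaMallocManaged(&O, dimM * dimN * sizeof(float));
  for (int i = 0; i < dimM * dimN; ++i) I[i] = static_cast<float>(i % dimN);
  rowSoftmax<128><<<dimM, 128>>>(O, I, dimN);
  cudaDeviceSynchronize();
  float s = 0;
  for (int i = 0; i < dimN; ++i) s += O[i];
  printf("row 0 sums to %.3f (expect 1.000)\n", s);
  cudaFree(I);
  cudaFree(O);
  return 0;
}
```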
paddle/cuda/src/hl_cuda_sequence.cu
浏览文件 @
1d4fa243
...
@@ -16,36 +16,36 @@ limitations under the License. */
...
@@ -16,36 +16,36 @@ limitations under the License. */
#include "hl_device_functions.cuh"
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Logging.h"
__global__
void
KeMaxSequenceForward
(
real
*
input
,
__global__
void
KeMaxSequenceForward
(
real
*
input
,
const
int
*
sequence
,
const
int
*
sequence
,
real
*
output
,
real
*
output
,
int
*
index
,
int
*
index
,
int
numSequences
,
int
numSequences
,
int
dim
)
{
int
dim
)
{
int
dimIdx
=
threadIdx
.
x
;
int
dimIdx
=
threadIdx
.
x
;
int
sequenceId
=
blockIdx
.
x
;
int
sequenceId
=
blockIdx
.
x
;
if
(
sequenceId
>=
numSequences
)
return
;
if
(
sequenceId
>=
numSequences
)
return
;
int
start
=
sequence
[
sequenceId
];
int
start
=
sequence
[
sequenceId
];
int
end
=
sequence
[
sequenceId
+
1
];
int
end
=
sequence
[
sequenceId
+
1
];
for
(
int
i
=
dimIdx
;
i
<
dim
;
i
+=
blockDim
.
x
)
{
for
(
int
i
=
dimIdx
;
i
<
dim
;
i
+=
blockDim
.
x
)
{
real
tmp
=
-
HL_FLOAT_MAX
;
real
tmp
=
-
HL_FLOAT_MAX
;
int
tmpId
=
-
1
;
int
tmpId
=
-
1
;
for
(
int
insId
=
start
;
insId
<
end
;
insId
++
)
{
for
(
int
insId
=
start
;
insId
<
end
;
insId
++
)
{
if
(
tmp
<
input
[
insId
*
dim
+
i
])
{
if
(
tmp
<
input
[
insId
*
dim
+
i
])
{
tmp
=
input
[
insId
*
dim
+
i
];
tmp
=
input
[
insId
*
dim
+
i
];
tmpId
=
insId
;
tmpId
=
insId
;
}
}
}
}
output
[
sequenceId
*
dim
+
i
]
=
tmp
;
output
[
sequenceId
*
dim
+
i
]
=
tmp
;
index
[
sequenceId
*
dim
+
i
]
=
tmpId
;
index
[
sequenceId
*
dim
+
i
]
=
tmpId
;
}
}
}
}
void
hl_max_sequence_forward
(
real
*
input
,
void
hl_max_sequence_forward
(
real
*
input
,
const
int
*
sequence
,
const
int
*
sequence
,
real
*
output
,
real
*
output
,
int
*
index
,
int
*
index
,
int
numSequences
,
int
numSequences
,
int
dim
)
{
int
dim
)
{
CHECK_NOTNULL
(
input
);
CHECK_NOTNULL
(
input
);
...
@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input,
...
@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input,
dim3
threads
(
256
,
1
);
dim3
threads
(
256
,
1
);
dim3
grid
(
numSequences
,
1
);
dim3
grid
(
numSequences
,
1
);
KeMaxSequenceForward
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
KeMaxSequenceForward
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
(
input
,
sequence
,
output
,
index
,
numSequences
,
dim
);
input
,
sequence
,
output
,
index
,
numSequences
,
dim
);
CHECK_SYNC
(
"hl_max_sequence_forward failed"
);
CHECK_SYNC
(
"hl_max_sequence_forward failed"
);
}
}
__global__
void
KeMaxSequenceBackward
(
real
*
outputGrad
,
__global__
void
KeMaxSequenceBackward
(
int
*
index
,
real
*
outputGrad
,
int
*
index
,
real
*
inputGrad
,
int
numSequences
,
int
dim
)
{
real
*
inputGrad
,
int
numSequences
,
int
dim
)
{
int
idx
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
int
idx
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
int
colIdx
=
idx
%
dim
;
int
colIdx
=
idx
%
dim
;
if
(
idx
<
numSequences
*
dim
)
{
if
(
idx
<
numSequences
*
dim
)
{
int
insId
=
index
[
idx
];
int
insId
=
index
[
idx
];
inputGrad
[
insId
*
dim
+
colIdx
]
+=
outputGrad
[
idx
];
inputGrad
[
insId
*
dim
+
colIdx
]
+=
outputGrad
[
idx
];
}
}
}
}
void
hl_max_sequence_backward
(
real
*
outputGrad
,
void
hl_max_sequence_backward
(
int
*
index
,
real
*
outputGrad
,
int
*
index
,
real
*
inputGrad
,
int
numSequences
,
int
dim
)
{
real
*
inputGrad
,
int
numSequences
,
int
dim
)
{
CHECK_NOTNULL
(
outputGrad
);
CHECK_NOTNULL
(
outputGrad
);
CHECK_NOTNULL
(
index
);
CHECK_NOTNULL
(
index
);
CHECK_NOTNULL
(
inputGrad
);
CHECK_NOTNULL
(
inputGrad
);
...
@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad,
...
@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad,
unsigned
int
blocks
=
(
numSequences
*
dim
+
128
-
1
)
/
128
;
unsigned
int
blocks
=
(
numSequences
*
dim
+
128
-
1
)
/
128
;
dim3
threads
(
128
,
1
);
dim3
threads
(
128
,
1
);
dim3
grid
(
blocks
,
1
);
dim3
grid
(
blocks
,
1
);
KeMaxSequenceBackward
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
KeMaxSequenceBackward
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
(
outputGrad
,
index
,
inputGrad
,
numSequences
,
dim
);
outputGrad
,
index
,
inputGrad
,
numSequences
,
dim
);
CHECK_SYNC
(
"hl_max_sequence_backward failed"
);
CHECK_SYNC
(
"hl_max_sequence_backward failed"
);
}
}
template
<
int
blockDimX
,
int
blockDimY
,
int
gridDimX
,
bool
AddRow
>
template
<
int
blockDimX
,
int
blockDimY
,
int
gridDimX
,
bool
AddRow
>
__global__
void
KeMatrixAddRows
(
real
*
output
,
__global__
void
KeMatrixAddRows
(
real
*
output
,
real
*
table
,
real
*
table
,
int
*
ids
,
int
*
ids
,
...
@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output,
...
@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output,
while
(
sampleId
<
numSamples
)
{
while
(
sampleId
<
numSamples
)
{
int
tableId
=
ids
[
sampleId
];
int
tableId
=
ids
[
sampleId
];
if
((
0
<=
tableId
)
&&
(
tableId
<
tableSize
))
{
if
((
0
<=
tableId
)
&&
(
tableId
<
tableSize
))
{
real
*
outputData
=
output
+
sampleId
*
dim
;
real
*
outputData
=
output
+
sampleId
*
dim
;
real
*
tableData
=
table
+
tableId
*
dim
;
real
*
tableData
=
table
+
tableId
*
dim
;
for
(
int
i
=
idx
;
i
<
dim
;
i
+=
blockDimX
)
{
for
(
int
i
=
idx
;
i
<
dim
;
i
+=
blockDimX
)
{
if
(
AddRow
==
0
)
{
if
(
AddRow
==
0
)
{
outputData
[
i
]
+=
tableData
[
i
];
outputData
[
i
]
+=
tableData
[
i
];
...
@@ -114,15 +108,18 @@ __global__ void KeMatrixAddRows(real* output,
...
@@ -114,15 +108,18 @@ __global__ void KeMatrixAddRows(real* output,
}
}
}
}
}
}
sampleId
+=
blockDimY
*
gridDimX
;
sampleId
+=
blockDimY
*
gridDimX
;
}
}
}
}
template
<
int
blockDimX
,
int
blockDimY
,
int
gridDimX
,
bool
seq2batch
,
bool
isAdd
>
template
<
int
blockDimX
,
__global__
int
blockDimY
,
void
KeSequence2Batch
(
real
*
batch
,
int
gridDimX
,
real
*
sequence
,
bool
seq2batch
,
const
int
*
batchIndex
,
bool
isAdd
>
__global__
void
KeSequence2Batch
(
real
*
batch
,
real
*
sequence
,
const
int
*
batchIndex
,
int
seqWidth
,
int
seqWidth
,
int
batchCount
)
{
int
batchCount
)
{
int
idx
=
threadIdx
.
x
;
int
idx
=
threadIdx
.
x
;
...
@@ -130,8 +127,8 @@ void KeSequence2Batch(real *batch,
...
@@ -130,8 +127,8 @@ void KeSequence2Batch(real *batch,
int
id
=
blockIdx
.
x
+
idy
*
gridDimX
;
int
id
=
blockIdx
.
x
+
idy
*
gridDimX
;
while
(
id
<
batchCount
)
{
while
(
id
<
batchCount
)
{
int
seqId
=
batchIndex
[
id
];
int
seqId
=
batchIndex
[
id
];
real
*
batchData
=
batch
+
id
*
seqWidth
;
real
*
batchData
=
batch
+
id
*
seqWidth
;
real
*
seqData
=
sequence
+
seqId
*
seqWidth
;
real
*
seqData
=
sequence
+
seqId
*
seqWidth
;
for
(
int
i
=
idx
;
i
<
seqWidth
;
i
+=
blockDimX
)
{
for
(
int
i
=
idx
;
i
<
seqWidth
;
i
+=
blockDimX
)
{
if
(
seq2batch
)
{
if
(
seq2batch
)
{
if
(
isAdd
)
{
if
(
isAdd
)
{
...
@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch,
...
@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch,
}
}
}
}
}
}
id
+=
blockDimY
*
gridDimX
;
id
+=
blockDimY
*
gridDimX
;
}
}
}
}
void
hl_sequence2batch_copy
(
real
*
batch
,
void
hl_sequence2batch_copy
(
real
*
batch
,
real
*
sequence
,
real
*
sequence
,
const
int
*
batchIndex
,
const
int
*
batchIndex
,
int
seqWidth
,
int
seqWidth
,
int
batchCount
,
int
batchCount
,
bool
seq2batch
)
{
bool
seq2batch
)
{
...
@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch,
...
@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch,
dim3
threads
(
128
,
8
);
dim3
threads
(
128
,
8
);
dim3
grid
(
8
,
1
);
dim3
grid
(
8
,
1
);
if
(
seq2batch
)
{
if
(
seq2batch
)
{
KeSequence2Batch
<
128
,
8
,
8
,
1
,
0
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
KeSequence2Batch
<
128
,
8
,
8
,
1
,
0
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
(
batch
,
sequence
,
batchIndex
,
seqWidth
,
batchCount
);
batch
,
sequence
,
batchIndex
,
seqWidth
,
batchCount
);
}
else
{
}
else
{
KeSequence2Batch
<
128
,
8
,
8
,
0
,
0
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
KeSequence2Batch
<
128
,
8
,
8
,
0
,
0
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
(
batch
,
sequence
,
batchIndex
,
seqWidth
,
batchCount
);
batch
,
sequence
,
batchIndex
,
seqWidth
,
batchCount
);
}
}
CHECK_SYNC
(
"hl_sequence2batch_copy failed"
);
CHECK_SYNC
(
"hl_sequence2batch_copy failed"
);
}
}
void
hl_sequence2batch_add
(
real
*
batch
,
void
hl_sequence2batch_add
(
real
*
batch
,
real
*
sequence
,
real
*
sequence
,
int
*
batchIndex
,
int
*
batchIndex
,
int
seqWidth
,
int
seqWidth
,
int
batchCount
,
int
batchCount
,
bool
seq2batch
)
{
bool
seq2batch
)
{
...
@@ -186,18 +183,17 @@ void hl_sequence2batch_add(real *batch,
...
@@ -186,18 +183,17 @@ void hl_sequence2batch_add(real *batch,
dim3
threads
(
128
,
8
);
dim3
threads
(
128
,
8
);
dim3
grid
(
8
,
1
);
dim3
grid
(
8
,
1
);
if
(
seq2batch
)
{
if
(
seq2batch
)
{
KeSequence2Batch
<
128
,
8
,
8
,
1
,
1
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
KeSequence2Batch
<
128
,
8
,
8
,
1
,
1
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
(
batch
,
sequence
,
batchIndex
,
seqWidth
,
batchCount
);
batch
,
sequence
,
batchIndex
,
seqWidth
,
batchCount
);
}
else
{
}
else
{
KeSequence2Batch
<
128
,
8
,
8
,
0
,
1
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
KeSequence2Batch
<
128
,
8
,
8
,
0
,
1
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
(
batch
,
sequence
,
batchIndex
,
seqWidth
,
batchCount
);
batch
,
sequence
,
batchIndex
,
seqWidth
,
batchCount
);
}
}
CHECK_SYNC
(
"hl_sequence2batch_add failed"
);
CHECK_SYNC
(
"hl_sequence2batch_add failed"
);
}
}
template
<
bool
normByTimes
,
bool
seq2batch
>
template
<
bool
normByTimes
,
bool
seq2batch
>
__global__
__global__
void
KeSequence2BatchPadding
(
real
*
batch
,
void
KeSequence2BatchPadding
(
real
*
batch
,
real
*
sequence
,
real
*
sequence
,
const
int
*
sequenceStartPositions
,
const
int
*
sequenceStartPositions
,
const
size_t
sequenceWidth
,
const
size_t
sequenceWidth
,
...
@@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch,
...
@@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch,
if
(
seq2batch
)
{
if
(
seq2batch
)
{
/* sequence -> batch */
/* sequence -> batch */
if
(
normByTimes
)
{
if
(
normByTimes
)
{
KeSequence2BatchPadding
<
1
,
1
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
KeSequence2BatchPadding
<
1
,
1
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
batch
,
sequence
,
sequenceStartPositions
,
batch
,
sequenceWidth
,
maxSequenceLength
,
numSequences
);
sequence
,
sequenceStartPositions
,
sequenceWidth
,
maxSequenceLength
,
numSequences
);
}
else
{
}
else
{
KeSequence2BatchPadding
<
0
,
1
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
KeSequence2BatchPadding
<
0
,
1
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
batch
,
sequence
,
sequenceStartPositions
,
batch
,
sequenceWidth
,
maxSequenceLength
,
numSequences
);
sequence
,
sequenceStartPositions
,
sequenceWidth
,
maxSequenceLength
,
numSequences
);
}
}
}
else
{
}
else
{
/* batch -> sequence */
/* batch -> sequence */
if
(
normByTimes
)
{
if
(
normByTimes
)
{
KeSequence2BatchPadding
<
1
,
0
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
KeSequence2BatchPadding
<
1
,
0
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
batch
,
sequence
,
sequenceStartPositions
,
batch
,
sequenceWidth
,
maxSequenceLength
,
numSequences
);
sequence
,
sequenceStartPositions
,
sequenceWidth
,
maxSequenceLength
,
numSequences
);
}
else
{
}
else
{
KeSequence2BatchPadding
<
0
,
0
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
KeSequence2BatchPadding
<
0
,
0
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
batch
,
sequence
,
sequenceStartPositions
,
batch
,
sequenceWidth
,
maxSequenceLength
,
numSequences
);
sequence
,
sequenceStartPositions
,
sequenceWidth
,
maxSequenceLength
,
numSequences
);
}
}
}
}
CHECK_SYNC
(
"hl_sequence2batch_copy_padding failed"
);
CHECK_SYNC
(
"hl_sequence2batch_copy_padding failed"
);
}
}
__device__
inline
float
my_rsqrt
(
float
x
)
{
__device__
inline
float
my_rsqrt
(
float
x
)
{
return
rsqrtf
(
x
);
}
return
rsqrtf
(
x
);
}
__device__
inline
double
my_rsqrt
(
double
x
)
{
__device__
inline
double
my_rsqrt
(
double
x
)
{
return
rsqrt
(
x
);
}
return
rsqrt
(
x
);
}
__global__
void
KeSequenceAvgForward
(
real
*
dst
,
__global__
void
KeSequenceAvgForward
(
real
*
dst
,
real
*
src
,
real
*
src
,
...
@@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst,
...
@@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst,
    for (int i = start; i < end; i++) {
      sum += src[i * width + col];
    }
    sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
                                       : sum * my_rsqrt((real)seqLength));
    dst[gid] += sum;
  }
}
...
@@ -349,8 +357,8 @@ void hl_sequence_avg_forward(real* dst,
  CHECK(mode == 0 || mode == 1 || mode == 2)
      << "mode error in hl_sequence_avg_forward!";
  KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
      dst, src, starts, height, width, mode);
  CHECK_SYNC("hl_sequence_avg_forward failed");
}
...
@@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst,
  int seqLength = end - start;
  if (seqLength == 0) return;
  real grad = src[gid];
  grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
                                       : grad * my_rsqrt((real)seqLength));
  for (int i = start; i < end; i++) {
    dst[i * width + col] += grad;
  }
...
@@ -394,7 +402,7 @@ void hl_sequence_avg_backward(real* dst,
  CHECK(mode == 0 || mode == 1 || mode == 2)
      << "mode error in hl_sequence_avg_backward!";
  KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
      dst, src, starts, height, width, mode);
  CHECK_SYNC("hl_sequence_avg_backward failed");
}
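For reference, the `mode` argument threaded through the two functions above selects how a sequence is pooled: 0 averages over the sequence length, 1 takes a plain sum, and 2 scales the sum by 1/sqrt(length). A host-side sketch of the same arithmetic (a hypothetical helper, not part of this commit), handy as a CPU reference when testing the kernels:

#include <cmath>
// CPU reference for one column of hl_sequence_avg_forward.
float poolSequence(const float* x, int len, int mode) {
  float sum = 0;
  for (int i = 0; i < len; ++i) sum += x[i];
  if (mode == 0) return sum / len;     // average
  if (mode == 1) return sum;           // plain sum
  return sum / std::sqrt((float)len);  // mode == 2: sqrt-normalized
}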
paddle/cuda/src/hl_cuda_sparse.cu
(This diff is collapsed.)
paddle/cuda/src/hl_perturbation_util.cu
...
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath>
#include <stdlib.h>
#include "hl_base.h"
#include "hl_cuda.h"
#include "hl_perturbation_util.cuh"
#include "hl_time.h"

#define _USE_MATH_DEFINES
...
@@ -30,10 +29,16 @@ limitations under the License. */
 * centerX, centerY: translation.
 * sourceX, sourceY: output coordinates in the original image.
 */
__device__ void getTranformCoord(int x,
                                 int y,
                                 real theta,
                                 real scale,
                                 real tgtCenter,
                                 real imgCenter,
                                 real centerR,
                                 real centerC,
                                 int* sourceX,
                                 int* sourceY) {
  real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};

  // compute coornidates in the rotated and scaled image
...
@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
 * created by Wei Xu (genome), converted by Jiang Wang
 */
__global__ void kSamplingPatches(const real* imgs,
                                 real* targets,
                                 int imgSize,
                                 int tgtSize,
                                 const int channels,
                                 int samplingRate,
                                 const real* thetas,
                                 const real* scales,
                                 const int* centerRs,
                                 const int* centerCs,
                                 const real padValue,
                                 const int numImages) {
  const int caseIdx = blockIdx.x * 4 + threadIdx.x;
  const int pxIdx = blockIdx.y * 128 + threadIdx.y;
...
@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
  const int pxY = pxIdx / tgtSize;

  int srcPxX, srcPxY;
  getTranformCoord(pxX,
                   pxY,
                   thetas[imgIdx],
                   scales[imgIdx],
                   tgtCenter,
                   imgCenter,
                   centerCs[caseIdx],
                   centerRs[caseIdx],
                   &srcPxX,
                   &srcPxY);

  imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
...
@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
 *
 * created by Wei Xu
 */
void hl_generate_disturb_params(real*& gpuAngle,
                                real*& gpuScaleRatio,
                                int*& gpuCenterR,
                                int*& gpuCenterC,
                                int numImages,
                                int imgSize,
                                real rotateAngle,
                                real scaleRatio,
                                int samplingRate,
                                bool isTrain) {
  // The number of output samples.
  int numPatches = numImages * samplingRate;
...
@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
  for (int i = 0; i < numImages; i++) {
    r_angle[i] =
        (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0)  // NOLINT
                                        - 0.5);
    s_ratio[i] =
        1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio;  // NOLINT
  }
...
@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
    int pxY =
        (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0));  // NOLINT

    const real H[4] = {cos(-r_angle[i]),
                       -sin(-r_angle[i]),
                       sin(-r_angle[i]),
                       cos(-r_angle[i])};
    real x = pxX - imgCenter;
    real y = pxY - imgCenter;
    real xx = H[0] * x + H[1] * y;
...
@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
  delete[] center_c;
}
void hl_conv_random_disturb_with_params(const real* images,
                                        int imgSize,
                                        int tgtSize,
                                        int channels,
                                        int numImages,
                                        int samplingRate,
                                        const real* gpuRotationAngle,
                                        const real* gpuScaleRatio,
                                        const int* gpuCenterR,
...
@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
  dim3 threadsPerBlock(4, 128);
  dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));

  kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
                                                   target,
                                                   imgSize,
                                                   tgtSize,
                                                   channels,
                                                   samplingRate,
                                                   gpuRotationAngle,
                                                   gpuScaleRatio,
                                                   gpuCenterR,
                                                   gpuCenterC,
                                                   paddingValue,
                                                   numImages);
  hl_device_synchronize();
}

void hl_conv_random_disturb(const real* images,
                            int imgSize,
                            int tgtSize,
                            int channels,
                            int numImages,
                            real scaleRatio,
                            real rotateAngle,
                            int samplingRate,
                            real* gpu_r_angle,
                            real* gpu_s_ratio,
                            int* gpu_center_r,
                            int* gpu_center_c,
                            int paddingValue,
                            bool isTrain,
                            real* targets) {
  // generate the random disturbance sequence and the sampling locations
  hl_generate_disturb_params(gpu_r_angle,
                             gpu_s_ratio,
                             gpu_center_r,
                             gpu_center_c,
                             numImages,
                             imgSize,
                             rotateAngle,
                             scaleRatio,
                             samplingRate,
                             isTrain);

  hl_conv_random_disturb_with_params(images,
                                     imgSize,
                                     tgtSize,
                                     channels,
                                     numImages,
                                     samplingRate,
                                     gpu_r_angle,
                                     gpu_s_ratio,
                                     gpu_center_r,
                                     gpu_center_r,
                                     paddingValue,
                                     targets);
}
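The sampling above works backwards: for each output pixel it asks which source pixel the rotation and scale came from, which is why getTranformCoord builds H from -theta (the inverse rotation). A standalone sketch of that inverse mapping; the exact center and scale conventions here are assumptions for illustration, not taken from the diff:

#include <cmath>
// Sketch: map an output pixel (x, y) back to the source image, mirroring
// H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)} above.
void inverseMap(float x, float y, float theta, float scale, float center,
                float* srcX, float* srcY) {
  float c = cosf(-theta), s = sinf(-theta);
  float dx = x - center, dy = y - center;
  *srcX = (c * dx - s * dy) / scale + center;  // rotate by -theta, undo scale
  *srcY = (s * dx + c * dy) / scale + center;
}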
paddle/cuda/src/hl_table_apply.cu
...
@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_base.h"
#include "hl_base.h"
#include "hl_device_functions.cuh"
#include "hl_cuda.h"
#include "hl_cuda.h"
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Logging.h"
template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
                                int ldo,
                                real* table,
                                int ldt,
                                int* ids,
                                int numSamples,
                                int tableSize,
...
@@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  while (idy < numSamples) {
    int tableId = ids[idy];
    if ((0 <= tableId) && (tableId < tableSize)) {
      real* out = output + idy * ldo;
      real* tab = table + tableId * ldt;
      for (int i = idx; i < dim; i += blockDimX) {
        if (AddRow) {
          paddle::paddleAtomicAdd(&tab[i], out[i]);
...
@@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  }
}
void hl_matrix_select_rows(real* output,
                           int ldo,
                           real* table,
                           int ldt,
                           int* ids,
                           int numSamples,
                           int tableSize,
...
@@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, ldo, table, ldt, ids, numSamples, tableSize, dim);

  CHECK_SYNC("hl_matrix_select_rows failed");
}
void hl_matrix_add_to_rows(real* table,
                           int ldt,
                           real* input,
                           int ldi,
                           int* ids,
                           int numSamples,
                           int tableSize,
...
@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
      input, ldi, table, ldt, ids, numSamples, tableSize, dim);

  CHECK_SYNC("hl_matrix_add_to_rows failed");
}
template <class T, int blockDimX, int gridDimX>
__global__ void KeVectorSelect(
    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
  int idx = threadIdx.x + blockDimX * blockIdx.x;
  while (idx < sizei) {
    int index = ids[idx];
...
@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
}
template <class T>
void hl_vector_select_from(
    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
  CHECK_NOTNULL(dst);
  CHECK_NOTNULL(src);
  CHECK_NOTNULL(ids);
...
@@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized,
  dim3 threads(512, 1);
  dim3 grid(8, 1);
  KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
      dst, sized, src, sizes, ids, sizei);

  CHECK_SYNC("hl_vector_select_from failed");
}

template void hl_vector_select_from(real* dst,
                                    int sized,
                                    const real* src,
                                    int sizes,
                                    const int* ids,
                                    int sizei);
template void hl_vector_select_from(int* dst,
                                    int sized,
                                    const int* src,
                                    int sizes,
                                    const int* ids,
                                    int sizei);
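The entry points above form a gather/scatter pair: hl_matrix_select_rows copies table[ids[i]] into row i of output, while hl_matrix_add_to_rows accumulates rows back into the table, using paddle::paddleAtomicAdd so that duplicate ids do not race. A hypothetical CPU analogue of the gather half, for intuition only:

// CPU sketch of hl_matrix_select_rows: out[i] = table[ids[i]].
void selectRowsRef(float* out, int ldo, const float* table, int ldt,
                   const int* ids, int numSamples, int dim) {
  for (int i = 0; i < numSamples; ++i)
    for (int d = 0; d < dim; ++d)
      out[i * ldo + d] = table[ids[i] * ldt + d];
}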
paddle/cuda/src/hl_top_k.cu
...
@@ -12,45 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_base.h"
#include "hl_base.h"
#include "hl_top_k.h"
#include "hl_sparse.ph"
#include "hl_sparse.ph"
#include "hl_top_k.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Logging.h"
// using namespace hppl;
// using namespace hppl;
struct Pair {
  __device__ __forceinline__ Pair() {}

  __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {}

  __device__ __forceinline__ void set(real value, int id) {
    v_ = value;
    id_ = id;
  }

  __device__ __forceinline__ void operator=(const Pair& in) {
    v_ = in.v_;
    id_ = in.id_;
  }

  __device__ __forceinline__ bool operator<(const real value) const {
    return (v_ < value);
  }

  __device__ __forceinline__ bool operator<(const Pair& in) const {
    return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_));
  }

  __device__ __forceinline__ bool operator>(const Pair& in) const {
    return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_));
  }
...
@@ -58,8 +50,9 @@ struct Pair {
  int id_;
};
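Note the tie-break in the comparators above: at equal values, the pair holding the larger id compares smaller, so among equal scores the top-k beam keeps the smaller index. A host-side sketch of the same ordering rule:

// Sketch of Pair's ordering: same value, smaller id compares greater.
struct PairRef { float v; int id; };
bool lessThan(const PairRef& a, const PairRef& b) {
  return (a.v < b.v) || ((a.v == b.v) && (a.id > b.id));
}
// lessThan({0.5f, 7}, {0.5f, 3}) == true: index 3 outranks index 7.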
__device__ __forceinline__ void addTo(Pair topK[],
                                      const Pair& p,
                                      int beamSize) {
  for (int k = beamSize - 2; k >= 0; k--) {
    if (topK[k] < p) {
      topK[k + 1] = topK[k];
...
@@ -71,9 +64,8 @@ void addTo(Pair topK[], const Pair &p, int beamSize) {
  topK[0] = p;
}
template <int beamSize>
__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) {
  for (int k = beamSize - 2; k >= 0; k--) {
    if (topK[k] < p) {
      topK[k + 1] = topK[k];
...
@@ -85,9 +77,9 @@ void addTo(Pair topK[], const Pair &p) {
  topK[0] = p;
}
template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* src, int idx, int dim, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < src[idx]) {
      Pair tmp(src[idx], idx);
...
@@ -97,10 +89,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) {
  }
}
template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < src[idx]) {
      Pair tmp(src[idx], idx);
...
@@ -112,10 +103,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim,
  }
}
template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* val, int* col, int idx, int dim, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < val[idx]) {
      Pair tmp(val[idx], col[idx]);
...
@@ -125,10 +115,14 @@ void getTopK(Pair topK[], real *val, int *col,
  }
}
template <int blockSize>
__device__ __forceinline__ void getTopK(Pair topK[],
                                        real* val,
                                        int* col,
                                        int idx,
                                        int dim,
                                        const Pair& max,
                                        int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < val[idx]) {
      Pair tmp(val[idx], col[idx]);
...
@@ -140,12 +134,16 @@ void getTopK(Pair topK[], real *val, int *col, int idx, int dim,
  }
}
template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
                                              int& beam,
                                              int beamSize,
                                              real* src,
                                              bool& firstStep,
                                              bool& isEmpty,
                                              Pair& max,
                                              int dim,
                                              const int tid) {
  if (beam > 0) {
    int length = beam < beamSize ? beam : beamSize;
    if (firstStep) {
...
@@ -160,8 +158,7 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
      }
    }
    if (!isEmpty) {
      getTopK<blockSize>(topK + maxLength - beam, src, tid, dim, max, length);
    }
  }
...
@@ -171,12 +168,17 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
  }
}
template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
                                              int& beam,
                                              int beamSize,
                                              real* val,
                                              int* col,
                                              bool& firstStep,
                                              bool& isEmpty,
                                              Pair& max,
                                              int dim,
                                              const int tid) {
  if (beam > 0) {
    int length = beam < beamSize ? beam : beamSize;
    if (firstStep) {
...
@@ -191,8 +193,8 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
      }
    }
    if (!isEmpty) {
      getTopK<blockSize>(
          topK + maxLength - beam, val, col, tid, dim, max, length);
    }
  }
...
@@ -202,12 +204,16 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
  }
}
template <int maxLength, int blockSize>
__device__ __forceinline__ void blockReduce(Pair* shTopK,
                                            int* maxId,
                                            Pair topK[],
                                            real** topVal,
                                            int** topIds,
                                            int& beam,
                                            int& beamSize,
                                            const int tid,
                                            const int warp) {
  while (true) {
    __syncthreads();
    if (tid < blockSize / 2) {
...
@@ -218,7 +224,7 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
      }
    }
    __syncthreads();
    for (int stride = blockSize / 4; stride > 0; stride = stride / 2) {
      if (tid < stride) {
        if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) {
          maxId[tid] = maxId[tid + stride];
...
@@ -257,10 +263,12 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
 * 3. go to the second setp, until one thread's topK value is null;
 * 4. go to the first setp, until get the topK value.
 */
template <int maxLength, int blockSize>
__global__ void KeMatrixTopK(real* topVal,
                             int ldv,
                             int* topIds,
                             real* src,
                             int lds,
                             int dim,
                             int beamSize) {
  __shared__ Pair shTopK[blockSize];
...
@@ -281,18 +289,19 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
    topK[k].set(-HL_FLOAT_MAX, -1);
  }
  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }
}
template <int maxLength, int blockSize>
__global__ void KeSMatrixTopK(real* topVal,
                              int ldv,
                              int* topIds,
                              real* val,
                              int* row,
                              int* col,
...
@@ -330,18 +339,20 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
    topK[k].set(-HL_FLOAT_MAX, -1);
  }
  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }
}
void hl_matrix_top_k(real* topVal,
                     int ldv,
                     int* topIds,
                     real* src,
                     int lds,
                     int dim,
                     int beamSize,
                     int numSamples) {
...
@@ -353,33 +364,32 @@ void hl_matrix_top_k(real* topVal, int ldv,
  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, src, lds, dim, beamSize);

  CHECK_SYNC("hl_matrix_top_k failed");
}

void hl_sparse_matrix_top_k(real* topVal,
                            int ldv,
                            int* topIds,
                            hl_sparse_matrix_s src,
                            int beamSize,
                            int numSamples) {
  CHECK_NOTNULL(topVal);
  CHECK_NOTNULL(topIds);
  CHECK_NOTNULL(src);
  CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!";

  hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
  if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) {
    LOG(FATAL) << "parameter src is null!";
  }

  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeSMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);

  CHECK_SYNC("hl_sparse_matrix_top_k failed");
}
...
@@ -392,10 +402,12 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv,
 * 3. go to the second setp, until one thread's topK value is null;
 * 4. go to the first setp, until get the topK value.
 */
template <int maxLength, int blockSize>
__global__ void KeMatrixTopKClassificationError(real* topVal,
                                                int ldv,
                                                int* topIds,
                                                real* src,
                                                int lds,
                                                int dim,
                                                int beamSize,
                                                int* label,
...
@@ -420,12 +432,12 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
  }
  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }
  __syncthreads();
...
@@ -440,9 +452,11 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
  }
}
void hl_matrix_classification_error(real* topVal,
                                    int ldv,
                                    int* topIds,
                                    real* src,
                                    int lds,
                                    int dim,
                                    int topkSize,
                                    int numSamples,
...
@@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv,
  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeMatrixTopKClassificationError<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);

  CHECK_SYNC("hl_matrix_top_k classification error failed");
}
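Putting the dense path above together: hl_matrix_top_k launches one 256-thread block per sample (grid(numSamples, 1)), each thread keeping a 5-entry private beam (KeMatrixTopK<5, 256>). A hedged host-side sketch of a call, with placeholder device buffers:

// Sketch, not from this commit. Shapes follow the signature above:
// topVal/topIds are numSamples x beamSize (leading dimension ldv),
// src is numSamples x dim (leading dimension lds).
const int numSamples = 32, dim = 1000, beamSize = 5;
real* topVal;  // device buffer, numSamples * beamSize
int* topIds;   // device buffer, numSamples * beamSize
real* src;     // device buffer, numSamples * dim
hl_matrix_top_k(topVal, beamSize, topIds, src, dim, dim, beamSize, numSamples);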
paddle/framework/attr_type.proto
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
package paddle.framework;

// Attribute Type for paddle's Op.
...
paddle/framework/op_desc.proto
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
package paddle.framework;

import "attr_type.proto";
...
paddle/framework/op_proto.proto
...
@@ -15,10 +15,11 @@ limitations under the License. */
// Protocol Message for 3rd-party language binding.
//
// Paddle Python package will use `OpProto` to generate op creation methods.
// The op creation methods take user's input and generate `OpDesc` proto
// message,
// then pass `OpDesc` to C++ side and create Op pointer.
//
syntax = "proto2";
package paddle.framework;

import "attr_type.proto";
...
@@ -32,13 +33,14 @@ message AttrProto {
  // Supported attribute type.
  required AttrType type = 2;

  // Supported attribute comments. It helps 3rd-party language generate
  // doc-string.
  required string comment = 3;

  // If that attribute is generated, it means the Paddle third language
  // binding has responsibility to fill that attribute. End-User should
  // not set that attribute.
  optional bool generated = 4 [ default = false ];
}

// Input or output message for 3rd-party language binding.
...
@@ -48,7 +50,8 @@ message VarProto {
  // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names.
  required string name = 1;

  // The comment for that input. It helps 3rd-party language generate
  // doc-string.
  required string comment = 2;

  // Is that input/output could be a list or not.
...
@@ -70,7 +73,7 @@ message VarProto {
  //   }
  // }
  //
  optional bool multiple = 3 [ default = false ];

  // It marks that output is a temporary output. That output is not used by
  // user, but used by other op internally as input. If other op is not use
...
@@ -83,7 +86,7 @@ message VarProto {
  //   attrs = {
  //     "temporary_index": [1]
  //   }
  optional bool temporary = 4 [ default = false ];

  // The gradient of operator can be ignored immediately
  // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2
...
@@ -110,5 +113,4 @@ message OpProto {
  // The type of that Op.
  required string type = 5;
}
paddle/function/ContextProjectionOpGpu.cu
...
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_base.h"
#include "ContextProjectionOp.h"
#include "ContextProjectionOp.h"
#include "hl_base.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input,
  int block_size = blockDim.x;
  int sequenceId = blockIdx.x;
  int seq_start = sequence[sequenceId];
  int seq_end = sequence[sequenceId + 1];
  real value = 0;

  int instances = seq_end - seq_start + context_length - 1;
...
@@ -50,7 +50,8 @@ __global__ void KeContextProjectionForward(const real* input,
      if (padding) {
        value =
            weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
                       input_dim +
                   idx];
      } else {
        continue;
      }
...
@@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input,
  dim3 grid(blocks_x, blocks_y);

  if (weight) {
    KeContextProjectionForward<true><<<grid, threads, 0, STREAM_DEFAULT>>>(
        input,
        sequence,
        weight,
        output,
        input_dim,
        context_length,
        context_start,
        begin_pad);
  } else {
    KeContextProjectionForward<false><<<grid, threads, 0, STREAM_DEFAULT>>>(
        input,
        sequence,
        weight,
        output,
        input_dim,
        context_length,
        context_start,
        begin_pad);
  }
  CHECK_SYNC("hl_context_projection_forward failed");
}
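The <true>/<false> template argument in the launches above bakes the padding decision into the kernel at compile time, so the inner if (padding) branch is constant-folded away in each instantiation rather than evaluated per element. The same pattern in miniature (a sketch, not code from this commit):

// Compile-time flag pattern used by KeContextProjectionForward: each
// instantiation is a distinct kernel with the branch resolved statically.
template <bool padding>
__global__ void KeExample(real* out, const real* weight, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n && padding) {
    out[i] += weight[i];  // only compiled into the <true> instantiation
  }
}
// host side: weight ? KeExample<true><<<g, b>>>(out, weight, n)
//                   : KeExample<false><<<g, b>>>(out, weight, n);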
...
@@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
  int block_size = blockDim.x;
  int sequenceId = blockIdx.x;
  int seq_start = sequence[sequenceId];
  int seq_end = sequence[sequenceId + 1];
  real value = 0;

  int instances = seq_end - seq_start + context_length - 1;
...
@@ -211,8 +224,8 @@ void hl_context_projection_backward_data(const real* out_grad,
  int blocks_y = 1;
  dim3 threads(block_size, 1);
  dim3 grid(blocks_x, blocks_y);
  KeContextProjectionBackwardData<<<grid, threads, 0, STREAM_DEFAULT>>>(
      out_grad, sequence, input_grad, input_dim, context_length, context_start);
  CHECK_SYNC("hl_context_projection_backward_data failed");
}
...
@@ -231,7 +244,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
      context_start);
}

template <int THREADS_X, int THREADS_Y>
__global__ void KeContextProjectionBackwardWeight(const real* out_grad,
                                                  const int* sequence,
                                                  real* w_grad,
...
@@ -254,17 +267,17 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
  if (weight_idx < w_dim) {
    for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
      int seq_start = sequence[seqId];
      int seq_end = sequence[seqId + 1];
      output_r =
          const_cast<real*>(out_grad) + seq_start * w_dim * context_length;

      if (context_start < 0) {
        if (padId + context_start < 0) {
          instanceId = padId;
        } else {
          // begin_pad > 0;
          instanceId =
              (padId - begin_pad) + (seq_end - seq_start) - context_start;
        }
      } else {
        if (padId + (seq_end - seq_start) < context_start) {
...
@@ -275,10 +288,11 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
        }
      }
      int outx = (instanceId - context_length) < 0 ? instanceId
                                                   : (context_length - 1);
      int outy = (instanceId - context_length) < 0
                     ? 0
                     : (instanceId - (context_length - 1));
      output_r += outy * w_dim * context_length + outx * w_dim;
      for (int j = outy; j < seq_end - seq_start; j++) {
        value += output_r[weight_idx];
...
@@ -290,7 +304,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
    }
    __syncthreads();

    for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) {
      if (idy < stride) {
        sum_s[idy][idx] += sum_s[idy + stride][idx];
      }
...
@@ -339,16 +353,21 @@ void hl_context_projection_backward_weight(const real* out_grad,
  dim3 threads(threads_x, threads_y);
  dim3 grid(blocks_x, 1);
  KeContextProjectionBackwardWeight<32,
                                    32><<<grid, threads, 0, STREAM_DEFAULT>>>(
      out_grad,
      sequence,
      w_grad,
      num_sequences,
      w_dim,
      context_length,
      context_start,
      begin_pad);
  CHECK_SYNC("hl_context_projection_backward_weight failed");
}
template <>
void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
    const GpuMatrix& out_grad,
    GpuMatrix& w_grad,
    const GpuIVector& seq_vec,
    size_t context_length,
...
@@ -378,15 +397,10 @@ void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                size_t total_pad) {
  if (in_grad) {
    ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
        out_grad, in_grad, sequence, context_length, context_start);
  }
  if (is_padding && w_grad) {
    ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(out_grad,
                                                     w_grad,
                                                     sequence,
                                                     context_length,
...
paddle/function/CosSimOpGpu.cu
...
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "CosSimOp.h"
#include "hl_base.h"
#include "hl_base.h"
#include "hl_device_functions.cuh"
#include "hl_device_functions.cuh"
#include "CosSimOp.h"
namespace
paddle
{
namespace
paddle
{
template <int block_size>
__global__ void KeCosSim(real* output,
                         const real* input1,
                         const real* input2,
...
@@ -78,8 +78,8 @@ void hlCossim(real* output,
  dim3 threads(block_size, 1);
  dim3 grid(1, input1_height);
  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, input1, input2, width, input1_height, input2_height, scale);
  CHECK_SYNC("hlCossim failed");
}
...
@@ -99,7 +99,7 @@ void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
  hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale);
}
template <int block_size>
__global__ void KeCosSimDerivative(const real* grad,
                                   const real* output,
                                   const real* prev_out_x,
...
@@ -148,13 +148,12 @@ __global__ void KeCosSimDerivative(const real* grad,
  if (xy[0] == 0) {
    real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
    for (int index = tid; index < width; index += block_size) {
      prev_grad_x[index] += scale * grad[ty] * prev_out_y[index] * reciprocal;
      if (input2_height > 1) {
        prev_grad_y[index] += scale * grad[ty] * prev_out_x[index] * reciprocal;
      } else {
        paddle::paddleAtomicAdd(
            prev_grad_y + index,
            scale * grad[ty] * prev_out_x[index] * reciprocal);
      }
    }
...
@@ -163,16 +162,17 @@ __global__ void KeCosSimDerivative(const real* grad,
    real reciprocalSquareSumX = 1.0 / xx[0];
    real reciprocalSquareSumY = 1.0 / yy[0];
    for (int index = tid; index < width; index += block_size) {
      prev_grad_x[index] +=
          output[ty] * grad[ty] * (prev_out_y[index] * reciprocalXY -
                                   prev_out_x[index] * reciprocalSquareSumX);
      if (input2_height > 1) {
        prev_grad_y[index] +=
            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
                                     prev_out_y[index] * reciprocalSquareSumY);
      } else {
        paddle::paddleAtomicAdd(
            prev_grad_y + index,
            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
                                     prev_out_y[index] * reciprocalSquareSumY));
      }
    }
...
@@ -198,9 +198,17 @@ void hlCossimDerivative(const real* grad,
  const int block_size = 256;
  dim3 threads(block_size, 1);
  dim3 grid(1, input1_height);
  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad,
      output,
      prev_out_x,
      prev_out_y,
      prev_grad_x,
      prev_grad_y,
      width,
      input1_height,
      input2_height,
      scale);
  CHECK_SYNC("hlCossimDerivate failed");
}
...
@@ -214,8 +222,8 @@ void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                     real scale) {
  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ &&
        in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
      << "Matrix types are not equally GPU";

  size_t dim = in1_val.getWidth();
...
paddle/function/CropOpGpu.cu
...
@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_base.h"
#include "CropOp.h"
#include "CropOp.h"
#include "hl_base.h"
namespace
paddle
{
namespace
paddle
{
__global__ void KeCrop(real* outputs,
                       const real* inputs,
                       int inC,
                       int inH,
                       int inW,
                       int cropC,
                       int cropH,
                       int cropW,
                       int outC,
                       int outH,
                       int outW,
                       int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % outW;
...
@@ -58,16 +66,33 @@ void Crop<DEVICE_TYPE_GPU>(real* outputs,
  int blockSize = 1024;
  int gridSize = (nth + blockSize - 1) / blockSize;
  KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(outputs,
                                                     inputs,
                                                     inC,
                                                     inH,
                                                     inW,
                                                     cropC,
                                                     cropH,
                                                     cropW,
                                                     outC,
                                                     outH,
                                                     outW,
                                                     nth);
  CHECK_SYNC("Crop");
}
__global__
void
KeCropDiff
(
const
real
*
inGrad
,
real
*
outGrad
,
__global__
void
KeCropDiff
(
const
real
*
inGrad
,
int
inC
,
int
inH
,
int
inW
,
real
*
outGrad
,
int
cropC
,
int
cropH
,
int
cropW
,
int
inC
,
int
outC
,
int
outH
,
int
outW
,
int
nthreads
)
{
int
inH
,
int
inW
,
int
cropC
,
int
cropH
,
int
cropW
,
int
outC
,
int
outH
,
int
outW
,
int
nthreads
)
{
const
int
idx
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
const
int
idx
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
if
(
idx
<
nthreads
)
{
if
(
idx
<
nthreads
)
{
const
int
w
=
idx
%
inW
;
const
int
w
=
idx
%
inW
;
...
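A note on orientation: in the backward kernel the thread index runs over inGrad, the gradient of the cropped output (hence const int w = idx % inW, using the in* extents), and each element is presumably accumulated into the matching interior offset of the larger outGrad buffer, mirroring the forward gather; the signature swap (inGrad const, outGrad written) is consistent with that reading.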
@@ -107,9 +132,18 @@ void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
   int blockSize = 1024;
   int gridSize = (nth + blockSize - 1) / blockSize;
-  KeCropDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-    (inGrad, outGrad, inC, inH, inW, cropC, cropH, cropW,
-     outC, outH, outW, nth);
+  KeCropDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(inGrad,
+                                                         outGrad,
+                                                         inC,
+                                                         inH,
+                                                         inW,
+                                                         cropC,
+                                                         cropH,
+                                                         cropW,
+                                                         outC,
+                                                         outH,
+                                                         outW,
+                                                         nth);
   CHECK_SYNC("CropGrad");
 }
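Both launches size the grid with the usual ceiling division, gridSize = (nth + blockSize - 1) / blockSize, so the grid covers every element even when nth is not a multiple of the block size; for example, nth = 2500 with blockSize = 1024 gives gridSize = 3 (3072 threads), and the if (idx < nthreads) guard inside the kernels discards the 572 surplus threads.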
paddle/function/CrossMapNormalOpGpu.cu
@@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_base.h"
 #include "CrossMapNormalOp.h"
+#include "hl_base.h"

 namespace paddle {

-__global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
-                                   real* scale, size_t channels,
-                                   size_t height, size_t width, size_t size,
-                                   real alpha) {
+__global__ void KeCMRNormFillScale(size_t imageSize,
+                                   const real* in,
+                                   real* scale,
+                                   size_t channels,
+                                   size_t height,
+                                   size_t width,
+                                   size_t size,
+                                   real alpha) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < imageSize) {

@@ -51,8 +55,10 @@ __global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
   }
 }

-__global__ void KeCMRNormOutput(size_t inputSize, const real* in,
-                                const real* scale, real negative_beta,
-                                real* out) {
+__global__ void KeCMRNormOutput(size_t inputSize,
+                                const real* in,
+                                const real* scale,
+                                real negative_beta,
+                                real* out) {
   const int index = threadIdx.x + blockIdx.x * blockDim.x;
   if (index < inputSize) {

@@ -74,24 +80,30 @@ void CrossMapNormal<DEVICE_TYPE_GPU>(real* outputs,
   size_t imageSize = numSamples * height * width;
   int blockSize = 1024;
   int gridSize = (imageSize + 1024 - 1) / 1024;
-  KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-    (imageSize, inputs, denoms, channels, height, width, size, scale);
+  KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      imageSize, inputs, denoms, channels, height, width, size, scale);

   size_t inputSize = numSamples * height * width * channels;
   blockSize = 1024;
   gridSize = (inputSize + 1024 - 1) / 1024;
-  KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-    (inputSize, inputs, denoms, -pow, outputs);
+  KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      inputSize, inputs, denoms, -pow, outputs);
   CHECK_SYNC("CrossMapNormal");
 }

-__global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
-                              const real* top_data, const real* scale,
-                              const real* top_diff, size_t channels,
-                              size_t height, size_t width, size_t size,
-                              real negative_beta, real cache_ratio,
-                              real* bottom_diff) {
+__global__ void KeCMRNormDiff(size_t imageSize,
+                              const real* bottom_data,
+                              const real* top_data,
+                              const real* scale,
+                              const real* top_diff,
+                              size_t channels,
+                              size_t height,
+                              size_t width,
+                              size_t size,
+                              real negative_beta,
+                              real cache_ratio,
+                              real* bottom_diff) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < imageSize) {
     const int w = idx % width;

@@ -122,8 +134,8 @@ __global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
       if (index >= post_pad) {
         bottom_diff[(index - post_pad) * step] +=
             top_diff[(index - post_pad) * step] *
-            pow(scale[(index - post_pad) * step], negative_beta) -
-            cache_ratio * bottom_data[(index - post_pad) * step] * accum;
+                pow(scale[(index - post_pad) * step], negative_beta) -
+            cache_ratio * bottom_data[(index - post_pad) * step] * accum;
       }
       ++index;
     }

@@ -147,9 +159,18 @@ void CrossMapNormalGrad<DEVICE_TYPE_GPU>(real* inputsGrad,
   int blockSize = 1024;
   int gridSize = (imageSize + 1024 - 1) / 1024;
-  KeCMRNormDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-    (imageSize, inputsValue, outputsValue, denoms, outputsGrad, channels,
-     height, width, size, -pow, 2.0f * pow * scale, inputsGrad);
+  KeCMRNormDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(imageSize,
+                                                            inputsValue,
+                                                            outputsValue,
+                                                            denoms,
+                                                            outputsGrad,
+                                                            channels,
+                                                            height,
+                                                            width,
+                                                            size,
+                                                            -pow,
+                                                            2.0f * pow * scale,
+                                                            inputsGrad);
   CHECK_SYNC("CrossMapNormalGrad");
 }
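For orientation, these kernels implement cross-map (local response) normalization. Assuming scale and pow at the call sites are the α and β of the usual formulation (the names and the cache_ratio = 2.0f * pow * scale argument suggest so), the forward pass computes, per element i with a window across the channel axis,

$$d_i = 1 + \alpha \sum_{j \in \mathrm{win}(i)} x_j^2, \qquad y_i = x_i\, d_i^{-\beta},$$

and differentiating gives the shape of the backward kernel, with g the incoming gradient:

$$\frac{\partial L}{\partial x_i} = g_i\, d_i^{-\beta} - 2\alpha\beta\, x_i \sum_{j \in \mathrm{win}(i)} \frac{g_j\, y_j}{d_j},$$

which matches the visible KeCMRNormDiff update: top_diff * pow(scale, negative_beta) minus cache_ratio * bottom_data * accum, with negative_beta = -β and cache_ratio = 2αβ.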
paddle/function/DepthwiseConvOpGpu.cu
@@ -20,17 +20,25 @@ namespace paddle {
 // CUDA kernel to compute the depthwise convolution forward pass
 template <class T>
 __global__
-void ConvolutionDepthwiseForward(const int nthreads,
-    const T* const inputData, const T* const filterData,
-    const int batchSize, const int outputChannels, const int outputHeight,
-    const int outputWidth, const int inputChannels, const int inputHeight,
-    const int inputWidth, const int filterMultiplier, const int filterHeight,
-    const int filterWidth, const int strideH, const int strideW,
-    const int paddingH, const int paddingW, T* const outputData) {
-  int index =
-    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+void ConvolutionDepthwiseForward(const int nthreads,
+                                 const T* const inputData,
+                                 const T* const filterData,
+                                 const int batchSize,
+                                 const int outputChannels,
+                                 const int outputHeight,
+                                 const int outputWidth,
+                                 const int inputChannels,
+                                 const int inputHeight,
+                                 const int inputWidth,
+                                 const int filterMultiplier,
+                                 const int filterHeight,
+                                 const int filterWidth,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 T* const outputData) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     const int batch = index / outputChannels / outputHeight / outputWidth;

@@ -45,14 +53,16 @@ void ConvolutionDepthwiseForward(const int nthreads,
     const int w_in_start = -paddingW + w_out * strideW;
     const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1;
     const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1;
-    if ((h_in_start >= 0) && (h_in_end < inputHeight)
-        && (w_in_start >= 0) && (w_in_end < inputWidth)) {
+    if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) &&
+        (w_in_end < inputWidth)) {
       for (int kh = 0; kh < filterHeight; ++kh) {
         for (int kw = 0; kw < filterWidth; ++kw) {
           const int h_in = -paddingH + h_out * strideH + kh;
           const int w_in = -paddingW + w_out * strideW + kw;
-          const int offset = ((batch * inputChannels + c_in)
-              * inputHeight + h_in) * inputWidth + w_in;
+          const int offset =
+              ((batch * inputChannels + c_in) * inputHeight + h_in) *
+                  inputWidth +
+              w_in;
           value += (*weight) * inputData[offset];
           ++weight;
         }

@@ -62,10 +72,12 @@ void ConvolutionDepthwiseForward(const int nthreads,
       for (int kw = 0; kw < filterWidth; ++kw) {
         const int h_in = -paddingH + h_out * strideH + kh;
         const int w_in = -paddingW + w_out * strideW + kw;
-        if ((h_in >= 0) && (h_in < inputHeight)
-            && (w_in >= 0) && (w_in < inputWidth)) {
-          const int offset = ((batch * inputChannels + c_in)
-              * inputHeight + h_in) * inputWidth + w_in;
+        if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
+            (w_in < inputWidth)) {
+          const int offset =
+              ((batch * inputChannels + c_in) * inputHeight + h_in) *
+                  inputWidth +
+              w_in;
           value += (*weight) * inputData[offset];
         }
         ++weight;

@@ -78,16 +90,25 @@ void ConvolutionDepthwiseForward(const int nthreads,
 // CUDA kernel to compute the depthwise convolution backprop w.r.t input.
 template <class T>
 __global__
-void ConvolutionDepthwiseInputBackward(const int nthreads,
-    const T* const top_diff, const T* const weight_data,
-    const int num, const int outputChannels, const int outputHeight,
-    const int outputWidth, const int inputChannels, const int inputHeight,
-    const int inputWidth, const int filterMultiplier, const int filterHeight,
-    const int filterWidth, const int strideH, const int strideW,
-    const int paddingH, const int paddingW, T* const bottom_diff) {
-  int index =
-    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+void ConvolutionDepthwiseInputBackward(const int nthreads,
+                                       const T* const top_diff,
+                                       const T* const weight_data,
+                                       const int num,
+                                       const int outputChannels,
+                                       const int outputHeight,
+                                       const int outputWidth,
+                                       const int inputChannels,
+                                       const int inputHeight,
+                                       const int inputWidth,
+                                       const int filterMultiplier,
+                                       const int filterHeight,
+                                       const int filterWidth,
+                                       const int strideH,
+                                       const int strideW,
+                                       const int paddingH,
+                                       const int paddingW,
+                                       T* const bottom_diff) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     const int batch = index / inputChannels / inputHeight / inputWidth;
     const int c_in = (index / inputHeight / inputWidth) % inputChannels;

@@ -96,27 +117,29 @@ void ConvolutionDepthwiseInputBackward(const int nthreads,
     const int c_out_start = c_in * filterMultiplier;
     int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH;
     h_out_start = 0 > h_out_start ? 0 : h_out_start;
     int h_out_end = (h_in + paddingH) / strideH;
     h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end;
     int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW;
     w_out_start = 0 > w_out_start ? 0 : w_out_start;
     int w_out_end = (w_in + paddingW) / strideW;
     w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end;
     T value = 0;
     for (int c_out = c_out_start;
          c_out < c_out_start + filterMultiplier;
          c_out++) {
       for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
         const int filter_h = h_in + paddingH - h_out * strideH;
         for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
           const int filter_w = w_in + paddingW - w_out * strideW;
-          const int filter_offset = c_out * filterHeight * filterWidth
-              + filter_h * filterWidth + filter_w;
-          const int top_diff_offset = ((batch * outputChannels + c_out) *
-              outputHeight + h_out) * outputWidth + w_out;
+          const int filter_offset = c_out * filterHeight * filterWidth +
+                                    filter_h * filterWidth + filter_w;
+          const int top_diff_offset =
+              ((batch * outputChannels + c_out) * outputHeight + h_out) *
+                  outputWidth +
+              w_out;
           value += top_diff[top_diff_offset] * weight_data[filter_offset];
         }
       }

@@ -127,34 +150,47 @@ void ConvolutionDepthwiseInputBackward(const int nthreads,
 // CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
 template <class T>
 __global__
-void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
-    const T* const top_diff, const T* const inputData,
-    const int num, const int outputChannels, const int outputHeight,
-    const int outputWidth, const int inputChannels, const int inputHeight,
-    const int inputWidth, const int filterMultiplier, const int filterHeight,
-    const int filterWidth, const int strideH, const int strideW,
-    const int paddingH, const int paddingW, T* const buffer_data) {
-  int index =
-    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+void ConvolutionDepthwiseFilterBackward(const int num_i,
+                                        const int nthreads,
+                                        const T* const top_diff,
+                                        const T* const inputData,
+                                        const int num,
+                                        const int outputChannels,
+                                        const int outputHeight,
+                                        const int outputWidth,
+                                        const int inputChannels,
+                                        const int inputHeight,
+                                        const int inputWidth,
+                                        const int filterMultiplier,
+                                        const int filterHeight,
+                                        const int filterWidth,
+                                        const int strideH,
+                                        const int strideW,
+                                        const int paddingH,
+                                        const int paddingW,
+                                        T* const buffer_data) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     const int h_out = (index / outputWidth) % outputHeight;
     const int w_out = index % outputWidth;
-    const int kh = (index / filterWidth / outputHeight / outputWidth)
-        % filterHeight;
+    const int kh =
+        (index / filterWidth / outputHeight / outputWidth) % filterHeight;
     const int kw = (index / outputHeight / outputWidth) % filterWidth;
     const int h_in = -paddingH + h_out * strideH + kh;
     const int w_in = -paddingW + w_out * strideW + kw;
-    if ((h_in >= 0) && (h_in < inputHeight)
-        && (w_in >= 0) && (w_in < inputWidth)) {
-      const int c_out = index /
-          (filterHeight * filterWidth * outputHeight * outputWidth);
+    if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
+        (w_in < inputWidth)) {
+      const int c_out =
+          index / (filterHeight * filterWidth * outputHeight * outputWidth);
       const int c_in = c_out / filterMultiplier;
       const int batch = num_i;
-      const int top_offset = ((batch * outputChannels + c_out) *
-          outputHeight + h_out) * outputWidth + w_out;
-      const int bottom_offset = ((batch * inputChannels + c_in)
-          * inputHeight + h_in) * inputWidth + w_in;
+      const int top_offset =
+          ((batch * outputChannels + c_out) * outputHeight + h_out) *
+              outputWidth +
+          w_out;
+      const int bottom_offset =
+          ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth +
+          w_in;
       buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset];
     } else {
       buffer_data[index] = 0;

@@ -163,7 +199,7 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
 }

 template <class T>
 class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T> {
 public:
   void operator()(const T* inputData,
                   const T* filterData,

@@ -181,17 +217,16 @@ public:
                   int strideW,
                   int paddingH,
                   int paddingW,
                   T* outputData) {
     int outputSize = batchSize * outputChannels * outputHeight * outputWidth;
     size_t blocks = (outputSize + 1024 - 1) / 1024;
     size_t blockX = 512;
     size_t blockY = (blocks + 512 - 1) / 512;
     dim3 threads(1024, 1);
     dim3 grid(blockX, blockY);
-    ConvolutionDepthwiseForward<T>
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(
+    ConvolutionDepthwiseForward<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
         outputSize,
         inputData,
         filterData,

@@ -214,7 +249,7 @@ public:
 };

 template <class T>
 class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T> {
 public:
   void operator()(const T* outputGrad,
                   const T* filterData,

@@ -232,20 +267,18 @@ public:
                   int strideW,
                   int paddingH,
                   int paddingW,
                   T* inputGrad) {
     int inputSize = batchSize * inputChannels * inputHeight * inputWidth;
     size_t blocks = (inputSize + 1024 - 1) / 1024;
     size_t blockX = 512;
     size_t blockY = (blocks + 512 - 1) / 512;
     dim3 threads(1024, 1);
     dim3 grid(blockX, blockY);
-    ConvolutionDepthwiseInputBackward<T>
-        // NOLINT_NEXT_LINE(whitespace/operators)
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(
-            inputSize,
-            outputGrad,
-            filterData,
-            batchSize,
+    ConvolutionDepthwiseInputBackward<T>
+        // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<grid, threads, 0, STREAM_DEFAULT>>>(inputSize,
+                                               outputGrad,
+                                               filterData,
+                                               batchSize,

@@ -286,22 +319,24 @@ public:
                   int paddingH,
                   int paddingW,
                   T* colData,
                   T* filterGrad) {
-    int colDataSize = outputChannels * filterHeight * filterWidth
-        * outputHeight * outputWidth;
+    int colDataSize = outputChannels * filterHeight * filterWidth *
+                      outputHeight * outputWidth;
     size_t blocks = (colDataSize + 1024 - 1) / 1024;
     size_t blockX = 512;
     size_t blockY = (blocks + 512 - 1) / 512;
     dim3 threads(1024, 1);
     dim3 grid(blockX, blockY);
-    BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth,
-                                1, filterGrad, false, true);
+    BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth,
+                                1,
+                                filterGrad,
+                                false,
+                                true);
     for (int i = 0; i < batchSize; i++) {
-      ConvolutionDepthwiseFilterBackward<T>
-          <<<grid, threads, 0, STREAM_DEFAULT>>>(
+      ConvolutionDepthwiseFilterBackward<
+          T><<<grid, threads, 0, STREAM_DEFAULT>>>(
           i,
           colDataSize,
           outputGrad,
           inputData,
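All three functors here use the same launch recipe: ceil-divide the element count into 1024-thread blocks, then spread the blocks over a two-dimensional grid with x pinned to 512 so that no single grid dimension grows too large; the kernels recover a flat index with (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x. A standalone sketch of that recipe (gridFor2D and KeFill are hypothetical helpers, not part of this file):

#include <cuda_runtime.h>

// Hypothetical helper mirroring the launch math above: n elements,
// 1024 threads per block, grid.x fixed at 512 columns.
inline dim3 gridFor2D(int n, int threadsPerBlock = 1024, int gridX = 512) {
  int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;  // ceil-div
  int gridY = (blocks + gridX - 1) / gridX;                  // ceil-div again
  return dim3(gridX, gridY);
}

__global__ void KeFill(float* data, int n, float v) {
  // Flatten the 2-D grid back to a linear index; gridX * gridY blocks can
  // overshoot the exact block count, so guard against n.
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index < n) data[index] = v;
}

// Usage: KeFill<<<gridFor2D(n), 1024>>>(devPtr, n, 0.f);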
The remaining diffs in this commit are collapsed in the web view:

paddle/function/Im2ColOpGpu.cu
paddle/function/MulOpGpu.cu
paddle/function/PadOpGpu.cu
paddle/function/RowConvOpGpu.cu
paddle/gserver/layers/GruCompute.cu
paddle/gserver/layers/LstmCompute.cu
paddle/math/BaseMatrix.cu
paddle/math/TrainingAlgorithmOp.cu
paddle/math/tests/test_Tensor.cu
paddle/math/tests/test_lazyAssign.cu
paddle/operators/softmax_op.cu
paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto (no preview for this file type)
proto/DataConfig.proto
proto/DataFormat.proto
proto/ModelConfig.proto
proto/OptimizerConfig.proto
proto/ParameterConfig.proto
proto/ParameterServerConfig.proto
proto/ParameterService.proto
proto/TrainerConfig.proto