Commit ecce973d (unverified)

Authored on Sep 30, 2018 by Xin Pan; committed via GitHub on Sep 30, 2018.

Merge pull request #13675 from qingqing01/v1.0.0_deeeplabv3

Optimization of Kernels that related to DeepLabv3+ (#13534)

Parents: 2210a317, 1fb82148

Showing 9 changed files with 817 additions and 188 deletions (+817 / -188).
paddle/fluid/operators/CMakeLists.txt                    +1    -0
paddle/fluid/operators/conv_op.h                         +4    -3
paddle/fluid/operators/conv_transpose_op.h               +4    -3
paddle/fluid/operators/cub_reduce.h                      +322  -0
paddle/fluid/operators/math/depthwise_conv.cu            +323  -156
paddle/fluid/operators/math/depthwise_conv.h             +4    -1
paddle/fluid/operators/reduce_mean_op.cu                 +56   -9
paddle/fluid/operators/reduce_sum_op.cu                  +51   -9
python/paddle/fluid/tests/unittests/test_conv2d_op.py    +52   -7
paddle/fluid/operators/CMakeLists.txt

@@ -301,6 +301,7 @@ op_library(fusion_lstm_op DEPS cpu_lstm_compute)
 if (WITH_GPU)
   op_library(conv_op DEPS vol2col depthwise_conv im2col)
   op_library(layer_norm_op DEPS cub)
+  op_library(reduce_mean_op DEPS cub)
 else()
   op_library(conv_op DEPS vol2col im2col)
 endif()
paddle/fluid/operators/conv_op.h

@@ -380,7 +380,8 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
     math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;

     auto& dev_ctx = context.template device_context<DeviceContext>();
-    depthwiseConv(dev_ctx, *input, filter, strides, paddings, output);
+    depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
+                  output);
   }
 };

@@ -415,14 +416,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> {
       input_grad->mutable_data<T>(context.GetPlace());
       set_zero(dev_ctx, input_grad, static_cast<T>(0));
       depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
-                             paddings, input_grad);
+                             paddings, dilations, input_grad);
     }

     if (filter_grad) {
       filter_grad->mutable_data<T>(context.GetPlace());
       set_zero(dev_ctx, filter_grad, static_cast<T>(0));
       depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
-                              filter_grad);
+                              dilations, filter_grad);
     }
   }
 };
paddle/fluid/operators/conv_transpose_op.h

@@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
     math::DepthwiseConvInputGradFunctor<DeviceContext, T>
         depthwiseConvInputGrad;
     depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings,
-                           output);
+                           dilations, output);
   }
 };

@@ -367,10 +367,11 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
     auto& dev_ctx = context.template device_context<DeviceContext>();
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");

     if (input_grad) {
       math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
-      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings,
+      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations,
                     input_grad);
     }

@@ -382,7 +383,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
       math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
           depthwiseConvFilterGrad;
       depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings,
-                              filter_grad);
+                              dilations, filter_grad);
     }
   }
 };
paddle/fluid/operators/cub_reduce.h (new file, mode 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <cmath>
#include <numeric>
#include <set>
#include <vector>

#include <cub/cub.cuh>  // NOLINT

#include "paddle/fluid/framework/tensor.h"

namespace paddle {
namespace operators {

namespace detail {

template <typename T, size_t ElementCount>
struct Array {
 public:
  HOSTDEVICE inline Array() {}

  HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; }

  HOSTDEVICE inline const T& operator[](size_t index) const {
    return data_[index];
  }

  HOSTDEVICE constexpr inline size_t size() const { return ElementCount; }

  template <typename VectorLikeType>
  static inline Array<T, ElementCount> From(const VectorLikeType& vec) {
    PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match");
    size_t n = static_cast<size_t>(vec.size());
    Array<T, ElementCount> ret;
    for (size_t i = 0; i < n; ++i) ret[i] = vec[i];
    return ret;
  }

 private:
  T data_[ElementCount];
};

// reduce the last axis of 2d array
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
          int BlockDim>
__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
                               TransformOp transformer, Ty init,
                               int reduce_num) {
  __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
  int idx_x = blockIdx.x * reduce_num;
  int idx_y = threadIdx.x;
  Ty reduce_var = init;
  for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim)
    reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y]));

  reduce_var =
      cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);

  if (threadIdx.x == 0) {
    y[blockIdx.x] = reduce_var;
  }
}

template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
          int BlockDim, int Rank, int ReduceRank>
__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
                             TransformOp transformer, Ty init, int reduce_num,
                             Array<int, Rank> x_strides,
                             Array<int, ReduceRank> reduce_dim,
                             Array<int, ReduceRank> reduce_strides,
                             Array<int, Rank - ReduceRank> left_dim,
                             Array<int, Rank - ReduceRank> left_strides) {
  __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
  Array<int, Rank> sub_index;
  int left_idx = blockIdx.x;
  for (int i = 0; i < Rank - ReduceRank; ++i) {
    sub_index[left_dim[i]] = left_idx / left_strides[i];
    left_idx %= left_strides[i];
  }

  int reduce_idx = threadIdx.x;
  for (int j = 0; j < ReduceRank; ++j) {
    sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
    reduce_idx %= reduce_strides[j];
  }

  int idx_x = 0;
  for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
  Ty reduce_var = static_cast<Ty>(transformer(x[idx_x]));

  for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) {
    int reduce_idx = i;
    for (int j = 0; j < ReduceRank; ++j) {
      sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
      reduce_idx %= reduce_strides[j];
    }

    int idx_x = 0;
    for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
    reduce_var = static_cast<Ty>(reducer(reduce_var, transformer(x[idx_x])));
  }

  reduce_var =
      cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);

  if (threadIdx.x == 0) {
    y[blockIdx.x] = reduce_var;
  }
}

static inline std::vector<int> GetStrides(const std::vector<int>& dims) {
  int n = static_cast<int>(dims.size());
  if (n == 0) return std::vector<int>();

  std::vector<int> strides(n);
  strides.back() = 1;
  for (int i = n - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * dims[i + 1];
  }
  return strides;
}

static inline std::vector<int> GetStrides(const std::vector<int>& dims,
                                          const std::vector<int>& idx) {
  int n = static_cast<int>(idx.size());
  if (n == 0) return std::vector<int>();

  std::vector<int> strides(n);
  strides.back() = 1;
  for (int i = n - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * dims[idx[i + 1]];
  }
  return strides;
}

constexpr int kMaxBlockDim = 512;

static inline int GetDesiredBlockDim(int block_dim) {
  return block_dim >= kMaxBlockDim
             ? kMaxBlockDim
             : (1 << static_cast<int>(std::log2(block_dim)));
}

template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
          typename TransformOp>
static void TensorReduceImpl(
    const Tx* x_data, Ty* y_data, const platform::Place& place,
    const ReduceOp& reducer, const TransformOp& transformer, const Ty& init,
    int left_num, int reduce_num, const std::vector<int>& x_strides,
    const std::vector<int>& reduce_dim, const std::vector<int>& reduce_strides,
    const std::vector<int>& left_dim, const std::vector<int>& left_strides,
    cudaStream_t stream) {
#define CUB_RANK_CASE(i, ...)             \
  case i: {                               \
    constexpr auto kRank = i;             \
    switch (reduce_rank) { __VA_ARGS__; } \
  } break

#define CUB_REDUCE_RANK_CASE(i, ...)                              \
  case i: {                                                       \
    constexpr auto kReduceRank = i;                               \
    ReduceKernel<Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank,  \
                 kReduceRank><<<left_num, BlockDim, 0, stream>>>( \
        x_data, y_data, reducer, transformer, init, reduce_num,   \
        Array<int, kRank>::From(x_strides),                       \
        Array<int, kReduceRank>::From(reduce_dim),                \
        Array<int, kReduceRank>::From(reduce_strides),            \
        Array<int, kRank - kReduceRank>::From(left_dim),          \
        Array<int, kRank - kReduceRank>::From(left_strides));     \
  } break

  int rank = x_strides.size();
  int reduce_rank = reduce_strides.size();
  if (rank == reduce_rank) {
    cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
        x_data, transformer);
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
                              reduce_num, reducer, init, stream);
    framework::Tensor tmp;
    auto* temp_storage = tmp.mutable_data<uint8_t>(
        framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
        place);
    cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
                              reduce_num, reducer, init, stream);
    return;
  }
  if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
    ReduceKernel2D<Tx, Ty, ReduceOp, TransformOp,
                   BlockDim><<<left_num, BlockDim, 0, stream>>>(
        x_data, y_data, reducer, transformer, init, reduce_num);
    return;
  }
  /*
  if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) {
    // TODO(liangdun): we can optimize 3d case which the 2nd axis is reduced.
    // Currently, it is handled by code below, but inefficient
    return;
  }
  */

  switch (rank) {
    CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1););

    CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2););

    CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3););

    CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4););

    CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
                  CUB_REDUCE_RANK_CASE(5););

    CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););

    CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););

    CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);
                  CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8););
  }

#undef CUB_REDUCE_RANK_CASE
#undef CUB_RANK_CASE
}

}  // namespace detail

template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
void TensorReduce(const framework::Tensor& x, framework::Tensor* y,
                  std::vector<int> origin_reduce_dims, const Ty& init,
                  const ReduceOp& reducer, const TransformOp& transformer,
                  cudaStream_t stream) {
  auto x_dim = framework::vectorize2int(x.dims());
  std::vector<int> new_x_dim, new_reduce_dims;
  int is_reduced = 0;
  for (auto e : origin_reduce_dims) {
    auto pos = e >= 0 ? e : e + x_dim.size();
    is_reduced |= 1 << e;
  }
  for (int i = 0; i < x_dim.size(); i++) {
    if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) {
      new_x_dim.push_back(x_dim[i]);
      if ((is_reduced >> i) & 1)
        new_reduce_dims.push_back(new_x_dim.size() - 1);
    } else {
      new_x_dim[new_x_dim.size() - 1] *= x_dim[i];
    }
  }
  x_dim = new_x_dim;
  origin_reduce_dims = new_reduce_dims;
  int x_rank = static_cast<int>(x_dim.size());
  std::set<int> left_set, reduce_set;
  for (int i = 0; i < x_rank; ++i) left_set.insert(i);

  for (auto e : origin_reduce_dims) {
    left_set.erase(e);
    reduce_set.insert(e);
  }

  std::vector<int> reduce_dim(reduce_set.begin(), reduce_set.end());
  std::vector<int> left_dim(left_set.begin(), left_set.end());

  std::vector<int> x_strides = detail::GetStrides(x_dim);
  std::vector<int> reduce_strides = detail::GetStrides(x_dim, reduce_dim);
  std::vector<int> left_strides = detail::GetStrides(x_dim, left_dim);
  int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]];
  int left_num = 1;
  if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]];

  std::vector<int> y_dim(left_dim.size());
  for (int i = 0; i < left_dim.size(); ++i) {
    y_dim[i] = x_dim[left_dim[i]];
  }
  auto x_data = x.data<Tx>();
  auto y_data = y->mutable_data<Ty>(x.place());
  if (reduce_num == 1) return;

#define CUB_BLOCK_DIM_CASE(block_dim)                                    \
  case block_dim: {                                                      \
    constexpr auto kBlockDim = block_dim;                                \
    detail::TensorReduceImpl<Tx, Ty, block_dim, ReduceOp, TransformOp>(  \
        x_data, y_data, x.place(), reducer, transformer, init, left_num, \
        reduce_num, x_strides, reduce_dim, reduce_strides, left_dim,     \
        left_strides, stream);                                           \
  } break

  switch (detail::GetDesiredBlockDim(reduce_num)) {
    CUB_BLOCK_DIM_CASE(512);
    CUB_BLOCK_DIM_CASE(256);
    CUB_BLOCK_DIM_CASE(128);
    CUB_BLOCK_DIM_CASE(64);
    CUB_BLOCK_DIM_CASE(32);
    CUB_BLOCK_DIM_CASE(16);
    CUB_BLOCK_DIM_CASE(8);
    CUB_BLOCK_DIM_CASE(4);
    CUB_BLOCK_DIM_CASE(2);
  }
#undef CUB_BLOCK_DIM_CASE
}

}  // namespace operators
}  // namespace paddle
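For orientation, the following is a minimal sketch of how the TensorReduce entry point above can be driven. The SumOverSpatialDims helper and the IdentityFunctor are assumptions modeled on the reduce_sum_op.cu changes later in this diff, not part of this header.

// Sketch (assumption): sum an NCHW tensor over its spatial axes {2, 3} using
// the cub-backed TensorReduce added in cub_reduce.h.
#include <vector>
#include <cub/cub.cuh>
#include "paddle/fluid/operators/cub_reduce.h"

namespace paddle {
namespace operators {

// Identity transform applied to each element before reduction, mirroring the
// functor used by the new ReduceSumKernel below.
template <typename T>
struct IdentityFunctor {
  HOSTDEVICE inline T operator()(const T& x) const { return x; }
};

// Hypothetical helper: reduce axes 2 and 3, init value 0, reducer cub::Sum.
template <typename T>
void SumOverSpatialDims(const framework::Tensor& x, framework::Tensor* y,
                        cudaStream_t stream) {
  std::vector<int> reduce_dims = {2, 3};
  TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
      x, y, reduce_dims, static_cast<T>(0), cub::Sum(), IdentityFunctor<T>(),
      stream);
}

}  // namespace operators
}  // namespace paddle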
paddle/fluid/operators/math/depthwise_conv.cu

@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <algorithm>
 #include <vector>
 #include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/platform/cuda_primitives.h"

@@ -20,31 +21,54 @@ namespace paddle {
 namespace operators {
 namespace math {

+template <typename T>
+__inline__ __device__ T warpReduceSum(T val) {
+#if CUDA_VERSION < 9000
+  for (int offset = 16; offset > 0; offset /= 2)
+    val += __shfl_down(val, offset);
+  return val;
+#else
+#define FULL_MASK 0xffffffff
+  for (int offset = 16; offset > 0; offset /= 2)
+    val += __shfl_down_sync(FULL_MASK, val, offset);
+  return val;
+#endif
+}
+__forceinline__ __device__ unsigned lane_id() {
+  unsigned ret;
+  asm volatile("mov.u32 %0, %laneid;" : "=r"(ret));
+  return ret;
+}
+
+__forceinline__ __device__ unsigned warp_id() {
+  unsigned ret;
+  asm volatile("mov.u32 %0, %warpid;" : "=r"(ret));
+  return ret;
+}
+
 // A Cuda kernel to compute the depthwise convolution forward pass
 // in NCHW format.
 template <typename T>
-__global__ void KernelDepthwiseConv(
-    const int nthreads, const T* const input_data, const T* const filter_data,
-    const int batch_size, const int output_channels, const int output_height,
-    const int output_width, const int input_channels, const int input_height,
-    const int input_width, const int filter_multiplier, const int filter_height,
+__device__ __inline__ void KernelDepthwiseConv(
+    const T* const input_data, const T* const filter_data, const int batch_size,
+    const int output_channels, const int output_height, const int output_width,
+    const int input_channels, const int input_height, const int input_width,
+    const int filter_multiplier, const int filter_height,
     const int filter_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width, T* const output_data) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    const int batch = index / output_channels / output_height / output_width;
-    const int c_out = (index / output_height / output_width) % output_channels;
-    const int h_out = (index / output_width) % output_height;
-    const int w_out = index % output_width;
+    const int padding_height, const int padding_width, const int dilate_height,
+    const int dilate_width, T* const output_data) {
+  for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) {
+    for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) {
+      const int batch = blockIdx.y;
+      const int c_out = blockIdx.x;

       const int c_in = c_out / filter_multiplier;
       const T* weight = filter_data + c_out * filter_height * filter_width;
       T value = 0;
       const int h_in_start = -padding_height + h_out * stride_height;
       const int w_in_start = -padding_width + w_out * stride_width;
-      const int h_in_end = h_in_start + filter_height;
-      const int w_in_end = w_in_start + filter_width;
+      const int h_in_end = h_in_start + filter_height * dilate_height;
+      const int w_in_end = w_in_start + filter_width * dilate_width;

       const int in_offset =
           ((batch * input_channels + c_in) * input_height) * input_width;

@@ -53,116 +77,212 @@ __global__ void KernelDepthwiseConv(
       const int w_end = w_in_end < input_width ? w_in_end : input_width;
       const int h_start = h_in_start > 0 ? h_in_start : 0;
       const int w_start = w_in_start > 0 ? w_in_start : 0;
+      int weight_offset = 0;

-      for (int h_in = h_start; h_in < h_end; h_in++) {
-        for (int w_in = w_start; w_in < w_end; w_in++) {
-          const int offset = in_offset + h_in * input_width + w_in;
-          value +=
-              weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] *
-              input_data[offset];
+      for (int h_in = h_in_start; h_in < h_in_end; h_in += dilate_height) {
+        for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) {
+          if (h_in >= h_start && h_in < h_end && w_in >= w_start &&
+              w_in < w_end) {
+            const int offset = in_offset + h_in * input_width + w_in;
+            value += weight[weight_offset] * input_data[offset];
+          }
+          weight_offset++;
         }
       }
-      output_data[index] = value;
+      int index =
+          ((batch * gridDim.x + c_out) * output_height + h_out) * output_width +
+          w_out;
+      output_data[index] = value;
+    }
   }
 }

+template <typename T, int c_filter_multiplier, int c_stride>
+__global__ void KernelDepthwiseConvSp(
+    const T* const input_data, const T* const filter_data, const int batch_size,
+    const int output_channels, const int output_height, const int output_width,
+    const int input_channels, const int input_height, const int input_width,
+    const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, const int dilate_height,
+    const int dilate_width, T* const output_data) {
+  if (c_filter_multiplier == 0)
+    KernelDepthwiseConv<T>(input_data, filter_data, batch_size, output_channels,
+                           output_height, output_width, input_channels,
+                           input_height, input_width, filter_multiplier,
+                           filter_height, filter_width, stride_height,
+                           stride_width, padding_height, padding_width,
+                           dilate_height, dilate_width, output_data);
+  else
+    KernelDepthwiseConv<T>(input_data, filter_data, batch_size, output_channels,
+                           output_height, output_width, input_channels,
+                           input_height, input_width, c_filter_multiplier,
+                           filter_height, filter_height, c_stride, c_stride,
+                           padding_height, padding_width, dilate_height,
+                           dilate_width, output_data);
+}
+
 // CUDA kernel to compute the depthwise convolution backprop w.r.t input.
 template <typename T>
-__global__ void KernelDepthwiseConvInputGrad(
-    const int nthreads, const T* const output_grad_data,
-    const T* const filter_data, const int batch_size,
-    const int output_channels, const int output_height, const int output_width,
-    const int input_channels, const int input_height, const int input_width,
-    const int filter_multiplier, const int filter_height,
-    const int filter_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width,
-    T* const input_grad_data) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    const int batch = index / input_channels / input_height / input_width;
-    const int c_in = (index / input_height / input_width) % input_channels;
-    const int h_in = (index / input_width) % input_height;
-    const int w_in = index % input_width;
+__device__ __inline__ void KernelDepthwiseConvInputGrad(
+    const T* const output_grad_data, const T* const filter_data,
+    const int batch_size, const int output_channels, const int output_height,
+    const int output_width, const int input_channels, const int input_height,
+    const int input_width, const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, const int dilate_height,
+    const int dilate_width, T* const input_grad_data) {
+  for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) {
+    for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) {
+      const int batch = blockIdx.y;
+      const int c_in = blockIdx.x;

       const int c_out_start = c_in * filter_multiplier;

-      int h_out_start =
-          (h_in - filter_height + padding_height + stride_height) /
-          stride_height;
-      h_out_start = 0 > h_out_start ? 0 : h_out_start;
+      int h_out_start =
+          h_in - (filter_height - 1) * dilate_height + padding_height;

-      int h_out_end = (h_in + padding_height) / stride_height;
-      h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end;
+      int h_out_end = h_in + padding_height;

-      int w_out_start =
-          (w_in - filter_width + padding_width + stride_width) / stride_width;
-      w_out_start = 0 > w_out_start ? 0 : w_out_start;
+      int w_out_start =
+          w_in - (filter_width - 1) * dilate_width + padding_width;

-      int w_out_end = (w_in + padding_width) / stride_width;
-      w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end;
+      int w_out_end = w_in + padding_width;

       T value = 0;

       for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier;
            c_out++) {
-        for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
-          const int filter_h = h_in + padding_height - h_out * stride_height;
-          for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
-            const int filter_w = w_in + padding_width - w_out * stride_width;
-            const int filter_offset = c_out * filter_height * filter_width +
-                                      filter_h * filter_width + filter_w;
-            const int output_grad_offset =
-                ((batch * output_channels + c_out) * output_height + h_out) *
-                    output_width +
-                w_out;
-            value += output_grad_data[output_grad_offset] *
-                     filter_data[filter_offset];
+        int filter_offset = (c_out + 1) * filter_height * filter_width;
+        for (int h_out = h_out_start; h_out <= h_out_end;
+             h_out += dilate_height) {
+          for (int w_out = w_out_start; w_out <= w_out_end;
+               w_out += dilate_width) {
+            filter_offset--;
+            int s_h_out = h_out / stride_height;
+            int s_w_out = w_out / stride_width;
+            if (h_out % stride_height == 0 && w_out % stride_width == 0 &&
+                s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 &&
+                s_w_out < output_width) {
+              const int output_grad_offset =
+                  ((batch * output_channels + c_out) * output_height +
+                   s_h_out) *
+                      output_width +
+                  s_w_out;
+              value += output_grad_data[output_grad_offset] *
+                       filter_data[filter_offset];
+            }
           }
         }
       }
-      input_grad_data[index] = value;
+      int index =
+          ((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
+          w_in;
+      input_grad_data[index] += value;
     }
   }
 }

+template <typename T, int c_filter_multiplier, int c_stride>
+__global__ void KernelDepthwiseConvInputGradSp(
+    const T* const output_grad_data, const T* const filter_data,
+    const int batch_size, const int output_channels, const int output_height,
+    const int output_width, const int input_channels, const int input_height,
+    const int input_width, const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, const int dilate_height,
+    const int dilate_width, T* const input_grad_data) {
+  if (c_filter_multiplier == 0)
+    KernelDepthwiseConvInputGrad<T>(
+        output_grad_data, filter_data, batch_size, output_channels,
+        output_height, output_width, input_channels, input_height, input_width,
+        filter_multiplier, filter_height, filter_width, stride_height,
+        stride_width, padding_height, padding_width, dilate_height,
+        dilate_width, input_grad_data);
+  else
+    KernelDepthwiseConvInputGrad<T>(
+        output_grad_data, filter_data, batch_size, output_channels,
+        output_height, output_width, input_channels, input_height, input_width,
+        c_filter_multiplier, filter_height, filter_width, c_stride, c_stride,
+        padding_height, padding_width, dilate_height, dilate_width,
+        input_grad_data);
+}
+
 // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
 template <typename T>
-__global__ void KernelDepthwiseConvFilterGrad(
-    const int nthreads, const T* const output_grad_data,
-    const T* const input_data, const int num, const int output_channels,
-    const int output_height, const int output_width, const int input_channels,
-    const int input_height, const int input_width, const int filter_multiplier,
-    const int filter_height, const int filter_width, const int stride_height,
-    const int stride_width, const int padding_height, const int padding_width,
-    T* const filter_grad_data) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    const int w_out = index % output_width;
-    const int h_out = (index / output_width) % output_height;
-    const int c_out = (index / output_width / output_height) % output_channels;
-    const int batch = (index / output_width / output_height / output_channels);
-    const int c_in = c_out / filter_multiplier;
-    const int h_in_start = -padding_height + h_out * stride_height;
-    const int w_in_start = -padding_width + w_out * stride_width;
-    const int h_in_end =
-        -padding_height + h_out * stride_height + filter_height;
-    const int w_in_end = -padding_width + w_out * stride_width + filter_width;
-    const int in_offset =
-        (batch * input_channels + c_in) * input_height * input_width;
-
-    T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
-    const int h_end = h_in_end < input_height ? h_in_end : input_height;
-    const int w_end = w_in_end < input_width ? w_in_end : input_width;
-    const int h_start = h_in_start > 0 ? h_in_start : 0;
-    const int w_start = w_in_start > 0 ? w_in_start : 0;
-
-    for (int h_in = h_start; h_in < h_end; h_in++) {
-      for (int w_in = w_start; w_in < w_end; w_in++) {
-        const int offset = in_offset + h_in * input_width + w_in;
-        const T diff_temp = output_grad_data[index] * input_data[offset];
-        T* addr = addr_offset + (h_in - h_in_start) * filter_width +
-                  (w_in - w_in_start);
-        paddle::platform::CudaAtomicAdd(addr, diff_temp);
+__device__ __inline__ void KernelDepthwiseConvFilterGrad(
+    const T* output_grad_data, const T* input_data, const int num,
+    const int output_channels, const int output_height, const int output_width,
+    const int input_channels, const int input_height, const int input_width,
+    const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, const int dilate_height,
+    const int dilate_width, T* filter_grad_data) {
+  T s = 0;
+
+  int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x;
+  int lid = lane_id();
+
+  for (int image_w = threadIdx.x; image_w < output_width;
+       image_w += blockDim.x) {
+    for (int bid = 0; bid < num; bid++) {
+      for (int image_h = threadIdx.y; image_h < output_height;
+           image_h += blockDim.y) {
+        int kernel_id = blockIdx.z;
+        int kernel_h = blockIdx.y * dilate_height - padding_height;
+        int kernel_w = blockIdx.x * dilate_width - padding_width;
+
+        int image_hk = image_h * stride_height + kernel_h;
+        int image_wk = image_w * stride_width + kernel_w;
+        if (image_hk < 0 || image_hk >= input_height) continue;
+        if (image_wk < 0 || image_wk >= input_width) continue;
+#define gaid(N, C, H, W) \
+  ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W))
+
+        s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
+             input_data[((bid * (gridDim.z / filter_multiplier) +
+                          kernel_id / filter_multiplier) *
+                             input_height +
+                         image_hk) *
+                            input_width +
+                        image_wk];
+
+#undef gaid
       }
     }
   }
+#if __CUDA_ARCH__ >= 530
+  s = warpReduceSum<T>(s);
+  if (lid == 0) paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
+#else
+  paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
+#endif
 }

+template <typename T, int c_filter_multiplier>
+__global__ void KernelDepthwiseConvFilterGradSp(
+    const T* output_grad_data, const T* input_data, const int num,
+    const int output_channels, const int output_height, const int output_width,
+    const int input_channels, const int input_height, const int input_width,
+    const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, const int dilate_height,
+    const int dilate_width, T* filter_grad_data) {
+  if (c_filter_multiplier == 0)
+    KernelDepthwiseConvFilterGrad<T>(
+        output_grad_data, input_data, num, output_channels, output_height,
+        output_width, input_channels, input_height, input_width,
+        filter_multiplier, filter_height, filter_width, stride_height,
+        stride_width, padding_height, padding_width, dilate_height,
+        dilate_width, filter_grad_data);
+  else
+    KernelDepthwiseConvFilterGrad<T>(
+        output_grad_data, input_data, num, output_channels, output_height,
+        output_width, input_channels, input_height, input_width,
+        c_filter_multiplier, filter_height, filter_width, stride_height,
+        stride_width, padding_height, padding_width, dilate_height,
+        dilate_width, filter_grad_data);
+}
+
 /*

@@ -177,7 +297,9 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
                   const framework::Tensor& input,
                   const framework::Tensor& filter,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, framework::Tensor* output) {
+                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
+                  framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];

@@ -191,22 +313,37 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
     const int stride_width = strides[1];
     const int padding_height = paddings[0];
     const int padding_width = paddings[1];
+    const int dilate_height = dilations[0];
+    const int dilate_width = dilations[1];

     const T* input_data = input.data<T>();
     const T* filter_data = filter.data<T>();
     T* output_data = output->mutable_data<T>(context.GetPlace());

-    int nthreads = batch_size * output_channels * output_height * output_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelDepthwiseConv<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, filter_data, batch_size, output_channels,
-        output_height, output_width, input_channels, input_height, input_width,
-        output_channels / input_channels, ksize_height, ksize_width,
-        stride_height, stride_width, padding_height, padding_width,
-        output_data);
+    int thread = 512;
+    int blocks = std::min(std::max(thread / output_width, 1), output_height);
+    dim3 threads(std::min(output_width, thread), blocks, 1);
+    dim3 grid(output_channels, batch_size, 1);
+    int filter_multiplier = output_channels / input_channels;
+#define check_case(c_filter_multiplier, c_stride)                             \
+  if (c_filter_multiplier == 0 ||                                             \
+      filter_multiplier == c_filter_multiplier &&                             \
+          stride_height == stride_width && stride_height == c_stride) {       \
+    KernelDepthwiseConvSp<T, c_filter_multiplier,                             \
+                          c_stride><<<grid, threads, 0, context.stream()>>>(  \
+        input_data, filter_data, batch_size, output_channels, output_height,  \
+        output_width, input_channels, input_height, input_width,              \
+        filter_multiplier, ksize_height, ksize_width, stride_height,          \
+        stride_width, padding_height, padding_width, dilate_height,           \
+        dilate_width, output_data);                                           \
+    return;                                                                   \
+  }
+    check_case(1, 1);
+    check_case(1, 2);
+    // NOTE(liangdun): 0,0 for other case
+    // add other case if needed, e.g. check_case(2^n,1)
+    check_case(0, 0);
+#undef check_case
   }
 };

@@ -219,6 +356,7 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
                   const framework::Tensor& output_grad,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
                   framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];

@@ -233,22 +371,39 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
     const int stride_width = strides[1];
     const int padding_height = paddings[0];
     const int padding_width = paddings[1];
+    const int dilate_height = dilations[0];
+    const int dilate_width = dilations[1];

     const T* filter_data = filter.data<T>();
     const T* output_grad_data = output_grad.data<T>();
     T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

-    int nthreads = batch_size * input_channels * input_height * input_width;
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelDepthwiseConvInputGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, output_grad_data, filter_data, batch_size, output_channels,
-        output_height, output_width, input_channels, input_height, input_width,
-        output_channels / input_channels, ksize_height, ksize_width,
-        stride_height, stride_width, padding_height, padding_width,
-        input_grad_data);
+    int thread = 512;
+    int blocks = std::min(std::max(thread / input_width, 1), input_height);
+    dim3 threads(std::min(input_width, thread), blocks, 1);
+    dim3 grid(input_channels, batch_size, 1);
+    int filter_multiplier = output_channels / input_channels;
+#define check_case(c_filter_multiplier, c_stride)                       \
+  if (c_filter_multiplier == 0 ||                                       \
+      filter_multiplier == c_filter_multiplier &&                       \
+          stride_height == stride_width && stride_height == c_stride) { \
+    KernelDepthwiseConvInputGradSp<                                     \
+        T, c_filter_multiplier,                                         \
+        c_stride><<<grid, threads, 0, context.stream()>>>(              \
+        output_grad_data, filter_data, batch_size, output_channels,     \
+        output_height, output_width, input_channels, input_height,      \
+        input_width, filter_multiplier, ksize_height, ksize_width,      \
+        stride_height, stride_width, padding_height, padding_width,     \
+        dilate_height, dilate_width, input_grad_data);                  \
+    return;                                                             \
+  }
+    check_case(1, 1);
+    check_case(1, 2);
+    // NOTE(liangdun): 0,0 for other case
+    // add other case if needed, e.g. check_case(2^n,1)
+    check_case(0, 0);
+#undef check_case
   }
 };

@@ -260,6 +415,7 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
                   const framework::Tensor& output_grad,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
                   framework::Tensor* filter_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];

@@ -274,23 +430,34 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
     const int stride_width = strides[1];
     const int padding_height = paddings[0];
     const int padding_width = paddings[1];
+    const int dilate_height = dilations[0];
+    const int dilate_width = dilations[1];

     const T* input_data = input.data<T>();
     const T* output_grad_data = output_grad.data<T>();
     T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace());

-    int nthreads = batch_size * output_channels * output_height * output_width;
-
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelDepthwiseConvFilterGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, output_grad_data, input_data, batch_size, output_channels,
-        output_height, output_width, input_channels, input_height, input_width,
-        output_channels / input_channels, ksize_height, ksize_width,
-        stride_height, stride_width, padding_height, padding_width,
-        filter_grad_data);
+    int block_size = 512;
+    int crop_output_height =
+        std::min(std::max(block_size / output_width, 1), output_height);
+    dim3 grid(ksize_width, ksize_height, output_channels);
+    dim3 threads(std::min(output_width, block_size), crop_output_height, 1);
+    int filter_multiplier = output_channels / input_channels;
+
+#define check_case(c_filter_multiplier)                                       \
+  if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \
+    KernelDepthwiseConvFilterGradSp<                                          \
+        T, c_filter_multiplier><<<grid, threads, 0, context.stream()>>>(      \
+        output_grad_data, input_data, batch_size, output_channels,            \
+        output_height, output_width, input_channels, input_height,            \
+        input_width, filter_multiplier, ksize_height, ksize_width,            \
+        stride_height, stride_width, padding_height, padding_width,           \
+        dilate_height, dilate_width, filter_grad_data);                       \
+    return;                                                                   \
+  }
+    check_case(1);
+    check_case(0);
+#undef check_case
   }
 };
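The core index change in the rewritten kernels is that a filter tap no longer maps to input row h_in_start + k but to h_in_start + k * dilate_height, with out-of-range taps skipped. The host-side sketch below is an illustration of that arithmetic only; it is not code from the patch.

// Illustration (assumption): which input rows a single output row reads once
// dilation is applied, mirroring the h_in_start/h_in_end loop above.
#include <cstdio>
#include <vector>

std::vector<int> DilatedInputRows(int h_out, int stride_height,
                                  int padding_height, int filter_height,
                                  int dilate_height, int input_height) {
  std::vector<int> rows;
  const int h_in_start = -padding_height + h_out * stride_height;
  const int h_in_end = h_in_start + filter_height * dilate_height;
  for (int h_in = h_in_start; h_in < h_in_end; h_in += dilate_height) {
    // Taps that fall in the padding region contribute nothing.
    if (h_in >= 0 && h_in < input_height) rows.push_back(h_in);
  }
  return rows;
}

int main() {
  // A 3-tap filter with stride 1, pad 2, dilation 2 on a 7-row input:
  // output row 0 reads input rows {0, 2}; the tap at -2 lands in the padding.
  for (int r : DilatedInputRows(/*h_out=*/0, /*stride=*/1, /*pad=*/2,
                                /*filter=*/3, /*dilate=*/2, /*input=*/7))
    std::printf("%d ", r);
  return 0;
}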
paddle/fluid/operators/math/depthwise_conv.h

@@ -32,7 +32,8 @@ class DepthwiseConvFunctor {
   void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const framework::Tensor& filter,
                   const std::vector<int>& strides,
-                  const std::vector<int>& paddings, framework::Tensor* output);
+                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations, framework::Tensor* output);
 };

 template <typename DeviceContext, typename T>

@@ -43,6 +44,7 @@ class DepthwiseConvInputGradFunctor {
                   const framework::Tensor& output_grad,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
                   framework::Tensor* input_grad);
 };

@@ -53,6 +55,7 @@ class DepthwiseConvFilterGradFunctor {
                   const framework::Tensor& output_grad,
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
                   framework::Tensor* filter_grad);
 };
paddle/fluid/operators/reduce_mean_op.cu

@@ -12,17 +12,64 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <vector>
+#include "paddle/fluid/operators/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_mean_op.h"

-REGISTER_OP_CUDA_KERNEL(reduce_mean,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          float, ops::MeanFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          double, ops::MeanFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int, ops::MeanFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int64_t, ops::MeanFunctor>);
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct DivideFunctor {
+  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
+
+  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
+
+ private:
+  T n_inv;
+};
+
+template <typename T>
+class ReduceMeanKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+
+    auto dims = context.Attr<std::vector<int>>("dim");
+    bool keep_dim = context.Attr<bool>("keep_dim");
+
+    std::vector<int> reduce_dims;
+    if (reduce_all) {
+      reduce_dims.resize(input->dims().size());
+      for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
+    } else {
+      for (auto e : dims) {
+        reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
+      }
+    }
+
+    int reduce_num = 1;
+    for (int i = 0; i < reduce_dims.size(); ++i) {
+      reduce_num *= input->dims()[reduce_dims[i]];
+    }
+
+    auto stream = context.cuda_device_context().stream();
+    TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
+        *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
+        DivideFunctor<T>(reduce_num), stream);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
+                        ops::ReduceMeanKernel<double>,
+                        ops::ReduceMeanKernel<int>,
+                        ops::ReduceMeanKernel<int64_t>);
+
 REGISTER_OP_CUDA_KERNEL(
     reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                             float, ops::MeanGradFunctor>,
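The same reducer-plus-transformer pattern generalizes beyond sum and mean. The sketch below is hypothetical and not part of this commit; it only illustrates how another reduction (here a max, using cub::Max with the lowest representable value as the init) could be built on TensorReduce.

// Hypothetical sketch (assumption): a max reduction composed from TensorReduce.
#include <limits>
#include <vector>
#include <cub/cub.cuh>
#include "paddle/fluid/operators/cub_reduce.h"

namespace paddle {
namespace operators {

// Pass-through transform, analogous to IdentityFunctor in reduce_sum_op.cu.
template <typename T>
struct PassThrough {
  HOSTDEVICE inline T operator()(const T& x) const { return x; }
};

// Hypothetical helper: reduce the given dims of `in` with cub::Max.
template <typename T>
void MaxOverDims(const framework::Tensor& in, framework::Tensor* out,
                 std::vector<int> dims, cudaStream_t stream) {
  TensorReduce<T, T, cub::Max, PassThrough<T>>(
      in, out, dims, std::numeric_limits<T>::lowest(), cub::Max(),
      PassThrough<T>(), stream);
}

}  // namespace operators
}  // namespace paddle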
paddle/fluid/operators/reduce_sum_op.cu

@@ -12,17 +12,59 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "paddle/fluid/operators/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_sum_op.h"

-REGISTER_OP_CUDA_KERNEL(reduce_sum,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          float, ops::SumFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          double, ops::SumFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int, ops::SumFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int64_t, ops::SumFunctor>);
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct IdentityFunctor {
+  HOSTDEVICE explicit inline IdentityFunctor() {}
+
+  HOSTDEVICE inline T operator()(const T& x) const { return x; }
+};
+
+template <typename T>
+class ReduceSumKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+
+    auto dims = context.Attr<std::vector<int>>("dim");
+    bool keep_dim = context.Attr<bool>("keep_dim");
+
+    std::vector<int> reduce_dims;
+    if (reduce_all) {
+      reduce_dims.resize(input->dims().size());
+      for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
+    } else {
+      for (auto e : dims) {
+        reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
+      }
+    }
+
+    int reduce_num = 1;
+    for (int i = 0; i < reduce_dims.size(); ++i) {
+      reduce_num *= input->dims()[reduce_dims[i]];
+    }
+
+    auto stream = context.cuda_device_context().stream();
+    TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
+        *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
+        IdentityFunctor<T>(), stream);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel<float>,
+                        ops::ReduceSumKernel<double>, ops::ReduceSumKernel<int>,
+                        ops::ReduceSumKernel<int64_t>);
+
 REGISTER_OP_CUDA_KERNEL(
     reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                            float, ops::SumGradFunctor>,
python/paddle/fluid/tests/unittests/test_conv2d_op.py

@@ -67,6 +67,7 @@ class TestConv2dOp(OpTest):
     def setUp(self):
         self.op_type = "conv2d"
         self.use_cudnn = False
+        self.use_cuda = False
         self.use_mkldnn = False
         self.data_format = "AnyLayout"
         self.dtype = np.float32

@@ -101,24 +102,25 @@ class TestConv2dOp(OpTest):
         }
         self.outputs = {'Output': output}

-    def testcudnn(self):
-        return core.is_compiled_with_cuda() and self.use_cudnn
+    def testcuda(self):
+        return core.is_compiled_with_cuda() and (self.use_cudnn or
+                                                 self.use_cuda)

     def test_check_output(self):
-        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
         self.check_output_with_place(place, atol=1e-5)

     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
         self.check_grad_with_place(
             place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02)

     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
             return
-        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
         self.check_grad_with_place(
             place, ['Input'],
             'Output',

@@ -128,7 +130,7 @@ class TestConv2dOp(OpTest):
     def test_check_grad_no_input(self):
         if self.dtype == np.float16:
             return
-        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        place = core.CUDAPlace(0) if self.testcuda() else core.CPUPlace()
         self.check_grad_with_place(
             place, ['Filter'],
             'Output',

@@ -325,18 +327,33 @@ class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
 class TestDepthwiseConv(TestConv2dOp):
     def init_test_case(self):
+        self.use_cuda = True
         self.pad = [1, 1]
         self.stride = [2, 2]
         self.input_size = [2, 3, 5, 5]  # NCHW
         self.groups = 3
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
-        self.filter_size = [6, f_c, 3, 3]
+        self.filter_size = [3, f_c, 3, 3]
         self.op_type = "depthwise_conv2d"


 class TestDepthwiseConv2(TestConv2dOp):
     def init_test_case(self):
+        self.use_cuda = True
         self.pad = [1, 1]
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
         self.groups = 3
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] // self.groups
         self.filter_size = [3, f_c, 3, 3]
         self.op_type = "depthwise_conv2d"


+class TestDepthwiseConv3(TestConv2dOp):
+    def init_test_case(self):
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW

@@ -347,6 +364,34 @@ class TestDepthwiseConv2(TestConv2dOp):
         self.op_type = "depthwise_conv2d"


+class TestDepthwiseConvWithDilation(TestConv2dOp):
+    def init_test_case(self):
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        self.dilations = [2, 2]
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
+class TestDepthwiseConvWithDilation2(TestConv2dOp):
+    def init_test_case(self):
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        self.dilations = [2, 2]
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
 # Please Don't remove the following code.
 # Currently, CI use cudnn V5.0 which not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):