Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
ac4bae8e
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
ac4bae8e
编写于
12月 14, 2020
作者:
W
wangchaochaohu
提交者:
GitHub
12月 14, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
elementwise_add_grad Op optimization (#29575)
上级
62d44836
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
199 addition
and
0 deletion
+199
-0
paddle/fluid/operators/elementwise/elementwise_add_op.h
paddle/fluid/operators/elementwise/elementwise_add_op.h
+188
-0
python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
...n/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+11
-0
未找到文件。
paddle/fluid/operators/elementwise/elementwise_add_op.h
浏览文件 @
ac4bae8e
...
...
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <utility>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
...
...
@@ -116,6 +118,135 @@ elementwise_add_grad(const framework::ExecutionContext &ctx,
default_elementwise_add_grad
<
DeviceContext
,
T
>
(
ctx
,
x
,
y
,
out
,
dout
,
dx
,
dy
);
}
#ifdef PADDLE_WITH_CUDA
#ifdef __NVCC__
// Column-wise sum of a row-major `height` x `width` matrix.
//
// `in` is indexed as in[row * width + col]; on return out[col] holds the sum
// of column `col` for every col < width.
//
// Each thread adds up a strided subset of the rows of one column into a
// shared-memory tile. The tile is then read back with the two thread indices
// exchanged, so that an xor-shuffle across the warp adds the partial sums
// that belong to a single column.
//
// NOTE(review): the exchanged read `tile[threadIdx.x][threadIdx.y]` and the
// full-warp shuffle mask look like they expect a thread block of
// BLOCK_W x BLOCK_H threads with BLOCK_W == BLOCK_H == warpSize -- confirm
// against every launch site before instantiating with other sizes.
template <typename T, int BLOCK_W, int BLOCK_H>
__global__ void MatrixColReduce(const T *__restrict__ in, T *__restrict__ out,
                                size_t width, size_t height) {
  // The tile carries one extra column per row (presumably padding against
  // shared-memory bank conflicts on the exchanged read -- TODO confirm).
  __shared__ T tile[BLOCK_H][BLOCK_W + 1];

  const size_t first_col = threadIdx.x + blockDim.x * blockIdx.x;
  const size_t col_step = gridDim.x * blockDim.x;

  // `width` rounded up to the next multiple of BLOCK_W (the bit masks assume
  // BLOCK_W is a power of two). Looping up to the padded width rather than
  // `width` lets threads whose column is out of range still run the loop body
  // and reach the __syncthreads() calls below.
  const size_t remainder = width & (BLOCK_W - 1);
  const size_t padded_width =
      (width & (~(static_cast<uint64_t>(BLOCK_W - 1)))) +
      (remainder ? BLOCK_W : 0);

#pragma unroll
  for (size_t w = first_col; w < padded_width; w += col_step) {
    tile[threadIdx.y][threadIdx.x] = 0;
    __syncthreads();

    size_t src = w + threadIdx.y * width;
#pragma unroll
    for (size_t h = threadIdx.y; h < height; h += BLOCK_H) {
      // Rows threadIdx.y, threadIdx.y + BLOCK_H, ... of column w; columns
      // past `width` contribute zero instead of touching memory.
      const T addend = (w < width) ? in[src] : (static_cast<T>(0));
      tile[threadIdx.y][threadIdx.x] += addend;
      src += width * BLOCK_H;
    }
    __syncthreads();

    // Read with the indices exchanged, then add across the lanes of the warp.
    T col_sum = tile[threadIdx.x][threadIdx.y];
    for (int lane_mask = warpSize >> 1; lane_mask > 0; lane_mask >>= 1) {
      col_sum += platform::CudaShuffleXorSync(0xFFFFFFFF, col_sum, lane_mask);
    }

    __syncthreads();
    if (threadIdx.x == 0) {
      tile[0][threadIdx.y] = col_sum;
    }
    __syncthreads();

    if ((threadIdx.y == 0) && (w < width)) {
      out[w] = tile[0][threadIdx.x];
    }
  }
}
// Column-wise sum of a row-major `height` x `width` float16 matrix.
//
// `in` is indexed as in[row * width + col]; on return out[col] holds the sum
// of column `col` for every col < width.
//
// The shared tile has BLOCK_H rows. Each thread owns `repeats` rows of it,
// namely rows threadIdx.y + r * BLOCK_W for r in [0, repeats). Tile row `t`
// accumulates matrix rows t, t + BLOCK_H, t + 2 * BLOCK_H, ..., so the tile
// rows together cover every matrix row exactly once. Each BLOCK_W-row slab of
// the tile is then read back with the two thread indices exchanged and added
// across the warp with an xor-shuffle.
//
// The launch site visible in this file uses 32 x 32 thread blocks with
// <BLOCK_W, BLOCK_H> = <32, 32> or <32, 64>.
// NOTE(review): the exchanged read and the full-warp shuffle mask look like
// they expect blockDim.x == blockDim.y == BLOCK_W == warpSize -- confirm
// before launching with any other configuration.
template <int BLOCK_W, int BLOCK_H>
__global__ void FP16MatrixColReduce(
    const paddle::platform::float16 *__restrict__ in,
    paddle::platform::float16 *__restrict__ out, size_t width, size_t height) {
  // Every tile row must belong to one of the `repeats` slabs; otherwise the
  // rows past repeats * BLOCK_W would never be accumulated.
  static_assert(BLOCK_H % BLOCK_W == 0,
                "BLOCK_H must be a multiple of BLOCK_W");
  constexpr int repeats = BLOCK_H / BLOCK_W;

  // One extra column per row (presumably padding against shared-memory bank
  // conflicts on the exchanged read -- TODO confirm).
  __shared__ paddle::platform::float16 sdata[BLOCK_H][BLOCK_W + 1];

  size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
  size_t width_stride = gridDim.x * blockDim.x;

  // `width` rounded up to the next multiple of BLOCK_W (the bit masks assume
  // BLOCK_W is a power of two), so that threads whose column is out of range
  // still run the loop body and reach the __syncthreads() calls below.
  size_t full_width = (width & (~(static_cast<uint64_t>(BLOCK_W - 1)))) +
                      ((width & (BLOCK_W - 1)) ? BLOCK_W : 0);

#pragma unroll
  for (size_t w = idx; w < full_width; w += width_stride) {
    for (int r = 0; r < repeats; r++) {
      sdata[threadIdx.y + r * BLOCK_W][threadIdx.x] = 0;
    }
    __syncthreads();

    for (int r = 0; r < repeats; r++) {
      // Tile row handled in this pass; it is also the first matrix row that
      // is added into it. Starting at r * BLOCK_H (or adding r * BLOCK_W rows
      // to the offset a second time) would skip rows BLOCK_W .. BLOCK_H - 1
      // of every BLOCK_H-row stripe and count later rows twice.
      const size_t tile_row = r * BLOCK_W + threadIdx.y;
      size_t offset = w + tile_row * width;
#pragma unroll
      for (size_t h = tile_row; h < height; h += BLOCK_H) {
        // block-stride loop across matrix height
        sdata[tile_row][threadIdx.x] +=
            (w < width) ? in[offset]
                        : (static_cast<paddle::platform::float16>(0));
        offset += width * BLOCK_H;
      }
    }
    __syncthreads();

    paddle::platform::float16 result =
        static_cast<paddle::platform::float16>(0);
    for (int r = 0; r < repeats; r++) {
      // Slab r of the tile, read with the thread indices exchanged and then
      // added across the lanes of the warp.
      paddle::platform::float16 val =
          sdata[threadIdx.x + r * BLOCK_W][threadIdx.y];
      for (int i = warpSize >> 1; i > 0; i >>= 1)
        val += platform::CudaShuffleXorSync(0xFFFFFFFF, val, i);
      // All reads of the tile must finish before row 0 is overwritten below.
      __syncthreads();
      result += val;
    }
    if (threadIdx.x == 0) sdata[0][threadIdx.y] = result;
    __syncthreads();
    if ((threadIdx.y == 0) && ((w) < width)) out[w] = sdata[0][threadIdx.x];
  }
}
#endif
#endif
// Decides whether the shapes of dx and dy fit the pattern accepted by this
// check.
//
// Leading dimensions equal to 1 are discarded from both shapes; the number of
// dimensions left is the "effective rank". The function returns true only if
//   * the two effective ranks differ,
//   * walking backwards from the last dimension, the shape with the lower
//     effective rank matches the other one over all of its effective
//     dimensions, and
//   * `axis` is -1 or equals the difference of the two effective ranks.
//
// `dout_dims` is accepted for interface compatibility but is not inspected.
bool static RunSpecialDims(const framework::DDim &dx_dims,
                           const framework::DDim &dy_dims,
                           const framework::DDim &dout_dims, int axis) {
  // Local copies: the two roles are exchanged below when dy turns out to be
  // the shape with the lower effective rank.
  auto low = dx_dims;
  auto high = dy_dims;

  auto low_rank = low.size();
  auto high_rank = high.size();

  // Count the run of 1s at the front of each shape.
  int low_leading_ones = 0;
  while (low_leading_ones < low_rank && low[low_leading_ones] == 1) {
    ++low_leading_ones;
  }
  int high_leading_ones = 0;
  while (high_leading_ones < high_rank && high[high_leading_ones] == 1) {
    ++high_leading_ones;
  }

  int low_effective = low.size() - low_leading_ones;
  int high_effective = high.size() - high_leading_ones;

  if (low_effective == high_effective) {
    return false;
  }

  if (high_effective < low_effective) {
    low = dy_dims;
    high = dx_dims;
    std::swap(low_effective, high_effective);
  }

  // Compare the trailing `low_effective` extents, last dimension first.
  const int high_len = high.size();
  const int low_len = low.size();
  for (int back = 1; back <= low_effective; ++back) {
    if (high[high_len - back] != low[low_len - back]) {
      return false;
    }
  }

  return (axis == -1) || (axis == (high_effective - low_effective));
}
#ifdef PADDLE_WITH_CUDA
// cuda definition
template
<
typename
DeviceContext
,
typename
T
>
...
...
@@ -144,6 +275,63 @@ class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
// skip out
auto
*
out
=
dout
;
#ifdef PADDLE_WITH_CUDA
#ifdef __NVCC__
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
if
(
ctx
.
GetPlace
()
==
platform
::
CUDAPlace
()
&&
dx
!=
nullptr
&&
dy
!=
nullptr
&&
dout
!=
nullptr
&&
dx
->
numel
()
!=
dy
->
numel
()
&&
RunSpecialDims
(
dx
->
dims
(),
dy
->
dims
(),
dout
->
dims
(),
axis
))
{
auto
*
dx_data
=
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
dy_data
=
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
dout_data
=
dout
->
data
<
T
>
();
auto
stream
=
ctx
.
cuda_device_context
().
stream
();
auto
*
out_data
=
dx_data
;
int
width
=
dx
->
numel
();
int
height
=
dout
->
numel
()
/
width
;
if
(
dx
->
dims
()
==
dout
->
dims
())
{
width
=
dy
->
numel
();
height
=
dout
->
numel
()
/
width
;
out_data
=
dy_data
;
framework
::
TensorCopy
(
*
dout
,
ctx
.
GetPlace
(),
ctx
.
template
device_context
<
platform
::
DeviceContext
>(),
dx
);
}
else
{
framework
::
TensorCopy
(
*
dout
,
ctx
.
GetPlace
(),
ctx
.
template
device_context
<
platform
::
DeviceContext
>(),
dy
);
}
constexpr
int
block_x
=
32
;
constexpr
int
block_y
=
32
;
dim3
blocks
(
block_x
,
block_y
);
int
max_physical_threads
=
ctx
.
cuda_device_context
().
GetMaxPhysicalThreadCount
();
int
max_blocks
=
std
::
max
(
max_physical_threads
/
(
block_x
*
block_y
),
1
);
int
theory_block
=
(
width
+
blocks
.
x
-
1
)
/
blocks
.
x
;
dim3
grids
(
std
::
min
(
theory_block
,
max_blocks
));
if
(
std
::
is_same
<
T
,
paddle
::
platform
::
float16
>::
value
)
{
const
paddle
::
platform
::
float16
*
ptr1
=
reinterpret_cast
<
const
paddle
::
platform
::
float16
*>
(
dout_data
);
paddle
::
platform
::
float16
*
ptr2
=
reinterpret_cast
<
paddle
::
platform
::
float16
*>
(
out_data
);
if
(
height
<=
32
)
{
FP16MatrixColReduce
<
32
,
32
><<<
grids
,
blocks
,
0
,
stream
>>>
(
ptr1
,
ptr2
,
width
,
height
);
}
else
{
FP16MatrixColReduce
<
32
,
64
><<<
grids
,
blocks
,
0
,
stream
>>>
(
ptr1
,
ptr2
,
width
,
height
);
}
return
;
}
MatrixColReduce
<
T
,
block_x
,
block_y
><<<
grids
,
blocks
,
0
,
stream
>>>
(
dout_data
,
out_data
,
width
,
height
);
return
;
}
#endif
#endif
// Special case when dy is not needed and dx doesn't reduce
if
(
dx
!=
nullptr
&&
dy
==
nullptr
&&
dx
->
dims
()
==
dout
->
dims
())
{
VLOG
(
4
)
<<
"Special case when dy is not needed and dx doesn't "
...
...
python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
浏览文件 @
ac4bae8e
...
...
@@ -351,6 +351,16 @@ class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp):
self
.
axis
=
-
1
class TestElementwiseFP16AddOp_commonuse_add1(TestFP16ElementwiseAddOp):
    # Elementwise-add case built on TestFP16ElementwiseAddOp:
    # x has shape (20, 30, 100), y has shape (1, 1, 100), and axis is -1.

    def init_axis(self):
        self.axis = -1

    def init_input_output(self):
        # x is drawn before y; both are cast to the dtype configured on the
        # test instance. The expected output is their (broadcast) sum.
        x_shape = (20, 30, 100)
        y_shape = (1, 1, 100)
        self.x = np.random.rand(*x_shape).astype(self.dtype)
        self.y = np.random.rand(*y_shape).astype(self.dtype)
        self.out = np.add(self.x, self.y)
class
TestElementwiseAddOp_commonuse_add2
(
TestElementwiseAddOp
):
def
init_input_output
(
self
):
self
.
x
=
np
.
random
.
rand
(
10
,
3
,
1
,
4
).
astype
(
self
.
dtype
)
...
...
@@ -429,4 +439,5 @@ class TestAddOp(unittest.TestCase):
# Script entry point: call paddle.enable_static() first, then hand control to
# the unittest runner.
if "__main__" == __name__:
    paddle.enable_static()
    unittest.main()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录