Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
2d9508f8
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
2d9508f8
编写于
8月 13, 2018
作者:
S
sneaxiy
提交者:
GitHub
8月 13, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #12554 from sneaxiy/refine_elementwise_add
Refine elementwise_add op
上级
6b45c5a1
5d698589
变更
2
隐藏空白更改
内联
并排
Showing
2 changed files
with
87 additions
and
8 deletions
+87
-8
paddle/fluid/operators/elementwise_add_op.cu
paddle/fluid/operators/elementwise_add_op.cu
+54
-0
paddle/fluid/operators/elementwise_add_op.h
paddle/fluid/operators/elementwise_add_op.h
+33
-8
未找到文件。
paddle/fluid/operators/elementwise_add_op.cu
浏览文件 @
2d9508f8
...
...
@@ -16,6 +16,60 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise_add_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
// Broadcasting element-wise addition kernel.
//
// Each thread handles at most one output element. For a flat index i in
// [0, size) it writes z[i] = x[i] + y[j], where j = (i / post) % n, i.e. the
// flat index is decomposed as if x were laid out row-major with trailing
// extents [n, post] and y were indexed only by the `n` coordinate.
//
// x, z : read / written at flat indices below `size`.
// y    : read at indices in [0, n).
// n, post : extents used to derive the index into y (both must be non-zero).
// size : number of output elements; threads with an index >= size do nothing.
template <typename T>
__global__ void ElementwiseAddCUDAKernel(const T *x, const T *y, T *z, int n,
                                         int post, int size) {
  const int flat_idx = blockIdx.x * blockDim.x + threadIdx.x;

  // The last block may contain threads past the end of the data.
  if (flat_idx >= size) {
    return;
  }

  // Position along the broadcast (middle) extent for this element.
  const int y_idx = (flat_idx / post) % n;
  z[flat_idx] = x[flat_idx] + y[y_idx];
}
// CUDA specialization of the forward elementwise_add kernel.
//
// Reads inputs "X" and "Y", writes output "Out". When X and Y have identical
// dims the sum is evaluated as a flat 1-D Eigen expression on the CUDA Eigen
// device; otherwise ElementwiseAddCUDAKernel is launched on the context's
// stream to add Y to X with broadcasting.
template <typename T>
class ElementwiseAddKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    using Tensor = framework::Tensor;

    const auto lhs = ctx.Input<Tensor>("X");
    const auto rhs = ctx.Input<Tensor>("Y");
    auto out = ctx.Output<Tensor>("Out");
    // Obtain the output buffer on the current place before doing any work.
    auto *out_data = out->mutable_data<T>(ctx.GetPlace());

    auto &eigen_dev = *(ctx.cuda_device_context().eigen_device());
    const framework::DDim &lhs_dims = lhs->dims();
    framework::DDim rhs_dims = rhs->dims();
    int total = lhs->numel();

    // Same-shape path: no broadcasting, so treat all three tensors as flat
    // vectors of `total` elements and let Eigen evaluate the sum.
    if (lhs_dims == rhs_dims) {
      auto flat_dim = framework::make_ddim({total});
      auto out_vec = framework::EigenTensor<T, 1>::From(*out, flat_dim);
      auto lhs_vec = framework::EigenTensor<T, 1>::From(*lhs, flat_dim);
      auto rhs_vec = framework::EigenTensor<T, 1>::From(*rhs, flat_dim);
      out_vec.device(eigen_dev) = lhs_vec + rhs_vec;
      return;
    }

    // Broadcast path. An "axis" of -1 is replaced by the rank difference
    // between X and Y (computed before Y's dims are trimmed).
    int axis = ctx.Attr<int>("axis");
    if (axis == -1) {
      axis = lhs_dims.size() - rhs_dims.size();
    }
    rhs_dims = trim_trailing_singular_dims(rhs_dims);
    // If nothing is left of Y's dims after trimming, use X's rank as the axis.
    if (rhs_dims.size() == 0) {
      axis = lhs_dims.size();
    }

    // NOTE(review): get_mid_dims presumably splits X's shape into
    // (pre, n, post) around the span matched by Y -- confirm against its
    // definition. Only n and post are needed by the kernel.
    int pre;
    int n;
    int post;
    get_mid_dims(lhs_dims, rhs_dims, axis, &pre, &n, &post);

    // One thread per output element; round the block count up so that the
    // tail is covered.
    int threads_per_block = 512;
    int num_blocks = (total + threads_per_block - 1) / threads_per_block;
    auto stream = ctx.cuda_device_context().stream();
    ElementwiseAddCUDAKernel<T><<<num_blocks, threads_per_block, 0, stream>>>(
        lhs->data<T>(), rhs->data<T>(), out_data, n, post, total);
  }
};
}
// namespace operators
}
// namespace paddle
// Short alias for the paddle::operators namespace.
namespace
ops
=
paddle
::
operators
;
// Short alias for the paddle::platform namespace.
namespace
plat
=
paddle
::
platform
;
...
...
paddle/fluid/operators/elementwise_add_op.h
浏览文件 @
2d9508f8
...
...
@@ -144,16 +144,41 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
auto
*
dout
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
dx
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
dy
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
// skip out, x, y
auto
*
out
=
dout
;
auto
*
x
=
dout
,
*
y
=
dout
;
if
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
())
&&
dx
!=
nullptr
&&
dy
!=
nullptr
&&
(
dx
->
dims
()
==
dy
->
dims
()))
{
elementwise_add_grad
<
DeviceContext
,
T
>
(
ctx
,
x
,
y
,
out
,
dout
,
dx
,
dy
);
if
(
dx
!=
nullptr
)
{
// In fact, we can just share memory, but it may cause a bug of memory
// optimizer
// dx->ShareDataWith(*dout);
framework
::
TensorCopy
(
*
dout
,
ctx
.
GetPlace
(),
ctx
.
template
device_context
<
DeviceContext
>(),
dx
);
}
if
(
dy
==
nullptr
)
return
;
const
framework
::
DDim
&
x_dim
=
dout
->
dims
();
framework
::
DDim
y_dim
=
dy
->
dims
();
if
(
x_dim
==
y_dim
)
{
// dy->ShareDataWith(*dout);
framework
::
TensorCopy
(
*
dout
,
ctx
.
GetPlace
(),
ctx
.
template
device_context
<
DeviceContext
>(),
dy
);
}
else
{
default_elementwise_add_grad
<
DeviceContext
,
T
>
(
ctx
,
x
,
y
,
out
,
dout
,
dx
,
dy
);
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// Perform reduction to dout to calculate dy
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
axis
=
(
axis
==
-
1
?
x_dim
.
size
()
-
y_dim
.
size
()
:
axis
);
y_dim
=
trim_trailing_singular_dims
(
y_dim
);
axis
=
(
y_dim
.
size
()
==
0
)
?
x_dim
.
size
()
:
axis
;
auto
&
device
=
*
(
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
());
int
pre
,
n
,
post
;
get_mid_dims
(
x_dim
,
y_dim
,
axis
,
&
pre
,
&
n
,
&
post
);
auto
eigen_dout
=
framework
::
EigenTensor
<
T
,
3
>::
From
(
*
dout
,
framework
::
make_ddim
({
pre
,
n
,
post
}));
auto
eigen_dy
=
framework
::
EigenTensor
<
T
,
1
>::
From
(
*
dy
,
framework
::
make_ddim
({
n
}));
eigen_dy
.
device
(
device
)
=
eigen_dout
.
sum
(
framework
::
EigenDim
<
2
>::
From
(
framework
::
make_ddim
({
0
,
2
})));
}
}
};
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录