Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
b2d0ee51
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
b2d0ee51
编写于
8月 06, 2018
作者:
S
sneaxiy
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
refine elementwise_add op
上级
f86198e6
变更
2
显示空白变更内容
内联
并排
Showing
2 changed files
with
78 additions
and
5 deletions
+78
-5
paddle/fluid/operators/elementwise_add_op.cu
paddle/fluid/operators/elementwise_add_op.cu
+54
-0
paddle/fluid/operators/elementwise_add_op.h
paddle/fluid/operators/elementwise_add_op.h
+24
-5
未找到文件。
paddle/fluid/operators/elementwise_add_op.cu
浏览文件 @
b2d0ee51
...
...
@@ -16,6 +16,60 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise_add_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
// Broadcast addition kernel: every thread whose global index is below `size`
// writes exactly one element, z[i] = x[i] + y[j].
//
// x, y  input buffers; x is indexed by the thread's global index, y by the
//       derived index described below.
// z     output buffer, indexed like x.
// n     extent used to wrap the y index.
// post  divisor applied to the global index before wrapping.
// size  number of elements to process; threads at or beyond it do nothing.
//
// The y index is i / post - (i / (n * post)) * n, i.e. the quotient i / post
// reduced into [0, n) for the non-negative indices produced here.
template <typename T>
__global__ void ElementwiseAddCUDAKernel(const T *x, const T *y, T *z, int n,
                                         int post, int size) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;

  // The launch may contain more threads than elements; surplus threads exit.
  if (!(tid < size)) {
    return;
  }

  const int quotient = tid / post;
  const int wraps = tid / (n * post);
  const int y_pos = quotient - wraps * n;

  z[tid] = x[tid] + y[y_pos];
}
// CUDA specialization of the elementwise-add forward kernel.
//
// Compute() reads the inputs "X" and "Y" and writes the output "Out":
//   * when X and Y have identical dims, all three tensors are viewed as flat
//     1-D Eigen tensors and the sum is evaluated on the context's Eigen
//     device;
//   * otherwise Y's dims are trimmed with trim_trailing_singular_dims, the
//     (pre, n, post) extents are obtained from get_mid_dims, and
//     ElementwiseAddCUDAKernel is launched on the context's CUDA stream with
//     one thread per element of X.
template <typename T>
class ElementwiseAddKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    using Tensor = framework::Tensor;

    const auto lhs = ctx.Input<Tensor>("X");
    const auto rhs = ctx.Input<Tensor>("Y");
    auto out = ctx.Output<Tensor>("Out");

    // Allocate the output on the place this kernel runs on.
    auto *out_data = out->mutable_data<T>(ctx.GetPlace());
    auto &eigen_dev = *(ctx.cuda_device_context().eigen_device());

    const framework::DDim &lhs_dims = lhs->dims();
    framework::DDim rhs_dims = rhs->dims();
    int numel = lhs->numel();

    // Same-shape case: a flat element-by-element sum via Eigen.
    if (lhs_dims == rhs_dims) {
      auto flat_dims = framework::make_ddim({numel});
      auto out_flat = framework::EigenTensor<T, 1>::From(*out, flat_dims);
      auto lhs_flat = framework::EigenTensor<T, 1>::From(*lhs, flat_dims);
      auto rhs_flat = framework::EigenTensor<T, 1>::From(*rhs, flat_dims);
      out_flat.device(eigen_dev) = lhs_flat + rhs_flat;
      return;
    }

    // Broadcast case: resolve the axis attribute first.
    int axis = ctx.Attr<int>("axis");
    if (axis == -1) {
      // -1 means "align Y with the trailing dimensions of X".
      axis = lhs_dims.size() - rhs_dims.size();
    }
    rhs_dims = trim_trailing_singular_dims(rhs_dims);
    if (rhs_dims.size() == 0) {
      // Nothing left of Y's dims after trimming.
      axis = lhs_dims.size();
    }

    int pre;
    int n;
    int post;
    get_mid_dims(lhs_dims, rhs_dims, axis, &pre, &n, &post);

    // One thread per element of X, rounded up to whole blocks.
    int block_size = 512;
    int grid_size = (numel + block_size - 1) / block_size;
    auto stream = ctx.cuda_device_context().stream();

    ElementwiseAddCUDAKernel<T><<<grid_size, block_size, 0, stream>>>(
        lhs->data<T>(), rhs->data<T>(), out_data, n, post, numel);
  }
};
}
// namespace operators
}
// namespace paddle
// Short aliases: `ops` for paddle::operators and `plat` for paddle::platform.
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
...
...
paddle/fluid/operators/elementwise_add_op.h
浏览文件 @
b2d0ee51
...
...
@@ -142,16 +142,35 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
auto
*
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
auto
*
y
=
ctx
.
Input
<
Tensor
>
(
"Y"
);
auto
*
out
=
ctx
.
Input
<
Tensor
>
(
"Out"
);
auto
*
dout
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
dx
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
dy
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
if
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
())
&&
(
x
->
dims
()
==
y
->
dims
()))
{
elementwise_add_grad
<
DeviceContext
,
T
>
(
ctx
,
x
,
y
,
out
,
dout
,
dx
,
dy
);
if
(
dx
!=
nullptr
)
dx
->
ShareDataWith
(
*
dout
);
if
(
dy
==
nullptr
)
return
;
if
(
x
->
dims
()
==
y
->
dims
())
{
dy
->
ShareDataWith
(
*
dout
);
}
else
{
default_elementwise_add_grad
<
DeviceContext
,
T
>
(
ctx
,
x
,
y
,
out
,
dout
,
dx
,
dy
);
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// Perform reduction to dout to calculate dy
const
framework
::
DDim
&
x_dim
=
x
->
dims
();
framework
::
DDim
y_dim
=
y
->
dims
();
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
axis
=
(
axis
==
-
1
?
x_dim
.
size
()
-
y_dim
.
size
()
:
axis
);
y_dim
=
trim_trailing_singular_dims
(
y_dim
);
axis
=
(
y_dim
.
size
()
==
0
)
?
x_dim
.
size
()
:
axis
;
auto
*
device
=
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
int
pre
,
n
,
post
;
get_mid_dims
(
x_dim
,
y_dim
,
axis
,
&
pre
,
&
n
,
&
post
);
auto
eigen_dout
=
framework
::
EigenTensor
<
T
,
3
>::
From
(
*
dout
,
framework
::
make_ddim
({
pre
,
n
,
post
}));
auto
eigen_dy
=
framework
::
EigenTensor
<
T
,
1
>::
From
(
*
dy
,
framework
::
make_ddim
({
n
}));
eigen_dy
.
device
(
*
device
)
=
eigen_dout
.
sum
(
framework
::
EigenDim
<
2
>::
From
(
framework
::
make_ddim
({
0
,
2
})));
}
}
};
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录