Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
bab11969
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
bab11969
编写于
5月 25, 2018
作者:
T
Tao Luo
提交者:
GitHub
5月 25, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #10913 from tpatejko/tpatejko/optimized-elementwise-add
Blas optimized elementwise_add forward and backward passes
上级
4d29a5d3
3e876b3e
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
160 additions
and
10 deletions
+160
-10
paddle/fluid/operators/elementwise_add_op.h
paddle/fluid/operators/elementwise_add_op.h
+102
-10
paddle/fluid/operators/math/blas.h
paddle/fluid/operators/math/blas.h
+16
-0
paddle/fluid/operators/math/blas_impl.h
paddle/fluid/operators/math/blas_impl.h
+42
-0
未找到文件。
paddle/fluid/operators/elementwise_add_op.h
浏览文件 @
bab11969
...
...
@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -24,19 +26,57 @@ struct AddFunctor {
inline
HOSTDEVICE
T
operator
()(
T
a
,
T
b
)
const
{
return
a
+
b
;
}
};
// Generic elementwise-add path.
//
// Reads the operator's integer "axis" attribute from `ctx` and hands x, y,
// that axis, and an AddFunctor<T> instance to ElementwiseComputeEx, which
// writes its result into `z`.
template <typename DeviceContext, typename T>
void default_elementwise_add(const framework::ExecutionContext &ctx,
                             const framework::Tensor *x,
                             const framework::Tensor *y,
                             framework::Tensor *z) {
  using Adder = AddFunctor<T>;

  int axis_attr = ctx.Attr<int>("axis");
  ElementwiseComputeEx<Adder, DeviceContext, T>(ctx, x, y, axis_attr, Adder(),
                                                z);
}
// BLAS-backed elementwise add.
//
// This overload participates in overload resolution only when T is a
// floating-point type AND DeviceContext is platform::CPUDeviceContext.
// x, y and z are each flattened to 1-D Eigen vectors and their raw data
// pointers are passed to Blas::VADD. The element count is taken from `x`
// alone, so the caller is responsible for x and y having matching sizes
// (the forward kernel only dispatches here when their dims are equal).
template <typename DeviceContext, typename T>
typename std::enable_if<
    std::is_floating_point<T>::value &&
    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_add(const framework::ExecutionContext &ctx,
                const framework::Tensor *x, const framework::Tensor *y,
                framework::Tensor *z) {
  using FlatVec = framework::EigenVector<T>;

  auto lhs = FlatVec::Flatten(*x);
  auto rhs = FlatVec::Flatten(*y);
  auto sum = FlatVec::Flatten(*z);

  auto blas = math::GetBlas<DeviceContext, T>(ctx);
  blas.VADD(x->numel(), lhs.data(), rhs.data(), sum.data());
}
// Fallback elementwise add.
//
// Enabled for exactly the complement of the BLAS overload's condition:
// T is not a floating-point type, OR DeviceContext is not
// platform::CPUDeviceContext. It simply delegates to
// default_elementwise_add with the same arguments.
template <typename DeviceContext, typename T>
typename std::enable_if<
    !std::is_floating_point<T>::value ||
    !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_add(const framework::ExecutionContext &ctx,
                const framework::Tensor *x, const framework::Tensor *y,
                framework::Tensor *z) {
  return default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
}
// Forward kernel of the elementwise add operator.
//
// Compute() fetches inputs "X" and "Y" and output "Out" from the execution
// context, materializes Out's buffer with mutable_data<T>() on the context's
// place, and then picks one of two paths:
//   * X and Y have identical dims -> elementwise_add (which itself selects
//     between the BLAS overload and the generic one at compile time);
//   * otherwise                   -> default_elementwise_add.
//
// Each tensor is looked up exactly once and the sum is computed exactly once;
// no unconditional ElementwiseComputeEx call precedes the dispatch.
template <typename DeviceContext, typename T>
class ElementwiseAddKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    using Tensor = framework::Tensor;

    const auto x = ctx.Input<Tensor>("X");
    const auto y = ctx.Input<Tensor>("Y");
    auto z = ctx.Output<Tensor>("Out");
    // Make sure the output buffer exists before any path writes into it.
    z->mutable_data<T>(ctx.GetPlace());

    auto dims_equal = x->dims() == y->dims();
    if (dims_equal) {
      elementwise_add<DeviceContext, T>(ctx, x, y, z);
    } else {
      // Shapes differ: take the generic path, which consults the "axis"
      // attribute.
      default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
    }
  }
};
...
...
@@ -45,6 +85,55 @@ struct IdentityGrad {
HOSTDEVICE
T
operator
()(
T
x
,
T
y
,
T
out
,
T
dout
)
const
{
return
dout
;
}
};
// Generic backward path for elementwise add.
//
// Reads the integer "axis" attribute and calls ElemwiseGradCompute with
// IdentityGrad<T> as the gradient functor for both dx and dy (IdentityGrad
// returns dout unchanged). x, y, out and dout are dereferenced and passed by
// reference, so they must be non-null; dx and dy are forwarded as pointers.
template <typename DeviceContext, typename T>
void default_elementwise_add_grad(const framework::ExecutionContext &ctx,
                                  const framework::Tensor *x,
                                  const framework::Tensor *y,
                                  const framework::Tensor *out,
                                  const framework::Tensor *dout,
                                  framework::Tensor *dx,
                                  framework::Tensor *dy) {
  using Grad = IdentityGrad<T>;

  int axis_attr = ctx.Attr<int>("axis");
  ElemwiseGradCompute<DeviceContext, T, Grad, Grad>(
      ctx, *x, *y, *out, *dout, axis_attr, dx, dy, Grad(), Grad());
}
// BLAS-backed backward pass for elementwise add.
//
// Enabled only when T is floating point AND DeviceContext is
// platform::CPUDeviceContext. For each of dx and dy that is non-null, the
// contents of dout (dout->numel() elements) are copied into that gradient
// tensor with Blas::VCOPY, after obtaining its buffer through
// mutable_data<T>() on the context's place. x, y and out are part of the
// shared signature but are not read here.
template <typename DeviceContext, typename T>
typename std::enable_if<
    std::is_floating_point<T>::value &&
    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_add_grad(const framework::ExecutionContext &ctx,
                     const framework::Tensor *x, const framework::Tensor *y,
                     const framework::Tensor *out,
                     const framework::Tensor *dout, framework::Tensor *dx,
                     framework::Tensor *dy) {
  auto blas = math::GetBlas<DeviceContext, T>(ctx);

  // Copies dout into `grad`; a null `grad` means that gradient is skipped.
  const auto copy_dout_into = [&](framework::Tensor *grad) {
    if (!grad) {
      return;
    }
    blas.VCOPY(dout->numel(), dout->data<T>(),
               grad->mutable_data<T>(ctx.GetPlace()));
  };

  copy_dout_into(dx);
  copy_dout_into(dy);
}
// Fallback backward pass for elementwise add.
//
// Enabled when T is not a floating-point type OR DeviceContext is not
// platform::CPUDeviceContext (the complement of the BLAS overload's
// condition). Delegates unchanged to default_elementwise_add_grad.
template <typename DeviceContext, typename T>
typename std::enable_if<
    !std::is_floating_point<T>::value ||
    !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_add_grad(const framework::ExecutionContext &ctx,
                     const framework::Tensor *x, const framework::Tensor *y,
                     const framework::Tensor *out,
                     const framework::Tensor *dout, framework::Tensor *dx,
                     framework::Tensor *dy) {
  return default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout,
                                                        dx, dy);
}
template
<
typename
DeviceContext
,
typename
T
>
class
ElementwiseAddGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
...
...
@@ -57,10 +146,13 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
auto
*
dout
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
dx
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
dy
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
ElemwiseGradCompute
<
DeviceContext
,
T
,
IdentityGrad
<
T
>
,
IdentityGrad
<
T
>>
(
ctx
,
*
x
,
*
y
,
*
out
,
*
dout
,
axis
,
dx
,
dy
,
IdentityGrad
<
T
>
(),
IdentityGrad
<
T
>
());
if
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
())
&&
(
x
->
dims
()
==
y
->
dims
()))
{
elementwise_add_grad
<
DeviceContext
,
T
>
(
ctx
,
x
,
y
,
out
,
dout
,
dx
,
dy
);
}
else
{
default_elementwise_add_grad
<
DeviceContext
,
T
>
(
ctx
,
x
,
y
,
out
,
dout
,
dx
,
dy
);
}
}
};
...
...
paddle/fluid/operators/math/blas.h
浏览文件 @
bab11969
...
...
@@ -125,6 +125,12 @@ class Blas {
// Scaled vector accumulate over n elements. The CPU specialization forwards
// to CBlas<T>::AXPY(n, alpha, x, 1, y, 1), i.e. the BLAS axpy routine with
// unit strides (y <- alpha * x + y).
template <typename T>
void AXPY(int n, T alpha, const T *x, T *y) const;
// Elementwise vector add over n elements: reads x and y, writes the sum to z.
template <typename T>
void VADD(int n, const T *x, const T *y, T *z) const;
// Vector copy over n elements: x is the source, y the destination. The CPU
// specialization forwards to CBlas<T>::VCOPY with unit strides.
template <typename T>
void VCOPY(int n, const T *x, T *y) const;
// Matrix-vector product entry point.
// NOTE(review): the implementation is not visible in this view; presumably
// this follows BLAS gemv (C <- alpha * op(A) * B + beta * C, with trans_a
// selecting whether A is transposed) -- confirm against blas_impl.h.
template <typename T>
void GEMV(bool trans_a, int M, int N, T alpha, const T *A, const T *B, T beta,
          T *C) const;
...
...
@@ -163,6 +169,16 @@ class BlasT : private Blas<DeviceContext> {
Base
()
->
template
AXPY
<
T
>(
args
...);
}
// Typed convenience wrapper: forwards every argument (by value) to the
// underlying Blas object's VADD<T>, fixing the element type to this
// wrapper's T so callers need not spell it out.
template <typename... ARGS>
void VADD(ARGS... args) const {
  return Base()->template VADD<T>(args...);
}
// Typed convenience wrapper: forwards every argument (by value) to the
// underlying Blas object's VCOPY<T>, fixing the element type to this
// wrapper's T so callers need not spell it out.
template <typename... ARGS>
void VCOPY(ARGS... args) const {
  return Base()->template VCOPY<T>(args...);
}
template
<
typename
...
ARGS
>
void
GEMV
(
ARGS
...
args
)
const
{
Base
()
->
template
GEMV
<
T
>(
args
...);
...
...
paddle/fluid/operators/math/blas_impl.h
浏览文件 @
bab11969
...
...
@@ -34,6 +34,18 @@ struct CBlas<float> {
cblas_saxpy
(
args
...);
}
#ifdef PADDLE_WITH_MKLML
// Single-precision vector add: passes all arguments straight to vsAdd.
// Only compiled when PADDLE_WITH_MKLML is defined, so callers must provide
// their own fallback otherwise.
template <typename... ARGS>
static void VADD(ARGS... args) {
  vsAdd(args...);
}
#endif
// Single-precision vector copy: passes all arguments straight to
// cblas_scopy.
template <typename... ARGS>
static void VCOPY(ARGS... args) {
  cblas_scopy(args...);
}
template
<
typename
...
ARGS
>
static
void
GEMV
(
ARGS
...
args
)
{
cblas_sgemv
(
args
...);
...
...
@@ -59,6 +71,18 @@ struct CBlas<double> {
cblas_daxpy
(
args
...);
}
#ifdef PADDLE_WITH_MKLML
// Double-precision vector add: passes all arguments straight to vdAdd.
// Only compiled when PADDLE_WITH_MKLML is defined, so callers must provide
// their own fallback otherwise.
template <typename... ARGS>
static void VADD(ARGS... args) {
  vdAdd(args...);
}
#endif
// Double-precision vector copy: passes all arguments straight to
// cblas_dcopy.
template <typename... ARGS>
static void VCOPY(ARGS... args) {
  cblas_dcopy(args...);
}
template
<
typename
...
ARGS
>
static
void
GEMV
(
ARGS
...
args
)
{
cblas_dgemv
(
args
...);
...
...
@@ -139,6 +163,24 @@ void Blas<platform::CPUDeviceContext>::AXPY(int n, T alpha, const T *x,
CBlas
<
T
>::
AXPY
(
n
,
alpha
,
x
,
1
,
y
,
1
);
}
// CPU vector copy: copies n elements from x to y by calling CBlas<T>::VCOPY
// with a stride of 1 for both the source and the destination.
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VCOPY(int n, const T *x, T *y) const {
  constexpr int kUnitStride = 1;
  CBlas<T>::VCOPY(n, x, kUnitStride, y, kUnitStride);
}
// CPU elementwise vector add: z[i] = x[i] + y[i] for i in [0, n).
//
// With PADDLE_WITH_MKLML defined this is a single CBlas<T>::VADD call.
// Otherwise the sum is assembled from VCOPY + AXPY, which must be careful
// about aliasing: copying y into z first would destroy x when z and x are
// the same buffer (in-place add), and the following AXPY would then produce
// 2 * y instead of x + y. The fallback therefore:
//   * z == x : z already holds x, so only accumulate y into it;
//   * z == y : z already holds y, so skip the copy and accumulate x;
//   * else   : copy y into z, then accumulate x.
// Only exact pointer equality is detected; partially overlapping buffers are
// not supported.
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VADD(int n, const T *x, const T *y,
                                            T *z) const {
#ifdef PADDLE_WITH_MKLML
  CBlas<T>::VADD(n, x, y, z);
#else
  if (x == z) {
    this->template AXPY<T>(n, 1., y, z);
  } else {
    if (y != z) {
      this->template VCOPY<T>(n, y, z);
    }
    this->template AXPY<T>(n, 1., x, z);
  }
#endif
}
template
<
>
template
<
typename
T
>
void
Blas
<
platform
::
CPUDeviceContext
>::
GEMV
(
bool
trans_a
,
int
M
,
int
N
,
T
alpha
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录