PaddlePaddle / Paddle, commit ed8a9370
Authored by YuanRisheng on Mar 17, 2022; committed via GitHub on Mar 17, 2022.
move activation sigmoid (#40626)
Parent: 9ee03302

Showing 16 changed files with 563 additions and 608 deletions (+563 -608).
paddle/fluid/eager/tests/task_tests/generated_test.cc              +1    -1
paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc      +1    -1
paddle/fluid/framework/new_executor/standalone_executor_test.cc    +2    -2
paddle/fluid/inference/tensorrt/convert/test_activation_op.cc      +1    -1
paddle/fluid/operators/activation_op.cc                            +4    -24
paddle/fluid/operators/activation_op.h                             +5    -404
paddle/fluid/operators/activation_op.kps                           +3    -139
paddle/phi/kernels/activation_grad_kernel.h                        +65   -33
paddle/phi/kernels/activation_kernel.h                             +3    -1
paddle/phi/kernels/cpu/activation_grad_kernel.cc                   +29   -0
paddle/phi/kernels/cpu/activation_kernel.cc                        +9    -0
paddle/phi/kernels/funcs/activation_functor.h                      +317  -0
paddle/phi/kernels/gpu/activation_grad_kernel.cu                   +30   -0
paddle/phi/kernels/gpu/activation_kernel.cu                        +9    -0
paddle/phi/kernels/impl/activation_grad_impl.h                     +53   -0
paddle/phi/ops/compat/activation_sig.cc                            +31   -2
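The pattern across the list: the fluid-side files (activation_op.*) lose the sigmoid family of functors and registrations, while the phi-side headers and kernels gain them. For orientation, here is a small, self-contained scalar sketch (plain C++, not Paddle code) of the formulas these kernels implement: the sigmoid forward, the first-order gradient that only needs Out, and the double-grad identities quoted in the functor comments further down.

#include <cassert>
#include <cmath>
#include <cstdio>

// out      = sigmoid(x) = 1 / (1 + exp(-x))
// dx       = dout * out * (1 - out)     // first grad, depends only on Out
// ddout    = (1 - out) * out * ddx      // double grad, see SigmoidGradGradFunctor
// dout_new = (1 - 2 * out) * dout * ddx
double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

int main() {
  const double x = 0.3, dout = 0.7, ddx = 0.2;
  const double out = sigmoid(x);
  const double dx = dout * out * (1.0 - out);
  const double ddout = (1.0 - out) * out * ddx;
  const double dout_new = (1.0 - 2.0 * out) * dout * ddx;

  // Finite-difference check that out * (1 - out) really is d(sigmoid)/dx.
  const double eps = 1e-6;
  const double fd = (sigmoid(x + eps) - sigmoid(x - eps)) / (2.0 * eps);
  assert(std::fabs(fd - out * (1.0 - out)) < 1e-9);

  std::printf("dx=%g ddout=%g dout_new=%g\n", dx, ddout, dout_new);
  return 0;
}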
paddle/fluid/eager/tests/task_tests/generated_test.cc
@@ -128,6 +128,6 @@ TEST(Generated, ElementwiseAdd) {
 }  // namespace egr

-USE_OP(sigmoid);
+USE_OP_ITSELF(sigmoid);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_ITSELF(matmul_v2);
paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc
@@ -255,6 +255,6 @@ TEST(Hook_intermidiate, Matmul_v2) {
 }
 }  // namespace egr

-USE_OP(sigmoid);
+USE_OP_ITSELF(sigmoid);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_ITSELF(matmul_v2);
paddle/fluid/framework/new_executor/standalone_executor_test.cc
@@ -31,7 +31,7 @@ USE_OP(slice);
 USE_OP(concat);
 USE_OP(matmul);
 USE_OP_ITSELF(elementwise_add);
-USE_OP(sigmoid);
+USE_OP_ITSELF(sigmoid);
 USE_OP_ITSELF(tanh);
 USE_OP(elementwise_mul);
 USE_OP(softmax_with_cross_entropy);
@@ -47,7 +47,7 @@ USE_OP(square);
 USE_OP(transpose2_grad);
 USE_OP(concat_grad);
 USE_OP_ITSELF(elementwise_mul_grad);
-USE_OP(sigmoid_grad);
+USE_OP_ITSELF(sigmoid_grad);
 USE_OP_ITSELF(tanh_grad);
 USE_OP(sum);
 USE_OP(slice_grad);
paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -53,6 +53,6 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); }
 }  // namespace paddle

 USE_OP_ITSELF(relu);
-USE_OP(sigmoid);
+USE_OP_ITSELF(sigmoid);
 USE_OP_ITSELF(tanh);
 USE_OP(relu6);
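Across these four test files the only change is the registration macro: tests that used to pull in the sigmoid operator with USE_OP switch to USE_OP_ITSELF, because the fluid CPU/GPU sigmoid kernels that USE_OP also references are deleted later in this commit. As a rough, hypothetical illustration of the difference between the two macro families (names invented here; the real macros live in paddle/fluid/framework/op_registry.h and are more involved):

#include <cstdio>

// Toy stand-ins for the registrar "touch" functions the real macros reference.
int TouchSigmoidOpRegistrar() { return std::puts("sigmoid op linked"); }
int TouchSigmoidCpuKernelRegistrar() { return std::puts("sigmoid CPU kernel linked"); }

// USE_OP_ITSELF: force-link only the operator definition.
#define TOY_USE_OP_ITSELF(op) \
  static int use_op_itself_##op = Touch##op##OpRegistrar();

// USE_OP: operator definition plus a device kernel registration.
#define TOY_USE_OP(op)  \
  TOY_USE_OP_ITSELF(op) \
  static int use_op_kernel_##op = Touch##op##CpuKernelRegistrar();

TOY_USE_OP_ITSELF(Sigmoid)  // once the fluid kernels are gone, this is all a test needs

int main() { return 0; }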
paddle/fluid/operators/activation_op.cc
@@ -1492,6 +1492,10 @@ REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor,
 REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor,
                        TanhShrinkGradFunctor);
 REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor);
+REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,
+                       HardSigmoidGradFunctor);
+REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor,
+                       LogSigmoidGradFunctor);

 /* ========================== sigmoid register =============================
  */
@@ -1526,30 +1530,6 @@ REGISTER_OPERATOR(sigmoid_triple_grad,
                       ops::SigmoidTripleGradFunctor<float>::FwdDeps()>,
     ops::ActivationTripleGradOpInplaceInferer);

-// Register Sigmoid/GradSigmoid Kernels
-REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor,
-                               SigmoidGradFunctor);
-
-// Register DoubleGrad Kernel
-REGISTER_OP_CPU_KERNEL(
-    sigmoid_grad_grad,
-    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidGradGradFunctor<float>>,
-    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidGradGradFunctor<double>>,
-    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidGradGradFunctor<plat::float16>>);
-
-// Register TripleGrad Kernel
-REGISTER_OP_CPU_KERNEL(
-    sigmoid_triple_grad,
-    ops::SigmoidTripleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidTripleGradFunctor<float>>,
-    ops::SigmoidTripleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidTripleGradFunctor<double>>,
-    ops::SigmoidTripleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidTripleGradFunctor<plat::float16>>);
-
 /* ========================================================================== */

 /* ========================== tanh register ============================= */
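With the REGISTER_ACTIVATION_CPU_KERNEL / REGISTER_OP_CPU_KERNEL entries for sigmoid deleted here, the operator definition stays in fluid but kernel lookup now goes through the phi registrations added later in this diff (PD_REGISTER_ACTIVATION_KERNEL / PD_REGISTER_ACTIVATION_GRAD_KERNEL). As a schematic, self-contained sketch of what a string-keyed kernel registry of this kind boils down to (toy names, not Paddle's actual registry classes):

#include <functional>
#include <iostream>
#include <map>
#include <string>

// Minimal stand-in for a kernel registry: registration macros ultimately
// insert a callable under a (name, backend) key at static-init time.
using Kernel = std::function<void()>;
std::map<std::string, Kernel>& Registry() {
  static std::map<std::string, Kernel> r;
  return r;
}

struct Registrar {
  Registrar(const std::string& key, Kernel k) { Registry()[key] = std::move(k); }
};

// What a PD_REGISTER_*-style macro conceptually expands to.
#define TOY_REGISTER_KERNEL(name, backend, fn) \
  static Registrar reg_##name##_##backend{#name "/" #backend, fn};

TOY_REGISTER_KERNEL(sigmoid, CPU, [] { std::cout << "run sigmoid CPU kernel\n"; })

int main() {
  Registry().at("sigmoid/CPU")();  // looked up by op name + backend at run time
}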
paddle/fluid/operators/activation_op.h
@@ -238,15 +238,6 @@ struct BaseActivationFunctor {
   AttrPair GetAttrs() { return AttrPair(); }
 };

-// sigmoid(x) = 1 / (1 + exp(-x))
-template <typename T>
-struct SigmoidFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
-  }
-};
-
 #define USE_PHI_FUNCTOR(name)                         \
   template <typename T>                               \
   using name##Functor = phi::funcs::name##Functor<T>; \
@@ -285,160 +276,15 @@ USE_PHI_FUNCTOR(TanhShrink)
 USE_PHI_FUNCTOR(Silu)
 USE_PHI_FUNCTOR(ELU)
 USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU)
+USE_PHI_FUNCTOR(Sigmoid)
+USE_PHI_DOUBLE_GRAD_FUNCTOR(Sigmoid)
+USE_PHI_TRIPLE_GRAD_FUNCTOR(Sigmoid)
+USE_PHI_FUNCTOR(LogSigmoid)
+USE_PHI_FUNCTOR(HardSigmoid)

 template <typename T>
 using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor<T>;

-template <typename T>
-struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * out * (static_cast<T>(1) - out);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-/*
-    Out
-    DOut -> SigmoidGradGrad -> DOutNew
-    DDX                        DDOut
-
-    DDOut = (1-Out)*Out*DDX
-    DOutNew = (1-2*Out)*DOut*DDX
-*/
-template <typename T>
-struct SigmoidGradGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* Out,
-                  const framework::Tensor* ddX, const framework::Tensor* dOut,
-                  framework::Tensor* dOutNew, framework::Tensor* ddOut) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad"));
-    auto out = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad"));
-
-    if (dOutNew) {
-      auto dout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad"));
-      auto dout_new = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad"));
-      dout_new.device(*d) =
-          (static_cast<T>(1) - static_cast<T>(2) * out) * dout * ddx;
-    }
-    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad"));
-      ddout.device(*d) = (static_cast<T>(1) - out) * out * ddx;
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-/*
-    Out
-    DOut                            D_Dout
-    DDx     -> SigmoidTripleGrad -> D_DDx
-    D_DDout                         d_OutNew
-    D_Dout_new
-
-    D_Dout = (1-2*Out)*DDx*D_Dout_new
-    D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new
-    D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new
-
-    Out, DDX, DOut, D_DDOut, D_DOut_New   // input
-    D_OutNew, D_DOut, D_DDx               // output
-*/
-template <typename T>
-struct SigmoidTripleGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* Out,
-                  const framework::Tensor* ddX, const framework::Tensor* dOut,
-                  const framework::Tensor* d_DDOut,
-                  const framework::Tensor* d_dOut_New,
-                  framework::Tensor* d_d_Out, framework::Tensor* d_Out_New,
-                  framework::Tensor* d_DDx) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad"));
-    auto out = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad"));
-    auto dout = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad"));
-    auto d_ddOut = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad"));
-    auto d_dOutNew = framework::EigenVector<T>::Flatten(GET_DATA_SAFELY(
-        d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad"));
-
-    if (d_Out_New) {
-      auto d_OutNew = framework::EigenVector<T>::Flatten(GET_DATA_SAFELY(
-          d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad"));
-      d_OutNew.device(*d) = (ddx - static_cast<T>(2) * out * ddx) * d_ddOut -
-                            static_cast<T>(2) * dout * ddx * d_dOutNew;
-    }
-    if (d_d_Out) {
-      auto d_dOut = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad"));
-      d_dOut.device(*d) =
-          (static_cast<T>(1) - static_cast<T>(2) * out) * ddx * d_dOutNew;
-    }
-    if (d_DDx) {
-      auto d_ddx = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad"));
-      d_ddx.device(*d) =
-          (static_cast<T>(1) - out) * out * d_ddOut +
-          (static_cast<T>(1) - static_cast<T>(2) * out) * dout * d_dOutNew;
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-// Originally: logsigmoid(x) = -log (1 + exp(-x))
-// For numerical stability, we can use the log-sum-exp trick:
-// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
-// We can rewrite the above equation as:
-// out = -log( exp(0) + exp(-x)) [since exp(0) = 1]
-//     = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0)))
-//     = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x -
-//            max(-x, 0)))
-//     = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
-//     = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))
-//
-// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0))
-// + exp(-x - max(-x, 0))))
-template <typename T>
-struct LogSigmoidFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
-    out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log());
-  }
-};
-
-// Originally: f' = exp(-x) / (1 + exp(-x))
-// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) +
-// exp(-x - max(-x, 0)))
-template <typename T>
-struct LogSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
-    dx.device(d) =
-        dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp()));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 // exp(x) = e^x
 template <typename T>
 struct ExpFunctor : public BaseActivationFunctor<T> {
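The LogSigmoid comment block above motivates the max(-x, 0) rewrite: it keeps every exponent non-positive, so nothing overflows even for large negative x. A standalone numerical check of that claim (plain C++, independent of the Eigen expressions in the functor):

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>

double logsigmoid_naive(double x) { return -std::log(1.0 + std::exp(-x)); }

// The rewritten form from the comment:
// -(max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))),
// in which every exponent is <= 0.
double logsigmoid_stable(double x) {
  const double t = std::max(-x, 0.0);
  return -(t + std::log(std::exp(-t) + std::exp(-x - t)));
}

int main() {
  for (double x : {-20.0, -1.0, 0.0, 0.5, 30.0}) {
    assert(std::fabs(logsigmoid_naive(x) - logsigmoid_stable(x)) < 1e-12);
  }
  // exp(800) overflows a double, so the naive form returns -inf here, while
  // the stable form stays finite (about -800).
  std::printf("naive=%g stable=%g\n", logsigmoid_naive(-800.0),
              logsigmoid_stable(-800.0));
  return 0;
}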
@@ -1101,43 +947,6 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };

-template <typename T>
-struct HardSigmoidFunctor : public BaseActivationFunctor<T> {
-  float slope;
-  float offset;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto temp = x * static_cast<T>(slope) + static_cast<T>(offset);
-    out.device(d) =
-        temp.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(1));
-  }
-};
-
-template <typename T>
-struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  float slope;
-  float offset;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout *
-                   ((out > static_cast<T>(0)) * (out < static_cast<T>(1)))
-                       .template cast<T>() *
-                   static_cast<T>(slope);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
 template <typename T>
 struct SwishFunctor : public BaseActivationFunctor<T> {
   float beta;
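The hard_sigmoid pair removed here is a piecewise-linear approximation of the logistic function: the forward clamps x * slope + offset to [0, 1], and the backward passes dout * slope only strictly inside that interval. A small scalar sketch of exactly those two formulas (plain C++, not the Eigen expression code above; 0.2 and 0.5 are the commonly used slope/offset values):

#include <algorithm>
#include <cassert>

// hard_sigmoid(x) = clamp(x * slope + offset, 0, 1)
double hard_sigmoid(double x, double slope, double offset) {
  return std::min(1.0, std::max(0.0, x * slope + offset));
}

// dx = (0 < out && out < 1) ? dout * slope : 0
double hard_sigmoid_grad(double out, double dout, double slope) {
  return (out > 0.0 && out < 1.0) ? dout * slope : 0.0;
}

int main() {
  const double slope = 0.2, offset = 0.5;
  assert(hard_sigmoid(10.0, slope, offset) == 1.0);   // saturated high
  assert(hard_sigmoid(-10.0, slope, offset) == 0.0);  // saturated low
  const double out = hard_sigmoid(1.0, slope, offset);  // inside the linear band
  assert(hard_sigmoid_grad(out, 1.0, slope) == slope);
  assert(hard_sigmoid_grad(1.0, 1.0, slope) == 0.0);  // no gradient at saturation
  return 0;
}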
@@ -1365,211 +1174,6 @@ inline void ExtractDoubleGradTensorWithInputDOut(
   }
 }

-template <typename DeviceContext, typename Functor>
-class SigmoidDoubleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut;
-    framework::Tensor *dOutNew, *ddOut;
-    Out = ddX = dOut = nullptr;
-    dOutNew = ddOut = nullptr;
-    // extract ddx(input) and out(input)
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-    PADDLE_ENFORCE_NOT_NULL(
-        ddX, platform::errors::NotFound(
-                 "Cannot get input Variable ddX, variable name = %s",
-                 ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        Out, platform::errors::NotFound(
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.InputName("Out")));
-    // set output ddout
-    ddOut = ctx.Output<framework::Tensor>("DDOut");
-    // extract dOut(intput)
-    dOut = ctx.Input<framework::Tensor>("DOut");
-    PADDLE_ENFORCE_NOT_NULL(
-        dOut, platform::errors::NotFound(
-                  "Cannot get input Variable dOut, variable name = %s",
-                  ctx.InputName("DOut")));
-    dOutNew = ctx.Output<framework::Tensor>("DOutNew");
-    if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, dOutNew, ddOut);
-  }
-};
-
-// Out, DDX, DOut, D_DDOut, D_DOut_New   // input
-// D_OutNew, D_DOut, D_DDx               // output
-template <typename DeviceContext, typename Functor>
-class SigmoidTripleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew;
-    framework::Tensor *d_OutNew, *d_dOut, *d_ddx;
-    Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr;
-    d_OutNew = d_dOut = d_ddx = nullptr;
-    // extract ddx(input), out(input), dOut(input), d_ddOut(input),
-    // d_dOutNew(input)
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-    dOut = ctx.Input<framework::Tensor>("DOut");
-    d_ddOut = ctx.Input<framework::Tensor>("D_DDOut");
-    d_dOutNew = ctx.Input<framework::Tensor>("D_DOut_New");
-    PADDLE_ENFORCE_NOT_NULL(
-        ddX, platform::errors::NotFound(
-                 "Cannot get input Variable ddX, variable name = %s",
-                 ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        Out, platform::errors::NotFound(
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.InputName("Out")));
-    PADDLE_ENFORCE_NOT_NULL(
-        dOut, platform::errors::NotFound(
-                  "Cannot get input Variable dOut, variable name = %s",
-                  ctx.InputName("DOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_ddOut, platform::errors::NotFound(
-                     "Cannot get input Variable d_ddOut, variable name = %s",
-                     ctx.InputName("D_DDOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_dOutNew,
-        platform::errors::NotFound(
-            "Cannot get input Variable d_dOutNew, variable name = %s",
-            ctx.InputName("D_DOutNew")));
-    // set output d_OutNew、d_dOut、d_ddx
-    d_dOut = ctx.Output<framework::Tensor>("D_DOut");
-    d_OutNew = ctx.Output<framework::Tensor>("D_OutNew");
-    d_ddx = ctx.Output<framework::Tensor>("D_DDx");
-    if (d_dOut) d_dOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_OutNew) d_OutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_ddx) d_ddx->mutable_data<T>(ddX->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew,  // input
-            d_dOut, d_OutNew, d_ddx);                   // output
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class TanhDoubleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut;
-    framework::Tensor *dOutNew, *ddOut;
-    Out = ddX = dOut = nullptr;
-    dOutNew = ddOut = nullptr;
-    // extract ddx(input) and out(input)
-    auto ddx_var = ctx.InputVar("DDX");
-    auto out_var = ctx.InputVar("Out");
-    PADDLE_ENFORCE_NOT_NULL(
-        ddx_var, platform::errors::NotFound(
-                     "Cannot get input Variable ddx, variable name = %s",
-                     ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        out_var, platform::errors::NotFound(
-                     "Cannot get input Variable out, variable name = %s",
-                     ctx.InputName("Out")));
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-    // set output ddout
-    auto ddout_var = ctx.OutputVar("DDOut");
-    if (ddout_var) {
-      ddOut = ctx.Output<framework::Tensor>("DDOut");
-    }
-    // extract dOut(intput)
-    auto dout_var = ctx.InputVar("DOut");
-    PADDLE_ENFORCE_NOT_NULL(
-        dout_var, platform::errors::NotFound(
-                      "Cannot get input Variable dout_var, variable name = %s",
-                      ctx.InputName("DOut")));
-    dOut = ctx.Input<framework::Tensor>("DOut");
-    // set output dout_new
-    auto dout_new_var = ctx.OutputVar("DOutNew");
-    if (dout_new_var) {
-      dOutNew = ctx.Output<framework::Tensor>("DOutNew");
-    }
-    if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, dOutNew, ddOut);
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class TanhTripeGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew;
-    framework::Tensor *d_OutNew, *d_dOut, *d_ddx;
-    Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr;
-    d_OutNew = d_dOut = d_ddx = nullptr;
-    // extract ddx(input), out(input), dOut(input), d_ddOut(input),
-    // d_dOutNew(input)
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-    dOut = ctx.Input<framework::Tensor>("DOut");
-    d_ddOut = ctx.Input<framework::Tensor>("D_DDOut");
-    d_dOutNew = ctx.Input<framework::Tensor>("D_DOut_New");
-    PADDLE_ENFORCE_NOT_NULL(
-        ddX, platform::errors::NotFound(
-                 "Cannot get input Variable ddX, variable name = %s",
-                 ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        Out, platform::errors::NotFound(
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.InputName("Out")));
-    PADDLE_ENFORCE_NOT_NULL(
-        dOut, platform::errors::NotFound(
-                  "Cannot get input Variable dOut, variable name = %s",
-                  ctx.InputName("DOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_ddOut, platform::errors::NotFound(
-                     "Cannot get input Variable d_ddOut, variable name = %s",
-                     ctx.InputName("D_DDOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_dOutNew,
-        platform::errors::NotFound(
-            "Cannot get input Variable d_dOutNew, variable name = %s",
-            ctx.InputName("D_DOutNew")));
-    // set output d_OutNew、d_dOut、d_ddx
-    d_dOut = ctx.Output<framework::Tensor>("D_DOut");
-    d_OutNew = ctx.Output<framework::Tensor>("D_OutNew");
-    d_ddx = ctx.Output<framework::Tensor>("D_DDx");
-    if (d_dOut) d_dOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_OutNew) d_OutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_ddx) d_ddx->mutable_data<T>(ddX->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew,  // input
-            d_dOut, d_OutNew, d_ddx);                   // output
-  }
-};
-
 template <typename DeviceContext, typename Functor>
 class SquareDoubleGradKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -1952,7 +1556,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
 }  // namespace paddle

 #define FOR_EACH_ACTIVATION_OP(__macro)                                      \
-  __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
   __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor);                         \
   __macro(floor, Floor, FloorFunctor, ZeroGradFunctor);                      \
   __macro(round, Round, RoundFunctor, ZeroGradFunctor);                      \
@@ -1965,8 +1568,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
   __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor);         \
   __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor);         \
   __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor);                     \
-  __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,                     \
-          HardSigmoidGradFunctor);                                           \
   __macro(swish, Swish, SwishFunctor, SwishGradFunctor);                     \
   __macro(mish, Mish, MishFunctor, MishGradFunctor);                         \
   __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor);
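FOR_EACH_ACTIVATION_OP is an X-macro: each remaining activation is listed once, and the list is replayed with whatever __macro the caller supplies (operator registration, kernel registration, and so on), so deleting the logsigmoid and hard_sigmoid rows here takes them out of every replay at once. A self-contained sketch of the technique (generic C++ with made-up names, not the Paddle macro):

#include <iostream>

// X-macro: the table is written once and expanded many times with different
// per-row macros.
#define FOR_EACH_TOY_ACTIVATION(__macro) \
  __macro(relu)                          \
  __macro(tanh)                          \
  __macro(swish)

// Expansion 1: declare a function per row.
#define DECLARE_FN(name) void run_##name();
FOR_EACH_TOY_ACTIVATION(DECLARE_FN)

// Expansion 2: define each function per row.
#define DEFINE_FN(name) \
  void run_##name() { std::cout << "running " #name "\n"; }
FOR_EACH_TOY_ACTIVATION(DEFINE_FN)

int main() {
  run_relu();
  run_tanh();
  run_swish();
}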
paddle/fluid/operators/activation_op.kps
@@ -20,69 +20,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
struct CudaSigmoidFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
// sigmoid(x) = 1 / (1 + exp(-x))
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(one / (one + exp(-x)));
}
};
template <typename T>
struct CudaSigmoidGradFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f);
// dx = dout * out * (1 - out)
__device__ __forceinline__ T operator()(const T dout, const T out) const {
return dout * out * (one - out);
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaLogSigmoidFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType zero = static_cast<MPType>(0.0f);
// logsigmoid(x) = log(1 / (1 + exp(-x)))
// For numerical stability,
// logsigmoid(x) =
// - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
MPType temp = x > zero ? zero : -x;
return static_cast<T>(-temp - log(exp(-temp) + exp(-x - temp)));
}
};
template <typename T>
struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType zero = static_cast<MPType>(0.0f);
// dx = dout * exp(-x) / (1 + exp(-x))
// For numerical stability:
// dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x,
// 0)))
__device__ __forceinline__ T operator()(const T arg_dout,
const T arg_x) const {
MPType dout = static_cast<MPType>(arg_dout);
MPType x = static_cast<MPType>(arg_x);
MPType temp1 = x > zero ? zero : -x;
MPType temp2 = exp(-x - temp1);
return static_cast<T>(dout * (temp2 / (exp(-temp1) + temp2)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaCeilFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
@@ -551,49 +488,6 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
}
};
template <typename T>
struct CudaHardSigmoidFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
T one = static_cast<T>(1.0f);
float slope;
float offset;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"slope", &slope}, {"offset", &offset}};
}
// hard_sigmoid(x) = 0, when x <= -3
// 1, when x >= 3
// x * slope + offset, otherwise
__device__ __forceinline__ T operator()(const T x) const {
T temp = x * static_cast<T>(slope) + static_cast<T>(offset);
T temp_max = temp > zero ? temp : zero;
T temp_min = temp_max < one ? temp_max : one;
return temp_min;
}
};
template <typename T>
struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor<T> {
T zero = static_cast<T>(0.0f);
T one = static_cast<T>(1.0f);
float slope;
float offset;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"slope", &slope}, {"offset", &offset}};
}
// dx = (out > 0 && out < 1) ? dout * slope : 0
__device__ __forceinline__ T operator()(const T dout, const T out) const {
return (out > zero && out < one) ? dout * static_cast<T>(slope) : zero;
}
static constexpr ActBwdOpFwdDeps FwdDeps() {
return ActBwdOpFwdDeps::kDepOut;
}
};
template <typename T>
struct CudaSwishFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
@@ -876,6 +770,9 @@ USE_PHI_FUNCTOR(CudaSoftShrink)
USE_PHI_FUNCTOR(CudaTanhShrink)
USE_PHI_FUNCTOR(CudaSilu)
USE_PHI_FUNCTOR(CudaELU)
USE_PHI_FUNCTOR(CudaSigmoid)
USE_PHI_FUNCTOR(CudaLogSigmoid)
USE_PHI_FUNCTOR(CudaHardSigmoid)
template <typename T>
using CudaELUGradNegativeAlphaFunctor =
@@ -954,35 +851,6 @@ REGISTER_OP_CUDA_KERNEL(
ops::CELUGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* =========================== sigmoid register ============================
*/
REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
CudaSigmoidGradFunctor);
REGISTER_OP_CUDA_KERNEL(
sigmoid_grad_grad,
ops::SigmoidDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::SigmoidGradGradFunctor<float>>,
ops::SigmoidDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::SigmoidGradGradFunctor<double>>,
ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
ops::SigmoidGradGradFunctor<plat::float16>>,
ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
ops::SigmoidGradGradFunctor<plat::bfloat16>>);
REGISTER_OP_CUDA_KERNEL(
sigmoid_triple_grad,
ops::SigmoidTripleGradKernel<paddle::platform::CUDADeviceContext,
ops::SigmoidTripleGradFunctor<float>>,
ops::SigmoidTripleGradKernel<paddle::platform::CUDADeviceContext,
ops::SigmoidTripleGradFunctor<double>>,
ops::SigmoidTripleGradKernel<plat::CUDADeviceContext,
ops::SigmoidTripleGradFunctor<plat::float16>>,
ops::SigmoidTripleGradKernel<
plat::CUDADeviceContext,
ops::SigmoidTripleGradFunctor<plat::bfloat16>>);
/* ========================================================================== */
/* =========================== sqrt register ============================= */
REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor,
CudaSqrtGradFunctor);
@@ -1120,8 +988,6 @@ REGISTER_OP_CUDA_KERNEL(
/* ========================================================================== */
#define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \
__macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \
CudaLogSigmoidGradFunctor); \
__macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \
CudaSoftShrinkGradFunctor); \
__macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor); \
@@ -1141,8 +1007,6 @@ REGISTER_OP_CUDA_KERNEL(
CudaTanhShrinkGradFunctor); \
__macro(hard_shrink, HardShrink, CudaHardShrinkFunctor, \
CudaHardShrinkGradFunctor); \
__macro(hard_sigmoid, HardSigmoid, CudaHardSigmoidFunctor, \
CudaHardSigmoidGradFunctor); \
__macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \
__macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor); \
__macro(hard_swish, HardSwish, CudaHardSwishFunctor, \
paddle/phi/kernels/activation_grad_kernel.h
@@ -19,14 +19,14 @@ limitations under the License. */

 namespace phi {

-#define DECLARE_ACTIVATION_GRAD_KERNEL_DepX(name)   \
+#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(name)   \
   template <typename T, typename Context>           \
   void name##GradKernel(const Context& dev_ctx,     \
                         const DenseTensor& x,       \
                         const DenseTensor& dout,    \
                         DenseTensor* dx);

-#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(name, attr) \
+#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(name, attr) \
   template <typename T, typename Context>                       \
   void name##GradKernel(const Context& dev_ctx,                 \
                         const DenseTensor& x,                   \
@@ -34,7 +34,7 @@ namespace phi {
                         float attr,                             \
                         DenseTensor* dx);

-#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(name, attr1, attr2) \
+#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(name, attr1, attr2) \
   template <typename T, typename Context>                               \
   void name##GradKernel(const Context& dev_ctx,                         \
                         const DenseTensor& x,                           \
@@ -43,19 +43,28 @@ namespace phi {
                         float attr2,                                    \
                         DenseTensor* dx);

-#define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \
+#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(name) \
   template <typename T, typename Context>           \
   void name##GradKernel(const Context& dev_ctx,     \
                         const DenseTensor& out,     \
                         const DenseTensor& dout,    \
                         DenseTensor* dx);

-#define DECLARE_ACTIVATION_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut(name, attr) \
-  template <typename T, typename Context>                                \
-  void name##GradKernel(const Context& dev_ctx,                          \
-                        const DenseTensor& out,                          \
-                        const DenseTensor& dout,                         \
-                        float attr,                                      \
+#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(name, attr) \
+  template <typename T, typename Context>                         \
+  void name##GradKernel(const Context& dev_ctx,                   \
+                        const DenseTensor& out,                   \
+                        const DenseTensor& dout,                  \
+                        float attr,                               \
                         DenseTensor* dx);

+#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(name, attr1, attr2) \
+  template <typename T, typename Context>                                 \
+  void name##GradKernel(const Context& dev_ctx,                           \
+                        const DenseTensor& out,                           \
+                        const DenseTensor& dout,                          \
+                        float attr1,                                      \
+                        float attr2,                                      \
+                        DenseTensor* dx);
+
 template <typename T, typename Context>
@@ -107,28 +116,51 @@ void EluDoubleGradKernel(const Context& dev_ctx,
                          DenseTensor* dx,
                          DenseTensor* ddout);

-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sin);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asin);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atan);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sinh);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(TanhShrink);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Silu);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh);
-DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, alpha)
-DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, threshold)
-DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(SoftShrink, lambda)
-DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(HardShrink, threshold)
-DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, t_min, t_max)
+template <typename T, typename Context>
+void SigmoidDoubleGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out,
+                             const DenseTensor& ddx,
+                             const DenseTensor& dout,
+                             DenseTensor* dout_new,
+                             DenseTensor* ddout);
+
+template <typename T, typename Context>
+void SigmoidTripleGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out,
+                             const DenseTensor& ddx,
+                             const DenseTensor& dout,
+                             const DenseTensor& d_ddout,
+                             const DenseTensor& d_dout_new,
+                             DenseTensor* d_out_new,
+                             DenseTensor* d_dout,
+                             DenseTensor* d_ddx);
+
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cos);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Tan);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acos);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Sin);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Asin);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atan);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Sinh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cosh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Asinh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acosh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atanh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Silu);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid);
+DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, alpha);
+DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, threshold);
+DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda);
+DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold);
+DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max);
+DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, slope, offset);

 }  // namespace phi
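Each DECLARE_* macro above only stamps out a kernel declaration with a fixed parameter list. For instance, DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid), added in this hunk, expands to the declaration below (written out by hand here for readability; the DEPOUT family takes the forward output out rather than the input x, because sigmoid's gradient dout * out * (1 - out) only needs Out):

// Hand expansion of DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid).
template <typename T, typename Context>
void SigmoidGradKernel(const Context& dev_ctx,
                       const DenseTensor& out,
                       const DenseTensor& dout,
                       DenseTensor* dx);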
paddle/phi/kernels/activation_kernel.h
@@ -54,6 +54,8 @@ DECLARE_ACTIVATION_KERNEL(Relu)
 DECLARE_ACTIVATION_KERNEL(Tanh)
 DECLARE_ACTIVATION_KERNEL(TanhShrink)
 DECLARE_ACTIVATION_KERNEL(Silu)
+DECLARE_ACTIVATION_KERNEL(Sigmoid)
+DECLARE_ACTIVATION_KERNEL(LogSigmoid)

 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold)
@@ -62,5 +64,5 @@ DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha)

 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max)
+DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset)

 }  // namespace phi
paddle/phi/kernels/cpu/activation_grad_kernel.cc
@@ -90,6 +90,23 @@ namespace phi {
         dev_ctx, nullptr, &out, &dout, dx, functor);              \
   }

+#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(         \
+    name, functor_class, attr1, attr2)                            \
+  template <typename T, typename Context>                         \
+  void name##GradKernel(const Context& dev_ctx,                   \
+                        const DenseTensor& out,                   \
+                        const DenseTensor& dout,                  \
+                        float attr1,                              \
+                        float attr2,                              \
+                        DenseTensor* dx) {                        \
+    funcs::functor_class<T> functor;                              \
+    auto attrs = functor.GetAttrs();                              \
+    *(attrs[0].second) = attr1;                                   \
+    *(attrs[1].second) = attr2;                                   \
+    ActivationGradImpl<T, Context, funcs::functor_class<T>>(      \
+        dev_ctx, nullptr, &out, &dout, dx, functor);              \
+  }
+
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CosGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, TanGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, AcosGradFunctor);
@@ -103,9 +120,11 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, SigmoidGradFunctor);

 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
                                                LeakyReluGradFunctor,
@@ -125,6 +144,11 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
                                                t_min,
                                                t_max);
+DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid,
+                                                  HardSigmoidGradFunctor,
+                                                  slope,
+                                                  offset);

 template <typename T, typename Context>
 void EluGradKernel(const Context& dev_ctx,
                    const DenseTensor& x,
@@ -204,3 +228,8 @@ PD_REGISTER_KERNEL(tanh_triple_grad,
                    float,
                    double,
                    phi::dtype::float16) {}
+
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel)
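The *_WITH_TWO_ATTRS_DEPOUT macro added above binds runtime float attributes to the functor through GetAttrs(), which returns name/pointer pairs that the kernel writes into before launching the expression. A minimal standalone sketch of that binding pattern (generic C++ mirroring HardSigmoidGradFunctor's GetAttrs, with made-up names rather than Paddle's types):

#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Functor that exposes its tunable attributes as (name, pointer) pairs, in the
// same spirit as BaseActivationFunctor<T>::AttrPair in the diff.
struct ToyHardSigmoidGrad {
  float slope = 0.f;
  float offset = 0.f;
  std::vector<std::pair<std::string, float*>> GetAttrs() {
    return {{"slope", &slope}, {"offset", &offset}};
  }
  float operator()(float out, float dout) const {
    return (out > 0.f && out < 1.f) ? dout * slope : 0.f;
  }
};

int main() {
  ToyHardSigmoidGrad functor;
  auto attrs = functor.GetAttrs();
  *(attrs[0].second) = 0.2f;  // attr1 -> slope, exactly what the macro does
  *(attrs[1].second) = 0.5f;  // attr2 -> offset
  assert(functor(0.7f, 1.0f) == 0.2f);
  return 0;
}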
paddle/phi/kernels/cpu/activation_kernel.cc
@@ -72,6 +72,8 @@ DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor)

 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
@@ -82,6 +84,10 @@ DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha)

 DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max)
+DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
+                                     HardSigmoidFunctor,
+                                     slope,
+                                     offset)

 }  // namespace phi

 PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {}
@@ -109,3 +115,6 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
paddle/phi/kernels/funcs/activation_functor.h
@@ -1012,6 +1012,217 @@ struct SiluGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };

+// sigmoid(x) = 1 / (1 + exp(-x))
+template <typename T>
+struct SigmoidFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
+  }
+};
+
+template <typename T>
+struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * out * (static_cast<T>(1) - out);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+/*
+    Out
+    DOut -> SigmoidGradGrad -> DOutNew
+    DDX                        DDOut
+
+    DDOut = (1-Out)*Out*DDX
+    DOutNew = (1-2*Out)*DOut*DDX
+*/
+template <typename T>
+struct SigmoidGradGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev, const DenseTensor* Out,
+                  const DenseTensor* ddX, const DenseTensor* dOut,
+                  DenseTensor* dOutNew, DenseTensor* ddOut) const {
+    auto* d = dev.eigen_device();
+    auto ddx = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad"));
+    auto out = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad"));
+
+    if (dOutNew) {
+      auto dout = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad"));
+      auto dout_new = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad"));
+      dout_new.device(*d) =
+          (static_cast<T>(1) - static_cast<T>(2) * out) * dout * ddx;
+    }
+    if (ddOut) {
+      auto ddout = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad"));
+      ddout.device(*d) = (static_cast<T>(1) - out) * out * ddx;
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+/*
+    Out
+    DOut                            D_Dout
+    DDx     -> SigmoidTripleGrad -> D_DDx
+    D_DDout                         d_OutNew
+    D_Dout_new
+
+    D_Dout = (1-2*Out)*DDx*D_Dout_new
+    D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new
+    D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new
+
+    Out, DDX, DOut, D_DDOut, D_DOut_New   // input
+    D_OutNew, D_DOut, D_DDx               // output
+*/
+template <typename T>
+struct SigmoidTripleGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev, const DenseTensor* Out,
+                  const DenseTensor* ddX, const DenseTensor* dOut,
+                  const DenseTensor* d_DDOut, const DenseTensor* d_dOut_New,
+                  DenseTensor* d_d_Out, DenseTensor* d_Out_New,
+                  DenseTensor* d_DDx) const {
+    auto* d = dev.eigen_device();
+    auto ddx = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad"));
+    auto out = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad"));
+    auto dout = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad"));
+    auto d_ddOut = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad"));
+    auto d_dOutNew = EigenVector<T>::Flatten(GET_DATA_SAFELY(
+        d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad"));
+
+    if (d_Out_New) {
+      auto d_OutNew = EigenVector<T>::Flatten(GET_DATA_SAFELY(
+          d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad"));
+      d_OutNew.device(*d) = (ddx - static_cast<T>(2) * out * ddx) * d_ddOut -
+                            static_cast<T>(2) * dout * ddx * d_dOutNew;
+    }
+    if (d_d_Out) {
+      auto d_dOut = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad"));
+      d_dOut.device(*d) =
+          (static_cast<T>(1) - static_cast<T>(2) * out) * ddx * d_dOutNew;
+    }
+    if (d_DDx) {
+      auto d_ddx = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad"));
+      d_ddx.device(*d) =
+          (static_cast<T>(1) - out) * out * d_ddOut +
+          (static_cast<T>(1) - static_cast<T>(2) * out) * dout * d_dOutNew;
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+// Originally: logsigmoid(x) = -log (1 + exp(-x))
+// For numerical stability, we can use the log-sum-exp trick:
+// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
+// We can rewrite the above equation as:
+// out = -log( exp(0) + exp(-x)) [since exp(0) = 1]
+//     = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0)))
+//     = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x -
+//            max(-x, 0)))
+//     = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
+//     = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))
+//
+// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0))
+// + exp(-x - max(-x, 0))))
+template <typename T>
+struct LogSigmoidFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
+    out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log());
+  }
+};
+
+// Originally: f' = exp(-x) / (1 + exp(-x))
+// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) +
+// exp(-x - max(-x, 0)))
+template <typename T>
+struct LogSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
+    dx.device(d) =
+        dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp()));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct HardSigmoidFunctor : public BaseActivationFunctor<T> {
+  float slope;
+  float offset;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto temp = x * static_cast<T>(slope) + static_cast<T>(offset);
+    out.device(d) =
+        temp.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(1));
+  }
+};
+
+template <typename T>
+struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  float slope;
+  float offset;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout *
+                   ((out > static_cast<T>(0)) * (out < static_cast<T>(1)))
+                       .template cast<T>() *
+                   static_cast<T>(slope);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
 #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
 template <typename T>
 struct CudaReluFunctor : public BaseActivationFunctor<T> {
@@ -1653,6 +1864,112 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };

+template <typename T>
+struct CudaSigmoidFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+
+  // sigmoid(x) = 1 / (1 + exp(-x))
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(one / (one + exp(-x)));
+  }
+};
+
+template <typename T>
+struct CudaSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  T one = static_cast<T>(1.0f);
+
+  // dx = dout * out * (1 - out)
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    return dout * out * (one - out);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+template <typename T>
+struct CudaLogSigmoidFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType zero = static_cast<MPType>(0.0f);
+
+  // logsigmoid(x) = log(1 / (1 + exp(-x)))
+  // For numerical stability,
+  // logsigmoid(x) =
+  //   - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    MPType temp = x > zero ? zero : -x;
+    return static_cast<T>(-temp - log(exp(-temp) + exp(-x - temp)));
+  }
+};
+
+template <typename T>
+struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType zero = static_cast<MPType>(0.0f);
+
+  // dx = dout * exp(-x) / (1 + exp(-x))
+  // For numerical stability:
+  // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x,
+  // 0)))
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    MPType temp1 = x > zero ? zero : -x;
+    MPType temp2 = exp(-x - temp1);
+    return static_cast<T>(dout * (temp2 / (exp(-temp1) + temp2)));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaHardSigmoidFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  T one = static_cast<T>(1.0f);
+  float slope;
+  float offset;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+
+  // hard_sigmoid(x) = 0, when x <= -3
+  //                   1, when x >= 3
+  //                   x * slope + offset, otherwise
+  __device__ __forceinline__ T operator()(const T x) const {
+    T temp = x * static_cast<T>(slope) + static_cast<T>(offset);
+    T temp_max = temp > zero ? temp : zero;
+    T temp_min = temp_max < one ? temp_max : one;
+    return temp_min;
+  }
+};
+
+template <typename T>
+struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  T one = static_cast<T>(1.0f);
+  float slope;
+  float offset;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+
+  // dx = (out > 0 && out < 1) ? dout * slope : 0
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    return (out > zero && out < one) ? dout * static_cast<T>(slope) : zero;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
 #endif

 }  // namespace funcs
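The CUDA functors added above do their arithmetic in MPType, the type produced by MPTypeTrait<T>::Type, which maps reduced-precision element types such as float16 to float, and cast back to T only when storing, so the exp/log inside sigmoid and logsigmoid run at full precision. A generic sketch of that "compute wide, store narrow" trait pattern (stand-in Half type and trait name invented here, not Paddle's phi::dtype types):

#include <cmath>
#include <cstdint>
#include <cstdio>

// Stand-in 16-bit storage type; real code would use a proper half type.
struct Half { std::uint16_t bits; };

// Trait mapping a storage type to the type arithmetic should be done in.
template <typename T> struct MPTypeTraitSketch { using Type = T; };
template <> struct MPTypeTraitSketch<Half> { using Type = float; };

template <typename T>
float sigmoid_wide(float x_as_float) {
  using MPType = typename MPTypeTraitSketch<T>::Type;
  MPType x = static_cast<MPType>(x_as_float);  // widen before the math
  MPType one = static_cast<MPType>(1.0f);
  return static_cast<float>(one / (one + std::exp(-x)));  // narrow on store
}

int main() {
  std::printf("%f\n", sigmoid_wide<float>(0.3f));
  std::printf("%f\n", sigmoid_wide<Half>(0.3f));  // Half still computes in float
}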
paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -142,8 +142,27 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
         dev_ctx, nullptr, &out, &dout, dx, functor);              \
   }

+#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(         \
+    name, functor_class, attr1, attr2)                            \
+  template <typename T, typename Context>                         \
+  void name##GradKernel(const Context& dev_ctx,                   \
+                        const DenseTensor& out,                   \
+                        const DenseTensor& dout,                  \
+                        float attr1,                              \
+                        float attr2,                              \
+                        DenseTensor* dx) {                        \
+    funcs::functor_class<T> functor;                              \
+    auto attrs = functor.GetAttrs();                              \
+    *(attrs[0].second) = attr1;                                   \
+    *(attrs[1].second) = attr2;                                   \
+    ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>(   \
+        dev_ctx, nullptr, &out, &dout, dx, functor);              \
+  }
+
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, CudaReluGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, CudaTanhGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, CudaSigmoidGradFunctor);

 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CudaCosGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, CudaTanGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, CudaAcosGradFunctor);
@@ -157,6 +176,7 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, CudaAcoshGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor);

 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
                                                CudaLeakyReluGradFunctor,
@@ -176,6 +196,11 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
                                                t_min,
                                                t_max);
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid,
+                                                  CudaHardSigmoidGradFunctor,
+                                                  slope,
+                                                  offset);

 template <typename T, typename Context>
 void EluGradKernel(const Context& dev_ctx,
                    const DenseTensor& x,
@@ -270,3 +295,8 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel)
paddle/phi/kernels/gpu/activation_kernel.cu
@@ -91,6 +91,8 @@ DEFINE_GPU_ACTIVATION_KERNEL(Relu, CudaReluFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Tanh, CudaTanhFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor)

 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
@@ -103,6 +105,10 @@ DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha)

 DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max)
+DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
+                                     CudaHardSigmoidFunctor,
+                                     slope,
+                                     offset)

 }  // namespace phi
@@ -155,3 +161,6 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
paddle/phi/kernels/impl/activation_grad_impl.h
@@ -222,4 +222,57 @@ void EluDoubleGradKernel(const Context& dev_ctx,
   functor(dev_ctx, &x, &ddx, ddout, &dout, dx);
 }

+template <typename T, typename Context>
+void SigmoidDoubleGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out,
+                             const DenseTensor& ddx,
+                             const DenseTensor& dout,
+                             DenseTensor* dout_new,
+                             DenseTensor* ddout) {
+  if (dout_new) {
+    dout_new->Resize(out.dims());
+    dev_ctx.template Alloc<T>(dout_new);
+  }
+  if (ddout) {
+    ddout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(ddout);
+  }
+  funcs::SigmoidGradGradFunctor<T> functor;
+  functor(dev_ctx, &out, &ddx, &dout, dout_new, ddout);
+}
+
+template <typename T, typename Context>
+void SigmoidTripleGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out,
+                             const DenseTensor& ddx,
+                             const DenseTensor& dout,
+                             const DenseTensor& d_ddout,
+                             const DenseTensor& d_dout_new,
+                             DenseTensor* d_out_new,
+                             DenseTensor* d_dout,
+                             DenseTensor* d_ddx) {
+  if (d_dout) {
+    d_dout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(d_dout);
+  }
+  if (d_out_new) {
+    d_dout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(d_out_new);
+  }
+  if (d_ddx) {
+    d_dout->Resize(ddx.dims());
+    dev_ctx.template Alloc<T>(d_ddx);
+  }
+  funcs::SigmoidTripleGradFunctor<T> functor;
+  functor(dev_ctx,
+          &out,
+          &ddx,
+          &dout,
+          &d_ddout,
+          &d_dout_new,  // input
+          d_dout,
+          d_out_new,
+          d_ddx);  // output
+}
+
 }  // namespace phi
paddle/phi/ops/compat/activation_sig.cc
@@ -56,9 +56,14 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", );  // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", );               // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", );   // NOLINT
 DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", );             // NOLINT
 DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", );             // NOLINT
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", );       // NOLINT
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(HardSigmoid,
+                                 "hard_sigmoid",
+                                 "slope" comma "offset");  // NOLINT

 KernelSignature ReluDoubleGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
@@ -79,6 +84,20 @@ KernelSignature TanhTripleGradOpArgumentMapping(
                          {"D_OutNew", "D_DOut", "D_DDx"});
 }

+KernelSignature SigmoidDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "sigmoid_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"});
+}
+
+KernelSignature SigmoidTripleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("sigmoid_triple_grad",
+                         {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"},
+                         {},
+                         {"D_OutNew", "D_DOut", "D_DDx"});
+}
+
 KernelSignature LeakyReluDoubleGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
   return KernelSignature(
@@ -114,6 +133,7 @@ PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad);
 PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink);
 PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad);
 PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad);
+PD_REGISTER_BASE_KERNEL_NAME(sigmoid_grad_grad, sigmoid_double_grad);

 PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping);
@@ -152,3 +172,12 @@ PD_REGISTER_ARG_MAPPING_FN(elu, phi::EluOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(elu_grad, phi::EluGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(elu_grad_grad, phi::EluDoubleGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(silu_grad, phi::SiluGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad, phi::SigmoidGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad_grad,
+                           phi::SigmoidDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(sigmoid_triple_grad,
+                           phi::SigmoidTripleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad,
+                           phi::LogSigmoidGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad,
+                           phi::HardSigmoidGradOpArgumentMapping);
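One small detail in the HardSigmoid mapping above: its attribute list is written as "slope" comma "offset". Because the whole attribute list is itself a single macro argument to DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP, a literal ',' inside it would be treated as an argument separator, so the file routes it through a comma macro. A tiny standalone illustration of that preprocessor trick (generic C++ with made-up macro names, not the Paddle macro):

#include <iostream>
#include <string>
#include <vector>

// A literal ',' inside a macro argument would split the argument list, so it
// is smuggled in through another macro that expands to a comma.
#define comma ,

#define DEFINE_ATTR_LIST(name, attrs) \
  const std::vector<std::string> name = {attrs};

// Without `comma`, the two attribute names below would be parsed as two
// separate macro arguments and the expansion would not compile.
DEFINE_ATTR_LIST(kHardSigmoidAttrs, "slope" comma "offset")

int main() {
  for (const auto& a : kHardSigmoidAttrs) std::cout << a << "\n";  // slope, offset
}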