Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)
Unverified commit ed8a9370
Authored by YuanRisheng on Mar 17, 2022; committed via GitHub on Mar 17, 2022.
move activation sigmoid (#40626)
Parent: 9ee03302

Showing 16 changed files with 563 additions and 608 deletions (+563, -608).
paddle/fluid/eager/tests/task_tests/generated_test.cc             +1    -1
paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc     +1    -1
paddle/fluid/framework/new_executor/standalone_executor_test.cc   +2    -2
paddle/fluid/inference/tensorrt/convert/test_activation_op.cc     +1    -1
paddle/fluid/operators/activation_op.cc                           +4    -24
paddle/fluid/operators/activation_op.h                            +5    -404
paddle/fluid/operators/activation_op.kps                          +3    -139
paddle/phi/kernels/activation_grad_kernel.h                       +65   -33
paddle/phi/kernels/activation_kernel.h                            +3    -1
paddle/phi/kernels/cpu/activation_grad_kernel.cc                  +29   -0
paddle/phi/kernels/cpu/activation_kernel.cc                       +9    -0
paddle/phi/kernels/funcs/activation_functor.h                     +317  -0
paddle/phi/kernels/gpu/activation_grad_kernel.cu                  +30   -0
paddle/phi/kernels/gpu/activation_kernel.cu                       +9    -0
paddle/phi/kernels/impl/activation_grad_impl.h                    +53   -0
paddle/phi/ops/compat/activation_sig.cc                           +31   -2

In short, the commit moves the sigmoid, logsigmoid, and hard_sigmoid functors and kernels (including sigmoid's double- and triple-grad paths) out of paddle/fluid and into the phi kernel library, leaving fluid with USE_PHI_FUNCTOR aliases, phi kernel registrations, and argument-mapping glue.
paddle/fluid/eager/tests/task_tests/generated_test.cc

@@ -128,6 +128,6 @@ TEST(Generated, ElementwiseAdd) {
 }  // namespace egr

-USE_OP(sigmoid);
+USE_OP_ITSELF(sigmoid);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_ITSELF(matmul_v2);
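The only change in these test files is the switch from USE_OP(sigmoid) to USE_OP_ITSELF(sigmoid): once the sigmoid kernels live in phi rather than fluid, the tests can only anchor the operator definition itself. For readers unfamiliar with the idiom, here is a self-contained sketch of the linker-anchor trick that USE_OP*-style macros are generally built on; Registrar, TouchSigmoidOp, and USE_OP_ITSELF_DEMO are my own illustrative names, not Paddle's actual macros:

#include <cstdio>

// Library side: registering an op happens in a static object's constructor.
struct Registrar {
  explicit Registrar(const char* name) { std::printf("registered %s\n", name); }
};
static Registrar sigmoid_registrar("sigmoid");
int TouchSigmoidOp() { return 0; }  // anchor symbol the macro can reference

// User side: a USE_OP_ITSELF-style macro references the anchor, which forces
// the linker to keep the defining translation unit (and so run the static
// registration) even when the op library is linked statically.
#define USE_OP_ITSELF_DEMO(op) \
  extern int Touch##op();      \
  static int use_##op = Touch##op();

USE_OP_ITSELF_DEMO(SigmoidOp)

int main() { return use_SigmoidOp; }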
paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc

@@ -255,6 +255,6 @@ TEST(Hook_intermidiate, Matmul_v2) {
 }
 }  // namespace egr

-USE_OP(sigmoid);
+USE_OP_ITSELF(sigmoid);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_ITSELF(matmul_v2);
paddle/fluid/framework/new_executor/standalone_executor_test.cc

@@ -31,7 +31,7 @@ USE_OP(slice);
 USE_OP(concat);
 USE_OP(matmul);
 USE_OP_ITSELF(elementwise_add);
-USE_OP(sigmoid);
+USE_OP_ITSELF(sigmoid);
 USE_OP_ITSELF(tanh);
 USE_OP(elementwise_mul);
 USE_OP(softmax_with_cross_entropy);

@@ -47,7 +47,7 @@ USE_OP(square);
 USE_OP(transpose2_grad);
 USE_OP(concat_grad);
 USE_OP_ITSELF(elementwise_mul_grad);
-USE_OP(sigmoid_grad);
+USE_OP_ITSELF(sigmoid_grad);
 USE_OP_ITSELF(tanh_grad);
 USE_OP(sum);
 USE_OP(slice_grad);
paddle/fluid/inference/tensorrt/convert/test_activation_op.cc

@@ -53,6 +53,6 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); }
 }  // namespace paddle

 USE_OP_ITSELF(relu);
-USE_OP(sigmoid);
+USE_OP_ITSELF(sigmoid);
 USE_OP_ITSELF(tanh);
 USE_OP(relu6);
paddle/fluid/operators/activation_op.cc

@@ -1492,6 +1492,10 @@ REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor,
 REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor,
                        TanhShrinkGradFunctor);
 REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor);
+REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,
+                       HardSigmoidGradFunctor);
+REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor,
+                       LogSigmoidGradFunctor);

 /* ==========================   sigmoid register  =============================
  */

@@ -1526,30 +1530,6 @@ REGISTER_OPERATOR(sigmoid_triple_grad,
                       ops::SigmoidTripleGradFunctor<float>::FwdDeps()>,
                   ops::ActivationTripleGradOpInplaceInferer);

-// Register Sigmoid/GradSigmoid Kernels
-REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor,
-                               SigmoidGradFunctor);
-
-// Register DoubleGrad Kernel
-REGISTER_OP_CPU_KERNEL(
-    sigmoid_grad_grad,
-    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidGradGradFunctor<float>>,
-    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidGradGradFunctor<double>>,
-    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidGradGradFunctor<plat::float16>>);
-
-// Register TripleGrad Kernel
-REGISTER_OP_CPU_KERNEL(
-    sigmoid_triple_grad,
-    ops::SigmoidTripleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidTripleGradFunctor<float>>,
-    ops::SigmoidTripleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidTripleGradFunctor<double>>,
-    ops::SigmoidTripleGradKernel<plat::CPUDeviceContext,
-                                 ops::SigmoidTripleGradFunctor<plat::float16>>);
-
 /* ========================================================================== */

 /* ==========================    tanh register  ============================= */
paddle/fluid/operators/activation_op.h

@@ -238,15 +238,6 @@ struct BaseActivationFunctor {
   AttrPair GetAttrs() { return AttrPair(); }
 };

-// sigmoid(x) = 1 / (1 + exp(-x))
-template <typename T>
-struct SigmoidFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
-  }
-};
-
 #define USE_PHI_FUNCTOR(name)                         \
   template <typename T>                               \
   using name##Functor = phi::funcs::name##Functor<T>; \
@@ -285,160 +276,15 @@ USE_PHI_FUNCTOR(TanhShrink)
 USE_PHI_FUNCTOR(Silu)
 USE_PHI_FUNCTOR(ELU)
 USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU)
+USE_PHI_FUNCTOR(Sigmoid)
+USE_PHI_DOUBLE_GRAD_FUNCTOR(Sigmoid)
+USE_PHI_TRIPLE_GRAD_FUNCTOR(Sigmoid)
+USE_PHI_FUNCTOR(LogSigmoid)
+USE_PHI_FUNCTOR(HardSigmoid)

 template <typename T>
 using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor<T>;

-template <typename T>
-struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * out * (static_cast<T>(1) - out);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-/*
-    Out
-    DOut -> SigmoidGradGrad -> DOutNew
-    DDX                        DDOut
-
-    DDOut = (1-Out)*Out*DDX
-    DOutNew = (1-2*Out)*DOut*DDX
-*/
-template <typename T>
-struct SigmoidGradGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev,
-                  const framework::Tensor* Out,
-                  const framework::Tensor* ddX,
-                  const framework::Tensor* dOut,
-                  framework::Tensor* dOutNew,
-                  framework::Tensor* ddOut) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad"));
-    auto out = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad"));
-    if (dOutNew) {
-      auto dout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad"));
-      auto dout_new = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad"));
-      dout_new.device(*d) =
-          (static_cast<T>(1) - static_cast<T>(2) * out) * dout * ddx;
-    }
-    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad"));
-      ddout.device(*d) = (static_cast<T>(1) - out) * out * ddx;
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-/*
-    Out
-    DOut                            D_Dout
-    DDx     -> SigmoidTripleGrad -> D_DDx
-    D_DDout                         d_OutNew
-    D_Dout_new
-
-    D_Dout = (1-2*Out)*DDx*D_Dout_new
-    D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new
-    D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new
-
-    Out, DDX, DOut, D_DDOut, D_DOut_New   // input
-    D_OutNew, D_DOut, D_DDx               // output
-*/
-template <typename T>
-struct SigmoidTripleGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev,
-                  const framework::Tensor* Out,
-                  const framework::Tensor* ddX,
-                  const framework::Tensor* dOut,
-                  const framework::Tensor* d_DDOut,
-                  const framework::Tensor* d_dOut_New,
-                  framework::Tensor* d_d_Out,
-                  framework::Tensor* d_Out_New,
-                  framework::Tensor* d_DDx) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad"));
-    auto out = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad"));
-    auto dout = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad"));
-    auto d_ddOut = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad"));
-    auto d_dOutNew = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad"));
-
-    if (d_Out_New) {
-      auto d_OutNew = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad"));
-      d_OutNew.device(*d) = (ddx - static_cast<T>(2) * out * ddx) * d_ddOut -
-                            static_cast<T>(2) * dout * ddx * d_dOutNew;
-    }
-    if (d_d_Out) {
-      auto d_dOut = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad"));
-      d_dOut.device(*d) =
-          (static_cast<T>(1) - static_cast<T>(2) * out) * ddx * d_dOutNew;
-    }
-    if (d_DDx) {
-      auto d_ddx = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad"));
-      d_ddx.device(*d) =
-          (static_cast<T>(1) - out) * out * d_ddOut +
-          (static_cast<T>(1) - static_cast<T>(2) * out) * dout * d_dOutNew;
-    }
-  }
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-// Originally: logsigmoid(x) = -log (1 + exp(-x))
-// For numerical stability, we can use the log-sum-exp trick:
-// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
-// We can rewrite the above equation as:
-// out = -log( exp(0) + exp(-x)) [since exp(0) = 1]
-//     = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0)))
-//     = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) + exp(max(-x, 0)) * exp(-x -
-//             max(-x, 0)))
-//     = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
-//     = -(max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
-//
-// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0))
-//                        + exp(-x - max(-x, 0))))
-template <typename T>
-struct LogSigmoidFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
-    out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log());
-  }
-};
-
-// Originally: f' = exp(-x) / (1 + exp(-x))
-// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) +
-// exp(-x - max(-x, 0)))
-template <typename T>
-struct LogSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
-    dx.device(d) =
-        dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp()));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 // exp(x) = e^x
 template <typename T>
 struct ExpFunctor : public BaseActivationFunctor<T> {
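A quick cross-check of the formulas quoted in the SigmoidGradFunctor and SigmoidGradGradFunctor comments above (my derivation, written in the diff's variable names): with $y = \sigma(x)$ we have $y' = y(1-y)$, so the grad op computes $DX = DOut \cdot Out\,(1-Out)$. Differentiating this map with respect to its two inputs and contracting with the incoming gradient $DDX$ yields exactly the two double-grad outputs:

$$DDOut = \frac{\partial\,DX}{\partial\,DOut}\,DDX = (1-Out)\,Out\,DDX, \qquad DOutNew = \frac{\partial\,DX}{\partial\,Out}\,DDX = (1-2\,Out)\,DOut\,DDX.$$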
@@ -1101,43 +947,6 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };

-template <typename T>
-struct HardSigmoidFunctor : public BaseActivationFunctor<T> {
-  float slope;
-  float offset;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    auto temp = x * static_cast<T>(slope) + static_cast<T>(offset);
-    out.device(d) =
-        temp.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(1));
-  }
-};
-
-template <typename T>
-struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  float slope;
-  float offset;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout *
-                   ((out > static_cast<T>(0)) * (out < static_cast<T>(1)))
-                       .template cast<T>() *
-                   static_cast<T>(slope);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
 template <typename T>
 struct SwishFunctor : public BaseActivationFunctor<T> {
   float beta;
@@ -1365,211 +1174,6 @@ inline void ExtractDoubleGradTensorWithInputDOut(
   }
 }

-template <typename DeviceContext, typename Functor>
-class SigmoidDoubleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut;
-    framework::Tensor *dOutNew, *ddOut;
-    Out = ddX = dOut = nullptr;
-    dOutNew = ddOut = nullptr;
-    // extract ddx(input) and out(input)
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-    PADDLE_ENFORCE_NOT_NULL(
-        ddX, platform::errors::NotFound(
-                 "Cannot get input Variable ddX, variable name = %s",
-                 ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        Out, platform::errors::NotFound(
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.InputName("Out")));
-    // set output ddout
-    ddOut = ctx.Output<framework::Tensor>("DDOut");
-    // extract dOut(input)
-    dOut = ctx.Input<framework::Tensor>("DOut");
-    PADDLE_ENFORCE_NOT_NULL(
-        dOut, platform::errors::NotFound(
-                  "Cannot get input Variable dOut, variable name = %s",
-                  ctx.InputName("DOut")));
-    dOutNew = ctx.Output<framework::Tensor>("DOutNew");
-    if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, dOutNew, ddOut);
-  }
-};
-
-// Out, DDX, DOut, D_DDOut, D_DOut_New   // input
-// D_OutNew, D_DOut, D_DDx               // output
-template <typename DeviceContext, typename Functor>
-class SigmoidTripleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew;
-    framework::Tensor *d_OutNew, *d_dOut, *d_ddx;
-    Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr;
-    d_OutNew = d_dOut = d_ddx = nullptr;
-    // extract ddx(input), out(input), dOut(input), d_ddOut(input),
-    // d_dOutNew(input)
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-    dOut = ctx.Input<framework::Tensor>("DOut");
-    d_ddOut = ctx.Input<framework::Tensor>("D_DDOut");
-    d_dOutNew = ctx.Input<framework::Tensor>("D_DOut_New");
-    PADDLE_ENFORCE_NOT_NULL(
-        ddX, platform::errors::NotFound(
-                 "Cannot get input Variable ddX, variable name = %s",
-                 ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        Out, platform::errors::NotFound(
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.InputName("Out")));
-    PADDLE_ENFORCE_NOT_NULL(
-        dOut, platform::errors::NotFound(
-                  "Cannot get input Variable dOut, variable name = %s",
-                  ctx.InputName("DOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_ddOut, platform::errors::NotFound(
-                     "Cannot get input Variable d_ddOut, variable name = %s",
-                     ctx.InputName("D_DDOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_dOutNew,
-        platform::errors::NotFound(
-            "Cannot get input Variable d_dOutNew, variable name = %s",
-            ctx.InputName("D_DOutNew")));
-    // set output d_OutNew, d_dOut, d_ddx
-    d_dOut = ctx.Output<framework::Tensor>("D_DOut");
-    d_OutNew = ctx.Output<framework::Tensor>("D_OutNew");
-    d_ddx = ctx.Output<framework::Tensor>("D_DDx");
-    if (d_dOut) d_dOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_OutNew) d_OutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_ddx) d_ddx->mutable_data<T>(ddX->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew,  // input
-            d_dOut, d_OutNew, d_ddx);                   // output
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class TanhDoubleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut;
-    framework::Tensor *dOutNew, *ddOut;
-    Out = ddX = dOut = nullptr;
-    dOutNew = ddOut = nullptr;
-    // extract ddx(input) and out(input)
-    auto ddx_var = ctx.InputVar("DDX");
-    auto out_var = ctx.InputVar("Out");
-    PADDLE_ENFORCE_NOT_NULL(
-        ddx_var, platform::errors::NotFound(
-                     "Cannot get input Variable ddx, variable name = %s",
-                     ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        out_var, platform::errors::NotFound(
-                     "Cannot get input Variable out, variable name = %s",
-                     ctx.InputName("Out")));
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-    // set output ddout
-    auto ddout_var = ctx.OutputVar("DDOut");
-    if (ddout_var) {
-      ddOut = ctx.Output<framework::Tensor>("DDOut");
-    }
-    // extract dOut(input)
-    auto dout_var = ctx.InputVar("DOut");
-    PADDLE_ENFORCE_NOT_NULL(
-        dout_var, platform::errors::NotFound(
-                      "Cannot get input Variable dout_var, variable name = %s",
-                      ctx.InputName("DOut")));
-    dOut = ctx.Input<framework::Tensor>("DOut");
-    // set output dout_new
-    auto dout_new_var = ctx.OutputVar("DOutNew");
-    if (dout_new_var) {
-      dOutNew = ctx.Output<framework::Tensor>("DOutNew");
-    }
-    if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, dOutNew, ddOut);
-  }
-};
-
-template <typename DeviceContext, typename Functor>
-class TanhTripeGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew;
-    framework::Tensor *d_OutNew, *d_dOut, *d_ddx;
-    Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr;
-    d_OutNew = d_dOut = d_ddx = nullptr;
-    // extract ddx(input), out(input), dOut(input), d_ddOut(input),
-    // d_dOutNew(input)
-    ddX = ctx.Input<framework::Tensor>("DDX");
-    Out = ctx.Input<framework::Tensor>("Out");
-    dOut = ctx.Input<framework::Tensor>("DOut");
-    d_ddOut = ctx.Input<framework::Tensor>("D_DDOut");
-    d_dOutNew = ctx.Input<framework::Tensor>("D_DOut_New");
-    PADDLE_ENFORCE_NOT_NULL(
-        ddX, platform::errors::NotFound(
-                 "Cannot get input Variable ddX, variable name = %s",
-                 ctx.InputName("DDX")));
-    PADDLE_ENFORCE_NOT_NULL(
-        Out, platform::errors::NotFound(
-                 "Cannot get input Variable Out, variable name = %s",
-                 ctx.InputName("Out")));
-    PADDLE_ENFORCE_NOT_NULL(
-        dOut, platform::errors::NotFound(
-                  "Cannot get input Variable dOut, variable name = %s",
-                  ctx.InputName("DOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_ddOut, platform::errors::NotFound(
-                     "Cannot get input Variable d_ddOut, variable name = %s",
-                     ctx.InputName("D_DDOut")));
-    PADDLE_ENFORCE_NOT_NULL(
-        d_dOutNew,
-        platform::errors::NotFound(
-            "Cannot get input Variable d_dOutNew, variable name = %s",
-            ctx.InputName("D_DOutNew")));
-    // set output d_OutNew, d_dOut, d_ddx
-    d_dOut = ctx.Output<framework::Tensor>("D_DOut");
-    d_OutNew = ctx.Output<framework::Tensor>("D_OutNew");
-    d_ddx = ctx.Output<framework::Tensor>("D_DDx");
-    if (d_dOut) d_dOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_OutNew) d_OutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
-    if (d_ddx) d_ddx->mutable_data<T>(ddX->dims(), ctx.GetPlace());
-    auto& place = ctx.template device_context<DeviceContext>();
-    Functor functor;
-    functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew,  // input
-            d_dOut, d_OutNew, d_ddx);                   // output
-  }
-};
-
 template <typename DeviceContext, typename Functor>
 class SquareDoubleGradKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -1952,7 +1556,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
 }  // namespace paddle

 #define FOR_EACH_ACTIVATION_OP(__macro)                                      \
-  __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
   __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor);                         \
   __macro(floor, Floor, FloorFunctor, ZeroGradFunctor);                      \
   __macro(round, Round, RoundFunctor, ZeroGradFunctor);                      \

@@ -1965,8 +1568,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
   __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor);         \
   __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor);         \
   __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor);                     \
-  __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,                     \
-          HardSigmoidGradFunctor);                                           \
   __macro(swish, Swish, SwishFunctor, SwishGradFunctor);                     \
   __macro(mish, Mish, MishFunctor, MishGradFunctor);                         \
   __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor);
paddle/fluid/operators/activation_op.kps
@@ -20,69 +20,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename T>
-struct CudaSigmoidFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-
-  // sigmoid(x) = 1 / (1 + exp(-x))
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(one / (one + exp(-x)));
-  }
-};
-
-template <typename T>
-struct CudaSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  T one = static_cast<T>(1.0f);
-
-  // dx = dout * out * (1 - out)
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return dout * out * (one - out);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
-template <typename T>
-struct CudaLogSigmoidFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType zero = static_cast<MPType>(0.0f);
-
-  // logsigmoid(x) = log(1 / (1 + exp(-x)))
-  // For numerical stability,
-  // logsigmoid(x) =
-  //          - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    MPType temp = x > zero ? zero : -x;
-    return static_cast<T>(-temp - log(exp(-temp) + exp(-x - temp)));
-  }
-};
-
-template <typename T>
-struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType zero = static_cast<MPType>(0.0f);
-
-  // dx = dout * exp(-x) / (1 + exp(-x))
-  // For numerical stability:
-  // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x,
-  // 0)))
-  __device__ __forceinline__ T operator()(const T arg_dout,
-                                          const T arg_x) const {
-    MPType dout = static_cast<MPType>(arg_dout);
-    MPType x = static_cast<MPType>(arg_x);
-    MPType temp1 = x > zero ? zero : -x;
-    MPType temp2 = exp(-x - temp1);
-    return static_cast<T>(dout * (temp2 / (exp(-temp1) + temp2)));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CudaCeilFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
...
...
@@ -551,49 +488,6 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
   }
 };

-template <typename T>
-struct CudaHardSigmoidFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  T one = static_cast<T>(1.0f);
-  float slope;
-  float offset;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-
-  // hard_sigmoid(x) = 0, when x <= -3
-  //                   1, when x >= 3
-  //                   x * slope + offset, otherwise
-  __device__ __forceinline__ T operator()(const T x) const {
-    T temp = x * static_cast<T>(slope) + static_cast<T>(offset);
-    T temp_max = temp > zero ? temp : zero;
-    T temp_min = temp_max < one ? temp_max : one;
-    return temp_min;
-  }
-};
-
-template <typename T>
-struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor<T> {
-  T zero = static_cast<T>(0.0f);
-  T one = static_cast<T>(1.0f);
-  float slope;
-  float offset;
-
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"slope", &slope}, {"offset", &offset}};
-  }
-
-  // dx = (out > 0 && out < 1) ? dout * slope : 0
-  __device__ __forceinline__ T operator()(const T dout, const T out) const {
-    return (out > zero && out < one) ? dout * static_cast<T>(slope) : zero;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() {
-    return ActBwdOpFwdDeps::kDepOut;
-  }
-};
-
 template <typename T>
 struct CudaSwishFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
...
...
@@ -876,6 +770,9 @@ USE_PHI_FUNCTOR(CudaSoftShrink)
 USE_PHI_FUNCTOR(CudaTanhShrink)
 USE_PHI_FUNCTOR(CudaSilu)
 USE_PHI_FUNCTOR(CudaELU)
+USE_PHI_FUNCTOR(CudaSigmoid)
+USE_PHI_FUNCTOR(CudaLogSigmoid)
+USE_PHI_FUNCTOR(CudaHardSigmoid)

 template <typename T>
 using CudaELUGradNegativeAlphaFunctor =
     phi::funcs::CudaELUGradNegativeAlphaFunctor<T>;
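Per the USE_PHI_FUNCTOR definition shown earlier in activation_op.h, each added line simply re-points the fluid functor name at its phi implementation. Expanded by hand for the first one (only the alias visible in the macro's first lines is shown; the macro may alias the grad functor the same way):

template <typename T>
using CudaSigmoidFunctor = phi::funcs::CudaSigmoidFunctor<T>;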
@@ -954,35 +851,6 @@ REGISTER_OP_CUDA_KERNEL(
                          ops::CELUGradGradFunctor<plat::float16>>);
 /* ========================================================================== */

-/* ===========================   sigmoid register  ============================
- */
-REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
-                                CudaSigmoidGradFunctor);
-
-REGISTER_OP_CUDA_KERNEL(
-    sigmoid_grad_grad,
-    ops::SigmoidDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                 ops::SigmoidGradGradFunctor<float>>,
-    ops::SigmoidDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                                 ops::SigmoidGradGradFunctor<double>>,
-    ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
-                                 ops::SigmoidGradGradFunctor<plat::float16>>,
-    ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
-                                 ops::SigmoidGradGradFunctor<plat::bfloat16>>);
-
-REGISTER_OP_CUDA_KERNEL(
-    sigmoid_triple_grad,
-    ops::SigmoidTripleGradKernel<paddle::platform::CUDADeviceContext,
-                                 ops::SigmoidTripleGradFunctor<float>>,
-    ops::SigmoidTripleGradKernel<paddle::platform::CUDADeviceContext,
-                                 ops::SigmoidTripleGradFunctor<double>>,
-    ops::SigmoidTripleGradKernel<plat::CUDADeviceContext,
-                                 ops::SigmoidTripleGradFunctor<plat::float16>>,
-    ops::SigmoidTripleGradKernel<
-        plat::CUDADeviceContext,
-        ops::SigmoidTripleGradFunctor<plat::bfloat16>>);
-/* ========================================================================== */

 /* ===========================   sqrt register  ============================= */
 REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor,
                                 CudaSqrtGradFunctor);
...
...
@@ -1120,8 +988,6 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */

 #define FOR_EACH_ACTIVATION_CUDA_OP(__macro)                                  \
-  __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor,                      \
-          CudaLogSigmoidGradFunctor);                                         \
   __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor,                      \
           CudaSoftShrinkGradFunctor);                                         \
   __macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor);                  \
...
...
@@ -1141,8 +1007,6 @@ REGISTER_OP_CUDA_KERNEL(
           CudaTanhShrinkGradFunctor);                                         \
   __macro(hard_shrink, HardShrink, CudaHardShrinkFunctor,                     \
           CudaHardShrinkGradFunctor);                                         \
-  __macro(hard_sigmoid, HardSigmoid, CudaHardSigmoidFunctor,                  \
-          CudaHardSigmoidGradFunctor);                                        \
   __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor);              \
   __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor);                  \
   __macro(hard_swish, HardSwish, CudaHardSwishFunctor,                        \
...
...
paddle/phi/kernels/activation_grad_kernel.h
@@ -19,14 +19,14 @@ limitations under the License. */

 namespace phi {

-#define DECLARE_ACTIVATION_GRAD_KERNEL_DepX(name)  \
+#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(name)  \
   template <typename T, typename Context>          \
   void name##GradKernel(const Context& dev_ctx,    \
                         const DenseTensor& x,      \
                         const DenseTensor& dout,   \
                         DenseTensor* dx);

-#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(name, attr) \
+#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(name, attr) \
   template <typename T, typename Context>                       \
   void name##GradKernel(const Context& dev_ctx,                 \
                         const DenseTensor& x,                   \
...
...
@@ -34,7 +34,7 @@ namespace phi {
                         float attr,                                     \
                         DenseTensor* dx);

-#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(name, attr1, attr2) \
+#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(name, attr1, attr2) \
   template <typename T, typename Context>                               \
   void name##GradKernel(const Context& dev_ctx,                         \
                         const DenseTensor& x,                           \
...
...
@@ -43,19 +43,28 @@ namespace phi {
                         float attr2,                                      \
                         DenseTensor* dx);

-#define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name)  \
+#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(name)  \
   template <typename T, typename Context>            \
   void name##GradKernel(const Context& dev_ctx,      \
                         const DenseTensor& out,      \
                         const DenseTensor& dout,     \
                         DenseTensor* dx);

-#define DECLARE_ACTIVATION_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut(name, attr) \
-  template <typename T, typename Context>                                \
-  void name##GradKernel(const Context& dev_ctx,                          \
-                        const DenseTensor& out,                          \
-                        const DenseTensor& dout,                         \
-                        float attr,                                      \
+#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(name, attr)  \
+  template <typename T, typename Context>                          \
+  void name##GradKernel(const Context& dev_ctx,                    \
+                        const DenseTensor& out,                    \
+                        const DenseTensor& dout,                   \
+                        float attr,                                \
                         DenseTensor* dx);

+#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(name, attr1, attr2) \
+  template <typename T, typename Context>                                 \
+  void name##GradKernel(const Context& dev_ctx,                           \
+                        const DenseTensor& out,                           \
+                        const DenseTensor& dout,                          \
+                        float attr1,                                      \
+                        float attr2,                                      \
+                        DenseTensor* dx);
+
 template <typename T, typename Context>
...
...
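To make the renamed declaration macros concrete, DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cos) from the list below expands, by straight substitution into the macro body above, to:

template <typename T, typename Context>
void CosGradKernel(const Context& dev_ctx,
                   const DenseTensor& x,
                   const DenseTensor& dout,
                   DenseTensor* dx);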
@@ -107,28 +116,51 @@ void EluDoubleGradKernel(const Context& dev_ctx,
                          DenseTensor* dx,
                          DenseTensor* ddout);

-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sin);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asin);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atan);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sinh);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(TanhShrink);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Silu);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu);
-DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh);
-DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, alpha)
-DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, threshold)
-DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(SoftShrink, lambda)
-DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(HardShrink, threshold)
-DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, t_min, t_max)

+template <typename T, typename Context>
+void SigmoidDoubleGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out,
+                             const DenseTensor& ddx,
+                             const DenseTensor& dout,
+                             DenseTensor* dout_new,
+                             DenseTensor* ddout);
+
+template <typename T, typename Context>
+void SigmoidTripleGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out,
+                             const DenseTensor& ddx,
+                             const DenseTensor& dout,
+                             const DenseTensor& d_ddout,
+                             const DenseTensor& d_dout_new,
+                             DenseTensor* d_out_new,
+                             DenseTensor* d_dout,
+                             DenseTensor* d_ddx);
+
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cos);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Tan);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acos);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Sin);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Asin);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atan);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Sinh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cosh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Asinh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acosh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atanh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Silu);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid);
+DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, alpha);
+DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, threshold);
+DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda);
+DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold);
+DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max);
+DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, slope, offset);

 }  // namespace phi
paddle/phi/kernels/activation_kernel.h
...
...
@@ -54,6 +54,8 @@ DECLARE_ACTIVATION_KERNEL(Relu)
 DECLARE_ACTIVATION_KERNEL(Tanh)
 DECLARE_ACTIVATION_KERNEL(TanhShrink)
 DECLARE_ACTIVATION_KERNEL(Silu)
+DECLARE_ACTIVATION_KERNEL(Sigmoid)
+DECLARE_ACTIVATION_KERNEL(LogSigmoid)

 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold)
...
...
@@ -62,5 +64,5 @@ DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha)
 DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max)
+DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset)

 }  // namespace phi
paddle/phi/kernels/cpu/activation_grad_kernel.cc
...
...
@@ -90,6 +90,23 @@ namespace phi {
       dev_ctx, nullptr, &out, &dout, dx, functor);                        \
   }

+#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(                 \
+    name, functor_class, attr1, attr2)                                    \
+  template <typename T, typename Context>                                 \
+  void name##GradKernel(const Context& dev_ctx,                           \
+                        const DenseTensor& out,                           \
+                        const DenseTensor& dout,                          \
+                        float attr1,                                      \
+                        float attr2,                                      \
+                        DenseTensor* dx) {                                \
+    funcs::functor_class<T> functor;                                      \
+    auto attrs = functor.GetAttrs();                                      \
+    *(attrs[0].second) = attr1;                                           \
+    *(attrs[1].second) = attr2;                                           \
+    ActivationGradImpl<T, Context, funcs::functor_class<T>>(              \
+        dev_ctx, nullptr, &out, &dout, dx, functor);                      \
+  }
+
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CosGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, TanGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, AcosGradFunctor);

@@ -103,9 +120,11 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, SigmoidGradFunctor);
 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
                                                LeakyReluGradFunctor,
                                                alpha);

@@ -125,6 +144,11 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
                                                t_min,
                                                t_max);
+DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid,
+                                                 HardSigmoidGradFunctor,
+                                                 slope,
+                                                 offset);

 template <typename T, typename Context>
 void EluGradKernel(const Context& dev_ctx,
                    const DenseTensor& x,

@@ -204,3 +228,8 @@ PD_REGISTER_KERNEL(tanh_triple_grad,
                    float,
                    double,
                    phi::dtype::float16) {}
+
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel)
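Substituting into the new DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT macro, the HardSigmoid line above defines (my hand expansion of the macro as written):

template <typename T, typename Context>
void HardSigmoidGradKernel(const Context& dev_ctx,
                           const DenseTensor& out,
                           const DenseTensor& dout,
                           float slope,
                           float offset,
                           DenseTensor* dx) {
  funcs::HardSigmoidGradFunctor<T> functor;
  auto attrs = functor.GetAttrs();
  *(attrs[0].second) = slope;   // wire the op attributes into the functor
  *(attrs[1].second) = offset;
  ActivationGradImpl<T, Context, funcs::HardSigmoidGradFunctor<T>>(
      dev_ctx, nullptr, &out, &dout, dx, functor);
}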
paddle/phi/kernels/cpu/activation_kernel.cc
...
...
@@ -72,6 +72,8 @@ DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor)

 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
                                      ThresholdedReluFunctor,
                                      threshold)

@@ -82,6 +84,10 @@ DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha)
 DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max)
+DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
+                                     HardSigmoidFunctor,
+                                     slope,
+                                     offset)

 }  // namespace phi

 PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {}

@@ -109,3 +115,6 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
paddle/phi/kernels/funcs/activation_functor.h
...
...
@@ -1012,6 +1012,217 @@ struct SiluGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };

+// sigmoid(x) = 1 / (1 + exp(-x))
+template <typename T>
+struct SigmoidFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
+  }
+};
+
+template <typename T>
+struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * out * (static_cast<T>(1) - out);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+/*
+    Out
+    DOut -> SigmoidGradGrad -> DOutNew
+    DDX                        DDOut
+
+    DDOut = (1-Out)*Out*DDX
+    DOutNew = (1-2*Out)*DOut*DDX
+*/
+template <typename T>
+struct SigmoidGradGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev,
+                  const DenseTensor* Out,
+                  const DenseTensor* ddX,
+                  const DenseTensor* dOut,
+                  DenseTensor* dOutNew,
+                  DenseTensor* ddOut) const {
+    auto* d = dev.eigen_device();
+    auto ddx = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad"));
+    auto out = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad"));
+    if (dOutNew) {
+      auto dout = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad"));
+      auto dout_new = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad"));
+      dout_new.device(*d) =
+          (static_cast<T>(1) - static_cast<T>(2) * out) * dout * ddx;
+    }
+    if (ddOut) {
+      auto ddout = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad"));
+      ddout.device(*d) = (static_cast<T>(1) - out) * out * ddx;
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+/*
+    Out
+    DOut                            D_Dout
+    DDx     -> SigmoidTripleGrad -> D_DDx
+    D_DDout                         d_OutNew
+    D_Dout_new
+
+    D_Dout = (1-2*Out)*DDx*D_Dout_new
+    D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new
+    D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new
+
+    Out, DDX, DOut, D_DDOut, D_DOut_New   // input
+    D_OutNew, D_DOut, D_DDx               // output
+*/
+template <typename T>
+struct SigmoidTripleGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev,
+                  const DenseTensor* Out,
+                  const DenseTensor* ddX,
+                  const DenseTensor* dOut,
+                  const DenseTensor* d_DDOut,
+                  const DenseTensor* d_dOut_New,
+                  DenseTensor* d_d_Out,
+                  DenseTensor* d_Out_New,
+                  DenseTensor* d_DDx) const {
+    auto* d = dev.eigen_device();
+    auto ddx = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad"));
+    auto out = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad"));
+    auto dout = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad"));
+    auto d_ddOut = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad"));
+    auto d_dOutNew = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad"));
+
+    if (d_Out_New) {
+      auto d_OutNew = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad"));
+      d_OutNew.device(*d) = (ddx - static_cast<T>(2) * out * ddx) * d_ddOut -
+                            static_cast<T>(2) * dout * ddx * d_dOutNew;
+    }
+    if (d_d_Out) {
+      auto d_dOut = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad"));
+      d_dOut.device(*d) =
+          (static_cast<T>(1) - static_cast<T>(2) * out) * ddx * d_dOutNew;
+    }
+    if (d_DDx) {
+      auto d_ddx = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad"));
+      d_ddx.device(*d) =
+          (static_cast<T>(1) - out) * out * d_ddOut +
+          (static_cast<T>(1) - static_cast<T>(2) * out) * dout * d_dOutNew;
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+// Originally: logsigmoid(x) = -log (1 + exp(-x))
+// For numerical stability, we can use the log-sum-exp trick:
+// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
+// We can rewrite the above equation as:
+// out = -log( exp(0) + exp(-x)) [since exp(0) = 1]
+//     = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0)))
+//     = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) + exp(max(-x, 0)) * exp(-x -
+//             max(-x, 0)))
+//     = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
+//     = -(max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
+//
+// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0))
+//                        + exp(-x - max(-x, 0))))
+template <typename T>
+struct LogSigmoidFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
+    out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log());
+  }
+};
+
+// Originally: f' = exp(-x) / (1 + exp(-x))
+// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) +
+// exp(-x - max(-x, 0)))
+template <typename T>
+struct LogSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
+    dx.device(d) =
+        dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp()));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct HardSigmoidFunctor : public BaseActivationFunctor<T> {
+  float slope;
+  float offset;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto temp = x * static_cast<T>(slope) + static_cast<T>(offset);
+    out.device(d) =
+        temp.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(1));
+  }
+};
+
+template <typename T>
+struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  float slope;
+  float offset;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout *
+                   ((out > static_cast<T>(0)) * (out < static_cast<T>(1)))
+                       .template cast<T>() *
+                   static_cast<T>(slope);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
 #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
 template <typename T>
 struct CudaReluFunctor : public BaseActivationFunctor<T> {
...
...
@@ -1653,6 +1864,112 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };

+template <typename T>
+struct CudaSigmoidFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+
+  // sigmoid(x) = 1 / (1 + exp(-x))
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    return static_cast<T>(one / (one + exp(-x)));
+  }
+};
+
+template <typename T>
+struct CudaSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  T one = static_cast<T>(1.0f);
+
+  // dx = dout * out * (1 - out)
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    return dout * out * (one - out);
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
+template <typename T>
+struct CudaLogSigmoidFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType zero = static_cast<MPType>(0.0f);
+
+  // logsigmoid(x) = log(1 / (1 + exp(-x)))
+  // For numerical stability,
+  // logsigmoid(x) =
+  //          - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    MPType x = static_cast<MPType>(arg_x);
+    MPType temp = x > zero ? zero : -x;
+    return static_cast<T>(-temp - log(exp(-temp) + exp(-x - temp)));
+  }
+};
+
+template <typename T>
+struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType zero = static_cast<MPType>(0.0f);
+
+  // dx = dout * exp(-x) / (1 + exp(-x))
+  // For numerical stability:
+  // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x,
+  // 0)))
+  __device__ __forceinline__ T operator()(const T arg_dout,
+                                          const T arg_x) const {
+    MPType dout = static_cast<MPType>(arg_dout);
+    MPType x = static_cast<MPType>(arg_x);
+    MPType temp1 = x > zero ? zero : -x;
+    MPType temp2 = exp(-x - temp1);
+    return static_cast<T>(dout * (temp2 / (exp(-temp1) + temp2)));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaHardSigmoidFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  T one = static_cast<T>(1.0f);
+  float slope;
+  float offset;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+
+  // hard_sigmoid(x) = 0, when x <= -3
+  //                   1, when x >= 3
+  //                   x * slope + offset, otherwise
+  __device__ __forceinline__ T operator()(const T x) const {
+    T temp = x * static_cast<T>(slope) + static_cast<T>(offset);
+    T temp_max = temp > zero ? temp : zero;
+    T temp_min = temp_max < one ? temp_max : one;
+    return temp_min;
+  }
+};
+
+template <typename T>
+struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  T zero = static_cast<T>(0.0f);
+  T one = static_cast<T>(1.0f);
+  float slope;
+  float offset;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+
+  // dx = (out > 0 && out < 1) ? dout * slope : 0
+  __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    return (out > zero && out < one) ? dout * static_cast<T>(slope) : zero;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() {
+    return ActBwdOpFwdDeps::kDepOut;
+  }
+};
+
 #endif

 }  // namespace funcs
...
...
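As with the double grad, the SigmoidTripleGradFunctor formulas can be checked by differentiating the double-grad map (my derivation, in the diff's naming). The double grad computes $DOutNew = (1-2\,Out)\,DOut\,DDX$ and $DDOut = (1-Out)\,Out\,DDX$; contracting its Jacobian with the incoming gradients $D\_DOut\_New$ and $D\_DDOut$ gives

$$\begin{aligned} D\_DOut &= (1-2\,Out)\,DDX \cdot D\_DOut\_New,\\ D\_DDX &= (1-Out)\,Out \cdot D\_DDOut + (1-2\,Out)\,DOut \cdot D\_DOut\_New,\\ D\_OutNew &= (DDX - 2\,Out\,DDX) \cdot D\_DDOut - 2\,DOut\,DDX \cdot D\_DOut\_New, \end{aligned}$$

which matches the comment block in the functor above.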
paddle/phi/kernels/gpu/activation_grad_kernel.cu
...
...
@@ -142,8 +142,27 @@ void ActivationGradGPUImpl(const Context& dev_ctx,
       dev_ctx, nullptr, &out, &dout, dx, functor);                       \
   }

+#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(                \
+    name, functor_class, attr1, attr2)                                   \
+  template <typename T, typename Context>                                \
+  void name##GradKernel(const Context& dev_ctx,                          \
+                        const DenseTensor& out,                          \
+                        const DenseTensor& dout,                         \
+                        float attr1,                                     \
+                        float attr2,                                     \
+                        DenseTensor* dx) {                               \
+    funcs::functor_class<T> functor;                                     \
+    auto attrs = functor.GetAttrs();                                     \
+    *(attrs[0].second) = attr1;                                          \
+    *(attrs[1].second) = attr2;                                          \
+    ActivationGradGPUImpl<T, Context, funcs::functor_class<T>>(          \
+        dev_ctx, nullptr, &out, &dout, dx, functor);                     \
+  }
+
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, CudaReluGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, CudaTanhGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, CudaSigmoidGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CudaCosGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, CudaTanGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, CudaAcosGradFunctor);

@@ -157,6 +176,7 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, CudaAcoshGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor);
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
                                                CudaLeakyReluGradFunctor,
                                                alpha);

@@ -176,6 +196,11 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu,
                                                t_min,
                                                t_max);
+DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid,
+                                                 CudaHardSigmoidGradFunctor,
+                                                 slope,
+                                                 offset);

 template <typename T, typename Context>
 void EluGradKernel(const Context& dev_ctx,
                    const DenseTensor& x,

@@ -270,3 +295,8 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel)
paddle/phi/kernels/gpu/activation_kernel.cu
...
...
@@ -91,6 +91,8 @@ DEFINE_GPU_ACTIVATION_KERNEL(Relu, CudaReluFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Tanh, CudaTanhFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor)

 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
                                      CudaThresholdedReluFunctor,
                                      threshold)

@@ -103,6 +105,10 @@ DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha)
 DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max)
+DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid,
+                                     CudaHardSigmoidFunctor,
+                                     slope,
+                                     offset)

 }  // namespace phi

@@ -155,3 +161,6 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
+PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
paddle/phi/kernels/impl/activation_grad_impl.h
...
...
@@ -222,4 +222,57 @@ void EluDoubleGradKernel(const Context& dev_ctx,
   functor(dev_ctx, &x, &ddx, ddout, &dout, dx);
 }

+template <typename T, typename Context>
+void SigmoidDoubleGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out,
+                             const DenseTensor& ddx,
+                             const DenseTensor& dout,
+                             DenseTensor* dout_new,
+                             DenseTensor* ddout) {
+  if (dout_new) {
+    dout_new->Resize(out.dims());
+    dev_ctx.template Alloc<T>(dout_new);
+  }
+  if (ddout) {
+    ddout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(ddout);
+  }
+  funcs::SigmoidGradGradFunctor<T> functor;
+  functor(dev_ctx, &out, &ddx, &dout, dout_new, ddout);
+}
+
+template <typename T, typename Context>
+void SigmoidTripleGradKernel(const Context& dev_ctx,
+                             const DenseTensor& out,
+                             const DenseTensor& ddx,
+                             const DenseTensor& dout,
+                             const DenseTensor& d_ddout,
+                             const DenseTensor& d_dout_new,
+                             DenseTensor* d_out_new,
+                             DenseTensor* d_dout,
+                             DenseTensor* d_ddx) {
+  if (d_dout) {
+    d_dout->Resize(out.dims());
+    dev_ctx.template Alloc<T>(d_dout);
+  }
+  if (d_out_new) {
+    d_out_new->Resize(out.dims());
+    dev_ctx.template Alloc<T>(d_out_new);
+  }
+  if (d_ddx) {
+    d_ddx->Resize(ddx.dims());
+    dev_ctx.template Alloc<T>(d_ddx);
+  }
+  funcs::SigmoidTripleGradFunctor<T> functor;
+  functor(dev_ctx,
+          &out,
+          &ddx,
+          &dout,
+          &d_ddout,
+          &d_dout_new,  // input
+          d_dout,
+          d_out_new,
+          d_ddx);  // output
+}
+
 }  // namespace phi
paddle/phi/ops/compat/activation_sig.cc
...
...
@@ -56,9 +56,14 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", );    // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", );                 // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", );     // NOLINT
 DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", );               // NOLINT
 DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", );               // NOLINT
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", );         // NOLINT
+DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(HardSigmoid,
+                                 "hard_sigmoid",
+                                 "slope" comma "offset");       // NOLINT

 KernelSignature ReluDoubleGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {

@@ -79,6 +84,20 @@ KernelSignature TanhTripleGradOpArgumentMapping(
                          {"D_OutNew", "D_DOut", "D_DDx"});
 }

+KernelSignature SigmoidDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "sigmoid_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"});
+}
+
+KernelSignature SigmoidTripleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("sigmoid_triple_grad",
+                         {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"},
+                         {},
+                         {"D_OutNew", "D_DOut", "D_DDx"});
+}
+
 KernelSignature LeakyReluDoubleGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
   return KernelSignature(

@@ -114,6 +133,7 @@ PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad);
 PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink);
 PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad);
 PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad);
+PD_REGISTER_BASE_KERNEL_NAME(sigmoid_grad_grad, sigmoid_double_grad);

 PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping);

@@ -152,3 +172,12 @@ PD_REGISTER_ARG_MAPPING_FN(elu, phi::EluOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(elu_grad, phi::EluGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(elu_grad_grad, phi::EluDoubleGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(silu_grad, phi::SiluGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad, phi::SigmoidGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad_grad,
+                           phi::SigmoidDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(sigmoid_triple_grad,
+                           phi::SigmoidTripleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad,
+                           phi::LogSigmoidGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad,
+                           phi::HardSigmoidGradOpArgumentMapping);