Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
372ac08a
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
372ac08a
编写于
3月 23, 2021
作者:
N
niuliling123
提交者:
GitHub
3月 23, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add relu forward kernel and backward kernel (#31613)
* add relu forward kernel and backward kernel
上级
814b38e3
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
283 addition
and
1 deletion
+283
-1
paddle/fluid/operators/activation_op.cu
paddle/fluid/operators/activation_op.cu
+283
-1
未找到文件。
paddle/fluid/operators/activation_op.cu
浏览文件 @
372ac08a
...
...
@@ -10,8 +10,276 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/math_cuda_utils.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
float16
=
paddle
::
platform
::
float16
;
template
<
typename
T
>
struct
CudaVecType
{
using
type
=
T
;
static
constexpr
int
vecsize
=
1
;
};
template
<
>
struct
CudaVecType
<
platform
::
float16
>
{
using
type
=
__half2
;
static
constexpr
int
vecsize
=
2
;
};
template
<
>
struct
CudaVecType
<
float
>
{
using
type
=
float4
;
static
constexpr
int
vecsize
=
4
;
};
template
<
typename
T
>
class
BaseGPUFunctor
{
public:
using
ELEMENT_TYPE
=
T
;
};
/* ========================================================================== */
/* =========================== relu forward ============================ */
template
<
typename
T
>
class
ReluGPUFuctor
:
public
BaseGPUFunctor
<
T
>
{
private:
T
zero_
;
public:
ReluGPUFuctor
()
{
zero_
=
static_cast
<
T
>
(
0.0
f
);
}
// for relu forward when T is double
__device__
__forceinline__
typename
CudaVecType
<
T
>::
type
Compute
(
const
typename
CudaVecType
<
T
>::
type
*
x
);
// when num % vecsize != 0 this func will be used
__device__
__forceinline__
T
ComputeRemainder
(
const
T
x
)
{
return
x
>
zero_
?
x
:
zero_
;
}
};
template
<
>
__device__
__forceinline__
CudaVecType
<
double
>::
type
ReluGPUFuctor
<
double
>::
Compute
(
const
CudaVecType
<
double
>::
type
*
x
)
{
// relu forward : out = max(x, 0)
#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
return
__ldg
(
x
)
>
zero_
?
__ldg
(
x
)
:
zero_
;
#else
return
(
*
x
)
>
zero_
?
(
*
x
)
:
zero_
;
#endif
}
template
<
>
__device__
__forceinline__
CudaVecType
<
float
>::
type
ReluGPUFuctor
<
float
>::
Compute
(
const
CudaVecType
<
float
>::
type
*
xx
)
{
// relu forward : out = max(xx, 0)
return
make_float4
((
xx
->
x
>
zero_
)
*
(
xx
->
x
),
(
xx
->
y
>
zero_
)
*
(
xx
->
y
),
(
xx
->
z
>
zero_
)
*
(
xx
->
z
),
(
xx
->
w
>
zero_
)
*
(
xx
->
w
));
}
template
<
>
__device__
__forceinline__
CudaVecType
<
float16
>::
type
ReluGPUFuctor
<
float16
>::
Compute
(
const
CudaVecType
<
float16
>::
type
*
in
)
{
// relu forward : out = max(in, 0)
#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
const
half2
kzero
=
__float2half2_rn
(
0.0
f
);
return
__hmul2
(
__hgt2
(
__ldg
(
in
),
kzero
),
__ldg
(
in
));
#else
const
float2
xx
=
__half22float2
(
*
in
);
return
__floats2half2_rn
((
xx
.
x
>
0.0
f
)
*
static_cast
<
float
>
(
xx
.
x
),
(
xx
.
y
>
0.0
f
)
*
static_cast
<
float
>
(
xx
.
y
));
#endif
}
/* ========================================================================== */
/* =========================== relu backward ============================
*/
template
<
typename
T
>
class
ReluGradGPUFunctor
:
public
BaseGPUFunctor
<
T
>
{
private:
T
zero_
;
public:
ReluGradGPUFunctor
()
{
zero_
=
static_cast
<
T
>
(
0.0
f
);
}
// for relu backward when T is double
__device__
__forceinline__
typename
CudaVecType
<
T
>::
type
Compute
(
const
typename
CudaVecType
<
T
>::
type
*
out
,
const
typename
CudaVecType
<
T
>::
type
*
dout
);
// when num % vecsize != 0 this func will be used
__device__
__forceinline__
T
ComputeRemainder
(
const
T
out
,
const
T
dout
)
{
// relu backward : dx = out > 0 ? dout : 0;
return
out
>
zero_
?
dout
:
zero_
;
}
static
constexpr
ActBwdOpFwdDeps
FwdDeps
()
{
return
kDepOut
;
}
};
template
<
>
__device__
__forceinline__
CudaVecType
<
double
>::
type
ReluGradGPUFunctor
<
double
>::
Compute
(
const
CudaVecType
<
double
>::
type
*
out
,
const
CudaVecType
<
double
>::
type
*
dout
)
{
// relu backward : dx = out > 0 ? dout : 0;
#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
return
__ldg
(
out
)
>
zero_
?
__ldg
(
dout
)
:
zero_
;
#else
return
(
*
out
)
>
zero_
?
(
*
dout
)
:
zero_
;
#endif
}
template
<
>
__device__
__forceinline__
CudaVecType
<
float
>::
type
ReluGradGPUFunctor
<
float
>::
Compute
(
const
CudaVecType
<
float
>::
type
*
out
,
const
CudaVecType
<
float
>::
type
*
dout
)
{
// relu backward : dx = out > 0 ? dout : 0;
return
make_float4
((
out
->
x
>
zero_
)
*
(
dout
->
x
),
(
out
->
y
>
zero_
)
*
(
dout
->
y
),
(
out
->
z
>
zero_
)
*
(
dout
->
z
),
(
out
->
w
>
zero_
)
*
(
dout
->
w
));
}
template
<
>
__device__
__forceinline__
CudaVecType
<
float16
>::
type
ReluGradGPUFunctor
<
float16
>::
Compute
(
const
CudaVecType
<
float16
>::
type
*
out
,
const
CudaVecType
<
float16
>::
type
*
dout
)
{
// relu backward : dx = out > 0 ? dout : 0;
#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300
const
half2
kzero
=
__float2half2_rn
(
0.0
f
);
return
__hmul2
(
__hgt2
(
__ldg
(
out
),
kzero
),
__ldg
(
dout
));
#else
const
float2
xx
=
__half22float2
(
*
out
);
const
float2
yy
=
__half22float2
(
*
dout
);
return
__floats2half2_rn
((
xx
.
x
>
0.0
f
)
*
static_cast
<
float
>
(
yy
.
x
),
(
xx
.
y
>
0.0
f
)
*
static_cast
<
float
>
(
yy
.
y
));
#endif
}
/* ========================================================================== */
template
<
typename
T
,
typename
Functor
>
__global__
void
ActivationGradKernelVec
(
const
T
*
forward_data
,
const
T
*
dout
,
T
*
dx
,
int
num
,
Functor
functor
)
{
using
VecType
=
typename
CudaVecType
<
T
>::
type
;
constexpr
int
vecsize
=
CudaVecType
<
T
>::
vecsize
;
int
idx
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
int
stride
=
blockDim
.
x
*
gridDim
.
x
;
int
loop
=
num
/
vecsize
;
int
tail
=
num
%
vecsize
;
const
VecType
*
in_forward
=
reinterpret_cast
<
const
VecType
*>
(
forward_data
);
const
VecType
*
in_dout
=
reinterpret_cast
<
const
VecType
*>
(
dout
);
VecType
*
out
=
reinterpret_cast
<
VecType
*>
(
dx
);
for
(
int
i
=
idx
;
i
<
loop
;
i
+=
stride
)
{
out
[
i
]
=
functor
.
Compute
((
in_forward
+
i
),
(
in_dout
+
i
));
}
while
(
idx
==
loop
&&
tail
)
{
dx
[
num
-
tail
]
=
functor
.
ComputeRemainder
(
forward_data
[
num
-
tail
],
dout
[
num
-
tail
]);
--
tail
;
}
}
template
<
typename
T
,
typename
Functor
>
__global__
void
ActivationkernelVec
(
const
T
*
src
,
T
*
dst
,
int
num
,
Functor
functor
)
{
constexpr
int
vecsize
=
CudaVecType
<
T
>::
vecsize
;
using
VecType
=
typename
CudaVecType
<
T
>::
type
;
int
idx
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
int
stride
=
blockDim
.
x
*
gridDim
.
x
;
int
loop
=
num
/
vecsize
;
int
tail
=
num
%
vecsize
;
const
VecType
*
in
=
reinterpret_cast
<
const
VecType
*>
(
src
);
VecType
*
out
=
reinterpret_cast
<
VecType
*>
(
dst
);
for
(
int
i
=
idx
;
i
<
loop
;
i
+=
stride
)
{
out
[
i
]
=
functor
.
Compute
((
in
+
i
));
}
while
(
idx
==
loop
&&
tail
)
{
dst
[
num
-
tail
]
=
functor
.
ComputeRemainder
(
src
[
num
-
tail
]);
--
tail
;
}
}
template
<
typename
DeviceContext
,
typename
Functor
>
class
ActivationGPUKernel
:
public
framework
::
OpKernel
<
typename
Functor
::
ELEMENT_TYPE
>
{
public:
using
T
=
typename
Functor
::
ELEMENT_TYPE
;
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
const
framework
::
Tensor
*
in_x
=
nullptr
;
framework
::
Tensor
*
out
=
nullptr
;
ExtractActivationTensor
(
context
,
&
in_x
,
&
out
);
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
int
num
=
in_x
->
numel
();
const
T
*
input_data
=
in_x
->
data
<
T
>
();
T
*
output_data
=
out
->
mutable_data
<
T
>
(
dev_ctx
.
GetPlace
(),
static_cast
<
size_t
>
(
num
*
sizeof
(
T
)));
int
block
=
512
;
#ifdef __HIPCC__
block
=
256
;
#endif
Functor
functor
;
constexpr
int
vecsize
=
CudaVecType
<
T
>::
vecsize
;
int
grid
=
max
((
num
/
vecsize
+
block
-
1
)
/
block
,
1
);
ActivationkernelVec
<
T
,
Functor
><<<
grid
,
block
>>>
(
input_data
,
output_data
,
num
,
functor
);
}
};
template
<
typename
DeviceContext
,
typename
Functor
>
class
ActivationGradGPUKernel
:
public
framework
::
OpKernel
<
typename
Functor
::
ELEMENT_TYPE
>
{
public:
using
T
=
typename
Functor
::
ELEMENT_TYPE
;
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
const
framework
::
Tensor
*
x
,
*
out
,
*
d_out
;
framework
::
Tensor
*
d_x
=
nullptr
;
x
=
out
=
d_out
=
nullptr
;
ExtractActivationGradTensor
<
Functor
::
FwdDeps
()
>
(
context
,
&
x
,
&
out
,
&
d_out
,
&
d_x
);
int
numel
=
d_out
->
numel
();
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
auto
*
dx_data
=
d_x
->
mutable_data
<
T
>
(
dev_ctx
.
GetPlace
(),
static_cast
<
size_t
>
(
numel
*
sizeof
(
T
)));
auto
*
dout_data
=
d_out
->
data
<
T
>
();
auto
*
forward_data
=
dout_data
;
if
(
static_cast
<
int
>
(
Functor
::
FwdDeps
())
==
static_cast
<
int
>
(
kDepOut
))
{
// Only need forward output Out
forward_data
=
out
->
data
<
T
>
();
}
else
if
(
static_cast
<
int
>
(
Functor
::
FwdDeps
())
==
static_cast
<
int
>
(
kDepX
))
{
// Only need forward input X
forward_data
=
x
->
data
<
T
>
();
}
int
block
=
512
;
#ifdef __HIPCC__
block
=
256
;
#endif
Functor
functor
;
constexpr
int
vecsize
=
CudaVecType
<
T
>::
vecsize
;
int
grid
=
max
((
numel
/
vecsize
+
block
-
1
)
/
block
,
1
);
ActivationGradKernelVec
<
T
,
Functor
><<<
grid
,
block
>>>
(
forward_data
,
dout_data
,
dx_data
,
numel
,
functor
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
...
...
@@ -60,7 +328,21 @@ REGISTER_OP_CUDA_KERNEL(
/* ========================================================================== */
/* =========================== relu register ============================ */
REGISTER_ACTIVATION_CUDA_KERNEL
(
relu
,
Relu
,
ReluCUDAFunctor
,
ReluGradFunctor
);
REGISTER_OP_CUDA_KERNEL
(
relu
,
ops
::
ActivationGPUKernel
<
paddle
::
platform
::
CUDADeviceContext
,
ops
::
ReluGPUFuctor
<
float
>>
,
ops
::
ActivationGPUKernel
<
paddle
::
platform
::
CUDADeviceContext
,
ops
::
ReluGPUFuctor
<
double
>>
,
ops
::
ActivationGPUKernel
<
plat
::
CUDADeviceContext
,
ops
::
ReluGPUFuctor
<
plat
::
float16
>>
);
REGISTER_OP_CUDA_KERNEL
(
relu_grad
,
ops
::
ActivationGradGPUKernel
<
paddle
::
platform
::
CUDADeviceContext
,
ops
::
ReluGradGPUFunctor
<
float
>>
,
ops
::
ActivationGradGPUKernel
<
paddle
::
platform
::
CUDADeviceContext
,
ops
::
ReluGradGPUFunctor
<
double
>>
,
ops
::
ActivationGradGPUKernel
<
plat
::
CUDADeviceContext
,
ops
::
ReluGradGPUFunctor
<
plat
::
float16
>>
);
REGISTER_OP_CUDA_KERNEL
(
relu_grad_grad
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录