Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
a71d72d9
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
a71d72d9
编写于
3月 29, 2021
作者:
N
niuliling123
提交者:
GitHub
3月 29, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
relu forward and backward with vectortype (#31869)
上级
8829a309
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
285 addition
and
1 deletion
+285
-1
paddle/fluid/operators/activation_op.cu
paddle/fluid/operators/activation_op.cu
+285
-1
未找到文件。
paddle/fluid/operators/activation_op.cu
浏览文件 @
a71d72d9
...
...
@@ -10,8 +10,278 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/math_cuda_utils.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
float16
=
paddle
::
platform
::
float16
;
template
<
typename
T
>
struct
CudaVecType
{
using
type
=
T
;
static
constexpr
int
vecsize
=
1
;
};
template
<
>
struct
CudaVecType
<
platform
::
float16
>
{
using
type
=
__half2
;
static
constexpr
int
vecsize
=
2
;
};
template
<
>
struct
CudaVecType
<
float
>
{
using
type
=
float4
;
static
constexpr
int
vecsize
=
4
;
};
template
<
typename
T
>
class
BaseGPUFunctor
{
public:
using
ELEMENT_TYPE
=
T
;
};
/* ========================================================================== */
/* =========================== relu forward ============================ */
template
<
typename
T
>
class
ReluGPUFunctor
:
public
BaseGPUFunctor
<
T
>
{
private:
T
zero_
;
public:
ReluGPUFunctor
()
{
zero_
=
static_cast
<
T
>
(
0.0
f
);
}
// for relu forward when T is double
__device__
__forceinline__
typename
CudaVecType
<
T
>::
type
Compute
(
const
typename
CudaVecType
<
T
>::
type
*
x
);
// when num % vecsize != 0 this func will be used
__device__
__forceinline__
T
ComputeRemainder
(
const
T
x
)
{
return
x
>
zero_
?
x
:
zero_
;
}
};
template
<
>
__device__
__forceinline__
CudaVecType
<
double
>::
type
ReluGPUFunctor
<
double
>::
Compute
(
const
CudaVecType
<
double
>::
type
*
x
)
{
// relu forward : out = max(x, 0)
#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350
return
__ldg
(
x
)
>
zero_
?
__ldg
(
x
)
:
zero_
;
#else
return
(
*
x
)
>
zero_
?
(
*
x
)
:
zero_
;
#endif
}
template
<
>
__device__
__forceinline__
CudaVecType
<
float
>::
type
ReluGPUFunctor
<
float
>::
Compute
(
const
CudaVecType
<
float
>::
type
*
xx
)
{
// relu forward : out = max(xx, 0)
return
make_float4
((
xx
->
x
>
zero_
)
*
(
xx
->
x
),
(
xx
->
y
>
zero_
)
*
(
xx
->
y
),
(
xx
->
z
>
zero_
)
*
(
xx
->
z
),
(
xx
->
w
>
zero_
)
*
(
xx
->
w
));
}
template
<
>
__device__
__forceinline__
CudaVecType
<
float16
>::
type
ReluGPUFunctor
<
float16
>::
Compute
(
const
CudaVecType
<
float16
>::
type
*
in
)
{
// relu forward : out = max(in, 0)
#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
const
half2
kzero
=
__float2half2_rn
(
0.0
f
);
return
__hmul2
(
__hgt2
(
__ldg
(
in
),
kzero
),
__ldg
(
in
));
#else
const
float2
xx
=
__half22float2
(
*
in
);
return
__floats2half2_rn
((
xx
.
x
>
0.0
f
)
*
static_cast
<
float
>
(
xx
.
x
),
(
xx
.
y
>
0.0
f
)
*
static_cast
<
float
>
(
xx
.
y
));
#endif
}
/* ========================================================================== */
/* =========================== relu backward ============================
*/
template
<
typename
T
>
class
ReluGradGPUFunctor
:
public
BaseGPUFunctor
<
T
>
{
private:
T
zero_
;
public:
ReluGradGPUFunctor
()
{
zero_
=
static_cast
<
T
>
(
0.0
f
);
}
// for relu backward when T is double
__device__
__forceinline__
typename
CudaVecType
<
T
>::
type
Compute
(
const
typename
CudaVecType
<
T
>::
type
*
out
,
const
typename
CudaVecType
<
T
>::
type
*
dout
);
// when num % vecsize != 0 this func will be used
__device__
__forceinline__
T
ComputeRemainder
(
const
T
out
,
const
T
dout
)
{
// relu backward : dx = out > 0 ? dout : 0
return
out
>
zero_
?
dout
:
zero_
;
}
static
constexpr
ActBwdOpFwdDeps
FwdDeps
()
{
return
kDepOut
;
}
};
template
<
>
__device__
__forceinline__
CudaVecType
<
double
>::
type
ReluGradGPUFunctor
<
double
>::
Compute
(
const
CudaVecType
<
double
>::
type
*
out
,
const
CudaVecType
<
double
>::
type
*
dout
)
{
// relu backward : dx = out > 0 ? dout : 0;
#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350
return
__ldg
(
out
)
>
zero_
?
__ldg
(
dout
)
:
zero_
;
#else
return
(
*
out
)
>
zero_
?
(
*
dout
)
:
zero_
;
#endif
}
template
<
>
__device__
__forceinline__
CudaVecType
<
float
>::
type
ReluGradGPUFunctor
<
float
>::
Compute
(
const
CudaVecType
<
float
>::
type
*
out
,
const
CudaVecType
<
float
>::
type
*
dout
)
{
// relu backward : dx = out > 0 ? dout : 0;
return
make_float4
((
out
->
x
>
zero_
)
*
(
dout
->
x
),
(
out
->
y
>
zero_
)
*
(
dout
->
y
),
(
out
->
z
>
zero_
)
*
(
dout
->
z
),
(
out
->
w
>
zero_
)
*
(
dout
->
w
));
}
template
<
>
__device__
__forceinline__
CudaVecType
<
float16
>::
type
ReluGradGPUFunctor
<
float16
>::
Compute
(
const
CudaVecType
<
float16
>::
type
*
out
,
const
CudaVecType
<
float16
>::
type
*
dout
)
{
// relu backward : dx = out > 0 ? dout : 0;
#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
const
half2
kzero
=
__float2half2_rn
(
0.0
f
);
return
__hmul2
(
__hgt2
(
__ldg
(
out
),
kzero
),
__ldg
(
dout
));
#else
const
float2
xx
=
__half22float2
(
*
out
);
const
float2
yy
=
__half22float2
(
*
dout
);
return
__floats2half2_rn
((
xx
.
x
>
0.0
f
)
*
static_cast
<
float
>
(
yy
.
x
),
(
xx
.
y
>
0.0
f
)
*
static_cast
<
float
>
(
yy
.
y
));
#endif
}
/* ========================================================================== */
template
<
typename
T
,
typename
Functor
>
__global__
void
ActivationGradKernelVec
(
const
T
*
forward_data
,
const
T
*
dout
,
T
*
dx
,
int
num
,
Functor
functor
)
{
using
VecType
=
typename
CudaVecType
<
T
>::
type
;
constexpr
int
vecsize
=
CudaVecType
<
T
>::
vecsize
;
int
idx
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
int
stride
=
blockDim
.
x
*
gridDim
.
x
;
int
loop
=
num
/
vecsize
;
int
tail
=
num
%
vecsize
;
const
VecType
*
in_forward
=
reinterpret_cast
<
const
VecType
*>
(
forward_data
);
const
VecType
*
in_dout
=
reinterpret_cast
<
const
VecType
*>
(
dout
);
VecType
*
out
=
reinterpret_cast
<
VecType
*>
(
dx
);
for
(
int
i
=
idx
;
i
<
loop
;
i
+=
stride
)
{
out
[
i
]
=
functor
.
Compute
((
in_forward
+
i
),
(
in_dout
+
i
));
}
while
(
idx
==
loop
&&
tail
)
{
dx
[
num
-
tail
]
=
functor
.
ComputeRemainder
(
forward_data
[
num
-
tail
],
dout
[
num
-
tail
]);
--
tail
;
}
}
template
<
typename
T
,
typename
Functor
>
__global__
void
ActivationkernelVec
(
const
T
*
src
,
T
*
dst
,
int
num
,
Functor
functor
)
{
constexpr
int
vecsize
=
CudaVecType
<
T
>::
vecsize
;
using
VecType
=
typename
CudaVecType
<
T
>::
type
;
int
idx
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
int
stride
=
blockDim
.
x
*
gridDim
.
x
;
int
loop
=
num
/
vecsize
;
int
tail
=
num
%
vecsize
;
const
VecType
*
in
=
reinterpret_cast
<
const
VecType
*>
(
src
);
VecType
*
out
=
reinterpret_cast
<
VecType
*>
(
dst
);
for
(
int
i
=
idx
;
i
<
loop
;
i
+=
stride
)
{
out
[
i
]
=
functor
.
Compute
((
in
+
i
));
}
while
(
idx
==
loop
&&
tail
)
{
dst
[
num
-
tail
]
=
functor
.
ComputeRemainder
(
src
[
num
-
tail
]);
--
tail
;
}
}
template
<
typename
DeviceContext
,
typename
Functor
>
class
ActivationGPUKernel
:
public
framework
::
OpKernel
<
typename
Functor
::
ELEMENT_TYPE
>
{
public:
using
T
=
typename
Functor
::
ELEMENT_TYPE
;
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
const
framework
::
Tensor
*
in_x
=
nullptr
;
framework
::
Tensor
*
out
=
nullptr
;
ExtractActivationTensor
(
context
,
&
in_x
,
&
out
);
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
int
num
=
in_x
->
numel
();
const
T
*
input_data
=
in_x
->
data
<
T
>
();
T
*
output_data
=
out
->
mutable_data
<
T
>
(
dev_ctx
.
GetPlace
(),
static_cast
<
size_t
>
(
num
*
sizeof
(
T
)));
int
block
=
512
;
#ifdef __HIPCC__
block
=
256
;
#endif
Functor
functor
;
constexpr
int
vecsize
=
CudaVecType
<
T
>::
vecsize
;
int
grid
=
max
((
num
/
vecsize
+
block
-
1
)
/
block
,
1
);
auto
stream
=
context
.
cuda_device_context
().
stream
();
ActivationkernelVec
<
T
,
Functor
><<<
grid
,
block
,
0
,
stream
>>>
(
input_data
,
output_data
,
num
,
functor
);
}
};
template
<
typename
DeviceContext
,
typename
Functor
>
class
ActivationGradGPUKernel
:
public
framework
::
OpKernel
<
typename
Functor
::
ELEMENT_TYPE
>
{
public:
using
T
=
typename
Functor
::
ELEMENT_TYPE
;
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
const
framework
::
Tensor
*
x
,
*
out
,
*
d_out
;
framework
::
Tensor
*
d_x
=
nullptr
;
x
=
out
=
d_out
=
nullptr
;
ExtractActivationGradTensor
<
Functor
::
FwdDeps
()
>
(
context
,
&
x
,
&
out
,
&
d_out
,
&
d_x
);
int
numel
=
d_out
->
numel
();
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
auto
*
dx_data
=
d_x
->
mutable_data
<
T
>
(
dev_ctx
.
GetPlace
(),
static_cast
<
size_t
>
(
numel
*
sizeof
(
T
)));
auto
*
dout_data
=
d_out
->
data
<
T
>
();
auto
*
forward_data
=
dout_data
;
if
(
static_cast
<
int
>
(
Functor
::
FwdDeps
())
==
static_cast
<
int
>
(
kDepOut
))
{
// Only need forward output Out
forward_data
=
out
->
data
<
T
>
();
}
else
if
(
static_cast
<
int
>
(
Functor
::
FwdDeps
())
==
static_cast
<
int
>
(
kDepX
))
{
// Only need forward input X
forward_data
=
x
->
data
<
T
>
();
}
int
block
=
512
;
#ifdef __HIPCC__
block
=
256
;
#endif
Functor
functor
;
constexpr
int
vecsize
=
CudaVecType
<
T
>::
vecsize
;
int
grid
=
max
((
numel
/
vecsize
+
block
-
1
)
/
block
,
1
);
auto
stream
=
context
.
cuda_device_context
().
stream
();
ActivationGradKernelVec
<
T
,
Functor
><<<
grid
,
block
,
0
,
stream
>>>
(
forward_data
,
dout_data
,
dx_data
,
numel
,
functor
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
...
...
@@ -60,7 +330,21 @@ REGISTER_OP_CUDA_KERNEL(
/* ========================================================================== */
/* =========================== relu register ============================ */
REGISTER_ACTIVATION_CUDA_KERNEL
(
relu
,
Relu
,
ReluCUDAFunctor
,
ReluGradFunctor
);
REGISTER_OP_CUDA_KERNEL
(
relu
,
ops
::
ActivationGPUKernel
<
paddle
::
platform
::
CUDADeviceContext
,
ops
::
ReluGPUFunctor
<
float
>>
,
ops
::
ActivationGPUKernel
<
paddle
::
platform
::
CUDADeviceContext
,
ops
::
ReluGPUFunctor
<
double
>>
,
ops
::
ActivationGPUKernel
<
plat
::
CUDADeviceContext
,
ops
::
ReluGPUFunctor
<
plat
::
float16
>>
);
REGISTER_OP_CUDA_KERNEL
(
relu_grad
,
ops
::
ActivationGradGPUKernel
<
paddle
::
platform
::
CUDADeviceContext
,
ops
::
ReluGradGPUFunctor
<
float
>>
,
ops
::
ActivationGradGPUKernel
<
paddle
::
platform
::
CUDADeviceContext
,
ops
::
ReluGradGPUFunctor
<
double
>>
,
ops
::
ActivationGradGPUKernel
<
plat
::
CUDADeviceContext
,
ops
::
ReluGradGPUFunctor
<
plat
::
float16
>>
);
REGISTER_OP_CUDA_KERNEL
(
relu_grad_grad
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录