Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
7879477f
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
7879477f
编写于
4月 23, 2021
作者:
R
ronnywang
提交者:
GitHub
4月 23, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[ROCM] add cuda kenrel for batch_norm_op (#32393)
上级
49773f36
变更
2
展开全部
隐藏空白更改
内联
并排
Showing
2 changed file
with
409 addition
and
140 deletion
+409
-140
paddle/fluid/operators/batch_norm_op.cu
paddle/fluid/operators/batch_norm_op.cu
+386
-120
paddle/fluid/operators/norm_utils.cu.h
paddle/fluid/operators/norm_utils.cu.h
+23
-20
未找到文件。
paddle/fluid/operators/batch_norm_op.cu
浏览文件 @
7879477f
此差异已折叠。
点击以展开。
paddle/fluid/operators/norm_utils.cu.h
浏览文件 @
7879477f
...
@@ -32,6 +32,12 @@ namespace cub = hipcub;
...
@@ -32,6 +32,12 @@ namespace cub = hipcub;
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#endif
#ifdef __HIPCC__
#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim)
#else
#define LAUNCH_BOUNDS(BlockDim)
#endif
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -58,12 +64,10 @@ using DataLayout = framework::DataLayout;
...
@@ -58,12 +64,10 @@ using DataLayout = framework::DataLayout;
// axis=(n,h,w)))
// axis=(n,h,w)))
template
<
typename
T
,
int
BlockDim
,
framework
::
DataLayout
layout
>
template
<
typename
T
,
int
BlockDim
,
framework
::
DataLayout
layout
>
__global__
void
DoubleGradComputeDX
(
const
T
*
x
,
const
T
*
mean
,
__global__
LAUNCH_BOUNDS
(
BlockDim
)
void
DoubleGradComputeDX
(
const
T
*
variance
,
const
T
*
ddx
,
const
T
*
x
,
const
T
*
mean
,
const
T
*
variance
,
const
T
*
ddx
,
const
T
*
dy
,
const
T
*
dy
,
const
T
*
scale
,
const
T
*
scale
,
const
T
*
ddscale
,
const
int
N
,
const
int
C
,
const
T
*
ddscale
,
const
int
N
,
const
int
C
,
const
int
sample_size
,
const
double
epsilon
,
T
*
dx
)
{
const
int
sample_size
,
const
double
epsilon
,
T
*
dx
)
{
const
int
outer_size
=
C
;
const
int
outer_size
=
C
;
const
int
inner_size
=
N
*
sample_size
;
const
int
inner_size
=
N
*
sample_size
;
...
@@ -160,12 +164,10 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean,
...
@@ -160,12 +164,10 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean,
// scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) *
// scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) *
// np.mean(ddx * (x - mean), axis=(n,h,w)))
// np.mean(ddx * (x - mean), axis=(n,h,w)))
template
<
typename
T
,
int
BlockDim
,
framework
::
DataLayout
layout
>
template
<
typename
T
,
int
BlockDim
,
framework
::
DataLayout
layout
>
__global__
void
DoubleGradComputeDDY
(
const
T
*
x
,
const
T
*
mean
,
__global__
LAUNCH_BOUNDS
(
BlockDim
)
void
DoubleGradComputeDDY
(
const
T
*
variance
,
const
T
*
ddscale
,
const
T
*
x
,
const
T
*
mean
,
const
T
*
variance
,
const
T
*
ddscale
,
const
T
*
ddbias
,
const
T
*
ddx
,
const
T
*
ddbias
,
const
T
*
ddx
,
const
T
*
scale
,
const
int
N
,
const
int
C
,
const
T
*
scale
,
const
int
N
,
const
int
C
,
const
int
sample_size
,
const
double
epsilon
,
T
*
ddy
)
{
const
int
sample_size
,
const
double
epsilon
,
T
*
ddy
)
{
const
int
outer_size
=
C
;
const
int
outer_size
=
C
;
const
int
inner_size
=
N
*
sample_size
;
const
int
inner_size
=
N
*
sample_size
;
...
@@ -238,11 +240,10 @@ __global__ void DoubleGradComputeDDY(const T *x, const T *mean,
...
@@ -238,11 +240,10 @@ __global__ void DoubleGradComputeDDY(const T *x, const T *mean,
// inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) *
// inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) *
// ddx
// ddx
template
<
typename
T
,
int
BlockDim
,
framework
::
DataLayout
layout
>
template
<
typename
T
,
int
BlockDim
,
framework
::
DataLayout
layout
>
__global__
void
DoubleGradComputeDScale
(
const
T
*
x
,
const
T
*
mean
,
__global__
LAUNCH_BOUNDS
(
BlockDim
)
void
DoubleGradComputeDScale
(
const
T
*
variance
,
const
T
*
ddx
,
const
T
*
x
,
const
T
*
mean
,
const
T
*
variance
,
const
T
*
ddx
,
const
T
*
dy
,
const
T
*
dy
,
const
int
N
,
const
int
C
,
const
int
N
,
const
int
C
,
const
int
sample_size
,
const
double
epsilon
,
const
int
sample_size
,
T
*
dscale
)
{
const
double
epsilon
,
T
*
dscale
)
{
const
int
outer_size
=
C
;
const
int
outer_size
=
C
;
const
int
inner_size
=
N
*
sample_size
;
const
int
inner_size
=
N
*
sample_size
;
...
@@ -302,7 +303,7 @@ __global__ void DoubleGradComputeDScale(const T *x, const T *mean,
...
@@ -302,7 +303,7 @@ __global__ void DoubleGradComputeDScale(const T *x, const T *mean,
// math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var
// math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var
template
<
typename
T
,
int
BlockDim
,
framework
::
DataLayout
layout
>
template
<
typename
T
,
int
BlockDim
,
framework
::
DataLayout
layout
>
__global__
void
DoubleGradComputeDScaleWithGlobal
(
__global__
LAUNCH_BOUNDS
(
BlockDim
)
void
DoubleGradComputeDScaleWithGlobal
(
const
T
*
ddx
,
const
T
*
variance
,
const
T
*
dy
,
const
double
epsilon
,
const
T
*
ddx
,
const
T
*
variance
,
const
T
*
dy
,
const
double
epsilon
,
const
int
N
,
const
int
C
,
const
int
sample_size
,
T
*
dscale
)
{
const
int
N
,
const
int
C
,
const
int
sample_size
,
T
*
dscale
)
{
int
outer_size
=
C
;
int
outer_size
=
C
;
...
@@ -422,8 +423,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
...
@@ -422,8 +423,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
set_constant
(
dev_ctx
,
&
scale_tmp
,
static_cast
<
T
>
(
1
));
set_constant
(
dev_ctx
,
&
scale_tmp
,
static_cast
<
T
>
(
1
));
}
}
const
T
*
scale_data
=
Scale
?
Scale
->
data
<
T
>
()
:
scale_tmp
.
data
<
T
>
();
const
T
*
scale_data
=
Scale
?
Scale
->
data
<
T
>
()
:
scale_tmp
.
data
<
T
>
();
#ifdef __HIPCC__
const
int
block
=
256
;
#else
const
int
block
=
512
;
const
int
block
=
512
;
#endif
int
max_threads
=
dev_ctx
.
GetMaxPhysicalThreadCount
();
int
max_threads
=
dev_ctx
.
GetMaxPhysicalThreadCount
();
const
int
max_blocks
=
std
::
max
(
max_threads
/
block
,
1
);
const
int
max_blocks
=
std
::
max
(
max_threads
/
block
,
1
);
int
grid
=
std
::
min
(
C
,
max_blocks
);
int
grid
=
std
::
min
(
C
,
max_blocks
);
...
@@ -532,6 +536,5 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
...
@@ -532,6 +536,5 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx,
}
}
}
}
}
}
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录