Unverified commit 1bc47c84, authored on Jul 14, 2022 by Yao Zihang, committed by GitHub on Jul 14, 2022.
Optimize batchnorm1d using 2D kernel (#43530)
Parent: a2c4c86b
Showing 3 changed files with 549 additions and 38 deletions (+549 −38)
paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu  +4 −2
paddle/phi/kernels/gpu/batch_norm_kernel.cu  +511 −11
python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py  +34 −25
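The change, in short: when the batch is too large for cuDNN (the BatchNorm1D case), training-mode batch_norm now runs through a pair of hand-written kernels laid out on a 2D grid, one computing the per-channel mean/variance and one writing the normalized output, instead of the old single 1D-grid BNForwardTraining launch. The statistics themselves are the usual ones; as a rough CPU reference of what the new kernels compute per channel (not part of the commit; the function name, the [N, C] channel-last layout, and the values in main() are mine):

    // CPU reference for the per-channel statistics the new kernels produce.
    // Illustrative only: names, the [N, C] channel-last layout and the main()
    // values are assumptions, not part of the commit.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    void batch_norm_train_ref(const std::vector<float> &x,  // N x C, channel-last
                              int N, int C, double epsilon,
                              double exponential_average_factor,
                              const std::vector<float> &scale,
                              const std::vector<float> &bias,
                              std::vector<float> *y,
                              std::vector<float> *global_mean,
                              std::vector<float> *global_variance) {
      for (int c = 0; c < C; ++c) {
        double sum = 0.0, square_sum = 0.0;
        for (int n = 0; n < N; ++n) {
          double v = x[n * C + c];
          sum += v;
          square_sum += v * v;
        }
        double mean = sum / N;
        double var = square_sum / N - mean * mean;  // E[x^2] - E[x]^2
        double inv_std = 1.0 / std::sqrt(var + epsilon);
        for (int n = 0; n < N; ++n) {
          (*y)[n * C + c] =
              scale[c] * (x[n * C + c] - mean) * inv_std + bias[c];
        }
        // running-statistics update, same form as in the kernels
        (*global_mean)[c] = (1 - exponential_average_factor) * mean +
                            exponential_average_factor * (*global_mean)[c];
        (*global_variance)[c] = (1 - exponential_average_factor) * var +
                                exponential_average_factor * (*global_variance)[c];
      }
    }

    int main() {
      const int N = 8, C = 2;
      std::vector<float> x(N * C, 1.0f), y(N * C);
      std::vector<float> scale(C, 1.0f), bias(C, 0.0f);
      std::vector<float> global_mean(C, 0.0f), global_var(C, 1.0f);
      x[0] = 3.0f;
      batch_norm_train_ref(x, N, C, 1e-5, 0.9, scale, bias, &y, &global_mean,
                           &global_var);
      std::printf("y[0] = %f, global_mean[0] = %f\n", y[0], global_mean[0]);
      return 0;
    }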
paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
@@ -591,10 +591,12 @@ void BatchNormGradRawKernel(const Context &ctx,
 //                 ctx.GetPlace()),
 //             epsilon, saved_mean_data, saved_var_data));
 #else
-    // CUDNN only support small batch size
+    // CUDNN PER_ACTIVATION mode only support small batch size
     const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070;
+    const size_t CUDNN_SPATIAL_THRESHOLD = 880801;
     const bool use_native_kernel =
-        (x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD);
+        ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
+         (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD));
     if (use_native_kernel) {
       if (compute_format == DataLayout::kNCHW) {
         BNBackward<T, block, DataLayout::kNCHW>
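Both the backward and the forward file gate the native path on the same predicate: use the hand-written kernels only when cuDNN would be pushed past its limits, i.e. a rank-2 input with N >= 131070 (the PER_ACTIVATION threshold) or a rank-3 input with N >= 880801 (the SPATIAL threshold). A standalone restatement of that predicate, for illustration only (the helper name and the sample calls are mine; the thresholds and conditions come from the diff):

    #include <cstddef>
    #include <cstdio>

    // Mirrors the use_native_kernel condition added in this commit.
    bool use_native_batch_norm_kernel(int rank, std::size_t N) {
      const std::size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070;
      const std::size_t CUDNN_SPATIAL_THRESHOLD = 880801;
      return (rank == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
             (rank == 3 && N >= CUDNN_SPATIAL_THRESHOLD);
    }

    int main() {
      // Shapes used by the new unit test: [200000, 4] and [1000000, 4, 4].
      std::printf("%d\n", use_native_batch_norm_kernel(2, 200000));   // 1
      std::printf("%d\n", use_native_batch_norm_kernel(3, 1000000));  // 1
      std::printf("%d\n", use_native_batch_norm_kernel(2, 1024));     // 0
      return 0;
    }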
paddle/phi/kernels/gpu/batch_norm_kernel.cu
@@ -31,6 +31,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/batch_norm_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/norm_utils.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
#ifdef __HIPCC__
...
...
@@ -137,6 +138,398 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining(
   }
 }
 
+template <typename T>
+__device__ __forceinline__ void merge_block_vertical(
+    BatchNormParamType<T> x_sum,
+    BatchNormParamType<T> x_square_sum,
+    BatchNormParamType<T> *smem_sum,
+    BatchNormParamType<T> *smem_square_sum,
+    BatchNormParamType<T> *x_sum_out,
+    BatchNormParamType<T> *x_square_sum_out) {
+  int tid = threadIdx.x + threadIdx.y * blockDim.x;
+#pragma unroll
+  for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) {
+    if (threadIdx.y < offset * 2) {
+      smem_sum[tid] = x_sum;
+      smem_square_sum[tid] = x_square_sum;
+    }
+    __syncthreads();
+    if (threadIdx.y < offset) {
+      int pair_tid = tid + offset * blockDim.x;
+      x_sum += smem_sum[pair_tid];
+      x_square_sum += smem_square_sum[pair_tid];
+    }
+  }
+  if (threadIdx.y == 0) {
+    *x_sum_out = x_sum;
+    *x_square_sum_out = x_square_sum;
+  }
+}
+
+template <typename T>
+__device__ __forceinline__ void merge_block_horizonal(
+    BatchNormParamType<T> x_sum,
+    BatchNormParamType<T> x_square_sum,
+    BatchNormParamType<T> *smem_sum,
+    BatchNormParamType<T> *smem_square_sum,
+    BatchNormParamType<T> *x_sum_out,
+    BatchNormParamType<T> *x_square_sum_out) {
+  int tid = threadIdx.x + threadIdx.y * blockDim.x;
+#pragma unroll
+  for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
+    if (threadIdx.x < offset * 2) {
+      smem_sum[tid] = x_sum;
+      smem_square_sum[tid] = x_square_sum;
+    }
+    __syncthreads();
+    if (threadIdx.x < offset) {
+      int pair_tid = tid + offset;
+      x_sum += smem_sum[pair_tid];
+      x_square_sum += smem_square_sum[pair_tid];
+    }
+  }
+  if (threadIdx.x == 0) {
+    *x_sum_out = x_sum;
+    *x_square_sum_out = x_square_sum;
+  }
+}
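merge_block_vertical and merge_block_horizonal fold the per-thread partial sums of one block together in shared memory, halving the number of participating threads along blockDim.y (respectively blockDim.x) on every iteration. A host-side sketch of the same halving scheme on a plain array, assuming a power-of-two extent as the launch configuration later in this diff arranges via GetLastPow2 (illustrative only, not the device code):

    #include <cstdio>
    #include <vector>

    // Pairwise-halving reduction over "threads" 0..n-1, mimicking the
    // shared-memory loop in merge_block_vertical (n plays blockDim.y).
    // n is assumed to be a power of two.
    double block_reduce(std::vector<double> vals) {
      int n = static_cast<int>(vals.size());
      for (int offset = n / 2; offset > 0; offset >>= 1) {
        for (int tid = 0; tid < offset; ++tid) {
          vals[tid] += vals[tid + offset];  // thread tid reads its pair
        }
        // __syncthreads() would go here on the device
      }
      return vals[0];  // "threadIdx == 0" holds the block-wide sum
    }

    int main() {
      std::vector<double> partial = {1, 2, 3, 4, 5, 6, 7, 8};
      std::printf("sum = %f\n", block_reduce(partial));  // 36
      return 0;
    }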
+
+template <typename T, int BlockDim>
+static __global__ void BNForwardTraining2DChannelLastCompStat(
+    const T *x,
+    const BatchNormParamType<T> *scale,
+    const BatchNormParamType<T> *bias,
+    const int C,
+    const int N,
+    const int HxW,
+    const double epsilon,
+    double exponentialAverageFactor,
+    T *y,
+    BatchNormParamType<T> *global_mean,
+    BatchNormParamType<T> *global_variance,
+    BatchNormParamType<T> *save_mean,
+    BatchNormParamType<T> *save_inv_variance,
+    BatchNormParamType<T> *compute_mean,
+    BatchNormParamType<T> *compute_inv_var,
+    BatchNormParamType<T> *block_data_ptr,
+    int *flag_ptr) {
+  int outer_size = C;
+  int inner_size = N * HxW;
+
+  __shared__ BatchNormParamType<T> smem_sum[BlockDim];
+  __shared__ BatchNormParamType<T> smem_square_sum[BlockDim];
+
+  int outer_loop_stride = gridDim.x * blockDim.x;
+  int inner_loop_stride = gridDim.y * blockDim.y;
+
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size;
+       i += outer_loop_stride) {
+    BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
+    BatchNormParamType<T> x_square_sum = static_cast<BatchNormParamType<T>>(0);
+
+    for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size;
+         j += inner_loop_stride) {
+      const int index = j * outer_size + i;
+      BatchNormParamType<T> x_i = static_cast<BatchNormParamType<T>>(x[index]);
+      x_sum += x_i;
+      x_square_sum += x_i * x_i;
+    }
+
+    // vertical block sum
+    merge_block_vertical<T>(x_sum,
+                            x_square_sum,
+                            &smem_sum[0],
+                            &smem_square_sum[0],
+                            &x_sum,
+                            &x_square_sum);
+
+    if (gridDim.y > 1) {
+      volatile BatchNormParamType<T> *staging_sum = block_data_ptr;
+      volatile BatchNormParamType<T> *staging_square_sum =
+          &block_data_ptr[C * gridDim.y];
+      // write block data to global memory
+      if (threadIdx.y == 0) {
+        staging_sum[i + blockIdx.y * C] = x_sum;
+        staging_square_sum[i + blockIdx.y * C] = x_square_sum;
+      }
+
+      // make sure write is visible to all blocks
+      __threadfence();
+      __syncthreads();
+
+      __shared__ bool is_last_block_done;
+      // mark block done
+      if (threadIdx.x == 0 && threadIdx.y == 0) {
+        int old = atomicAdd(&flag_ptr[blockIdx.x], 1);
+        is_last_block_done = (old == (gridDim.y - 1));
+      }
+
+      __syncthreads();
+
+      if (is_last_block_done) {
+        x_sum = static_cast<BatchNormParamType<T>>(0);
+        x_square_sum = static_cast<BatchNormParamType<T>>(0);
+        // thread sum
+        for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) {
+          x_sum += staging_sum[i + y * C];
+          x_square_sum += staging_square_sum[i + y * C];
+        }
+
+        // vertical block sum
+        merge_block_vertical<T>(x_sum,
+                                x_square_sum,
+                                &smem_sum[0],
+                                &smem_square_sum[0],
+                                &x_sum,
+                                &x_square_sum);
+
+        // final compute
+        if (threadIdx.y == 0) {
+          BatchNormParamType<T> compute_mean_val = x_sum / inner_size;
+          BatchNormParamType<T> variance_val =
+              x_square_sum / inner_size - compute_mean_val * compute_mean_val;
+          BatchNormParamType<T> compute_inv_var_val =
+              1 / sqrt(variance_val + epsilon);
+
+          if (save_mean && save_inv_variance) {
+            save_mean[i] = compute_mean_val;
+            save_inv_variance[i] = compute_inv_var_val;
+          }
+          global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val +
+                           exponentialAverageFactor * global_mean[i];
+          global_variance[i] = (1 - exponentialAverageFactor) * variance_val +
+                               exponentialAverageFactor * global_variance[i];
+
+          compute_mean[i] = compute_mean_val;
+          compute_inv_var[i] = compute_inv_var_val;
+        }
+      }
+    } else {
+      if (blockIdx.y == 0 && threadIdx.y == 0) {
+        BatchNormParamType<T> compute_mean_val = x_sum / inner_size;
+        BatchNormParamType<T> variance_val =
+            x_square_sum / inner_size - compute_mean_val * compute_mean_val;
+        BatchNormParamType<T> compute_inv_var_val =
+            1 / sqrt(variance_val + epsilon);
+
+        if (save_mean && save_inv_variance) {
+          save_mean[i] = compute_mean_val;
+          save_inv_variance[i] = compute_inv_var_val;
+        }
+        global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val +
+                         exponentialAverageFactor * global_mean[i];
+        global_variance[i] = (1 - exponentialAverageFactor) * variance_val +
+                             exponentialAverageFactor * global_variance[i];
+
+        compute_mean[i] = compute_mean_val;
+        compute_inv_var[i] = compute_inv_var_val;
+      }
+    }
+  }
+}
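When the reduction dimension is split over several blocks (gridDim.y > 1 here), each block stages its partial sums in global memory and then bumps a per-channel-column counter with atomicAdd; the block that sees the counter reach gridDim.y - 1 knows it is the last one and folds all staged partials into the final statistics. A minimal host-side sketch of that last-arriver-finalizes pattern with std::atomic (illustrative only; the real kernel additionally relies on __threadfence/__syncthreads):

    #include <atomic>
    #include <cstdio>
    #include <thread>
    #include <vector>

    int main() {
      const int kBlocks = 4;                      // plays gridDim.y
      std::vector<double> staging(kBlocks, 0.0);  // plays block_data_ptr
      std::atomic<int> flag{0};                   // plays flag_ptr[blockIdx.x]
      double result = 0.0;

      auto block = [&](int block_id, double partial_sum) {
        staging[block_id] = partial_sum;  // write block data to "global memory"
        // atomicAdd returns the old value; the last block sees kBlocks - 1
        int old = flag.fetch_add(1, std::memory_order_acq_rel);
        if (old == kBlocks - 1) {
          double total = 0.0;
          for (double v : staging) total += v;  // only the last block reduces
          result = total;
        }
      };

      std::vector<std::thread> threads;
      for (int b = 0; b < kBlocks; ++b) threads.emplace_back(block, b, b + 1.0);
      for (auto &t : threads) t.join();
      std::printf("grid-wide sum = %f\n", result);  // 10
      return 0;
    }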
+
+template <typename T>
+static __global__ void BNForwardTraining2DChannelLastWriteRes(
+    const T *x,
+    const BatchNormParamType<T> *scale,
+    const BatchNormParamType<T> *bias,
+    const int C,
+    const int N,
+    const int HxW,
+    T *y,
+    BatchNormParamType<T> *compute_mean,
+    BatchNormParamType<T> *compute_inv_var) {
+  int outer_size = C;
+  int inner_size = N * HxW;
+
+  int outer_loop_stride = gridDim.x * blockDim.x;
+  int inner_loop_stride = gridDim.y * blockDim.y;
+
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size;
+       i += outer_loop_stride) {
+    BatchNormParamType<T> mean_val = compute_mean[i];
+    BatchNormParamType<T> inv_var_val = compute_inv_var[i];
+    BatchNormParamType<T> scale_val = scale[i];
+    BatchNormParamType<T> bias_val = bias[i];
+
+    for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size;
+         j += inner_loop_stride) {
+      const int index = j * outer_size + i;
+      BatchNormParamType<T> x_sub_mean =
+          static_cast<BatchNormParamType<T>>(x[index]) - mean_val;
+      y[index] = scale_val * x_sub_mean * inv_var_val + bias_val;
+    }
+  }
+}
+
+template <typename T, int BlockDim>
+static __global__ void BNForwardTraining2DCompStat(
+    const T *x,
+    const BatchNormParamType<T> *scale,
+    const BatchNormParamType<T> *bias,
+    const int C,
+    const int N,
+    const int HxW,
+    const double epsilon,
+    double exponentialAverageFactor,
+    T *y,
+    BatchNormParamType<T> *global_mean,
+    BatchNormParamType<T> *global_variance,
+    BatchNormParamType<T> *save_mean,
+    BatchNormParamType<T> *save_inv_variance,
+    BatchNormParamType<T> *compute_mean,
+    BatchNormParamType<T> *compute_inv_var,
+    BatchNormParamType<T> *block_data_ptr,
+    int *flag_ptr) {
+  int outer_size = C;
+  int inner_size = N * HxW;
+
+  __shared__ BatchNormParamType<T> smem_sum[BlockDim];
+  __shared__ BatchNormParamType<T> smem_square_sum[BlockDim];
+
+  int outer_loop_stride = gridDim.y * blockDim.y;
+  int inner_loop_stride = gridDim.x * blockDim.x;
+
+  for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < outer_size;
+       i += outer_loop_stride) {
+    BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
+    BatchNormParamType<T> x_square_sum = static_cast<BatchNormParamType<T>>(0);
+
+    for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size;
+         j += inner_loop_stride) {
+      const int index = (j / HxW * C + i) * HxW + j % HxW;
+      BatchNormParamType<T> x_i = static_cast<BatchNormParamType<T>>(x[index]);
+      x_sum += x_i;
+      x_square_sum += x_i * x_i;
+    }
+
+    // horizonal block sum
+    merge_block_horizonal<T>(x_sum,
+                             x_square_sum,
+                             &smem_sum[0],
+                             &smem_square_sum[0],
+                             &x_sum,
+                             &x_square_sum);
+
+    if (gridDim.x > 1) {
+      volatile BatchNormParamType<T> *staging_sum = block_data_ptr;
+      volatile BatchNormParamType<T> *staging_square_sum =
+          &block_data_ptr[C * gridDim.x];
+      // write block data to global memory
+      if (threadIdx.x == 0) {
+        staging_sum[i + blockIdx.x * C] = x_sum;
+        staging_square_sum[i + blockIdx.x * C] = x_square_sum;
+      }
+
+      // make sure write is visible to all blocks
+      __threadfence();
+      __syncthreads();
+
+      __shared__ bool is_last_block_done;
+      // mark block done
+      if (threadIdx.x == 0 && threadIdx.y == 0) {
+        int old = atomicAdd(&flag_ptr[blockIdx.y], 1);
+        is_last_block_done = (old == (gridDim.x - 1));
+      }
+
+      __syncthreads();
+
+      if (is_last_block_done) {
+        x_sum = static_cast<BatchNormParamType<T>>(0);
+        x_square_sum = static_cast<BatchNormParamType<T>>(0);
+        // thread sum
+        for (int x = threadIdx.x; x < gridDim.x; x += blockDim.x) {
+          x_sum += staging_sum[i + x * C];
+          x_square_sum += staging_square_sum[i + x * C];
+        }
+
+        // horizonal block sum
+        merge_block_horizonal<T>(x_sum,
+                                 x_square_sum,
+                                 &smem_sum[0],
+                                 &smem_square_sum[0],
+                                 &x_sum,
+                                 &x_square_sum);
+
+        // final compute
+        if (threadIdx.x == 0) {
+          BatchNormParamType<T> compute_mean_val = x_sum / inner_size;
+          BatchNormParamType<T> variance_val =
+              x_square_sum / inner_size - compute_mean_val * compute_mean_val;
+          BatchNormParamType<T> compute_inv_var_val =
+              1 / sqrt(variance_val + epsilon);
+
+          if (save_mean && save_inv_variance) {
+            save_mean[i] = compute_mean_val;
+            save_inv_variance[i] = compute_inv_var_val;
+          }
+          global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val +
+                           exponentialAverageFactor * global_mean[i];
+          global_variance[i] = (1 - exponentialAverageFactor) * variance_val +
+                               exponentialAverageFactor * global_variance[i];
+
+          compute_mean[i] = compute_mean_val;
+          compute_inv_var[i] = compute_inv_var_val;
+        }
+      }
+    } else {
+      if (blockIdx.x == 0 && threadIdx.x == 0) {
+        BatchNormParamType<T> compute_mean_val = x_sum / inner_size;
+        BatchNormParamType<T> variance_val =
+            x_square_sum / inner_size - compute_mean_val * compute_mean_val;
+        BatchNormParamType<T> compute_inv_var_val =
+            1 / sqrt(variance_val + epsilon);
+
+        if (save_mean && save_inv_variance) {
+          save_mean[i] = compute_mean_val;
+          save_inv_variance[i] = compute_inv_var_val;
+        }
+        global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val +
+                         exponentialAverageFactor * global_mean[i];
+        global_variance[i] = (1 - exponentialAverageFactor) * variance_val +
+                             exponentialAverageFactor * global_variance[i];
+
+        compute_mean[i] = compute_mean_val;
+        compute_inv_var[i] = compute_inv_var_val;
+      }
+    }
+  }
+}
+
+template <typename T>
+static __global__ void BNForwardTraining2DWriteRes(
+    const T *x,
+    const BatchNormParamType<T> *scale,
+    const BatchNormParamType<T> *bias,
+    const int C,
+    const int N,
+    const int HxW,
+    T *y,
+    BatchNormParamType<T> *compute_mean,
+    BatchNormParamType<T> *compute_inv_var) {
+  int outer_size = C;
+  int inner_size = N * HxW;
+
+  int outer_loop_stride = gridDim.y * blockDim.y;
+  int inner_loop_stride = gridDim.x * blockDim.x;
+
+  for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < outer_size;
+       i += outer_loop_stride) {
+    BatchNormParamType<T> mean_val = compute_mean[i];
+    BatchNormParamType<T> inv_var_val = compute_inv_var[i];
+    BatchNormParamType<T> scale_val = scale[i];
+    BatchNormParamType<T> bias_val = bias[i];
+
+    for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size;
+         j += inner_loop_stride) {
+      const int index = (j / HxW * C + i) * HxW + j % HxW;
+      BatchNormParamType<T> x_sub_mean =
+          static_cast<BatchNormParamType<T>>(x[index]) - mean_val;
+      y[index] = scale_val * x_sub_mean * inv_var_val + bias_val;
+    }
+  }
+}
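The NCHW pair above differs from the channel-last pair only in the address computation: element j of channel i lives at j * C + i for channel-last data, and at (j / HxW * C + i) * HxW + j % HxW for NCHW data. A small self-contained check that the NCHW formula visits every element of channel i exactly once (sizes are arbitrary):

    #include <cassert>
    #include <cstdio>
    #include <vector>

    int main() {
      const int N = 3, C = 4, HxW = 5;
      const int i = 2;  // channel under inspection
      std::vector<int> hits(N * C * HxW, 0);
      // Same index formula as BNForwardTraining2DCompStat / 2DWriteRes.
      for (int j = 0; j < N * HxW; ++j) {
        int index = (j / HxW * C + i) * HxW + j % HxW;
        hits[index]++;
      }
      // Every element whose channel is i must be visited exactly once.
      for (int n = 0; n < N; ++n)
        for (int c = 0; c < C; ++c)
          for (int s = 0; s < HxW; ++s)
            assert(hits[(n * C + c) * HxW + s] == (c == i ? 1 : 0));
      std::printf("NCHW channel indexing covers %d elements of channel %d\n",
                  N * HxW, i);
      return 0;
    }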
+
 template <typename T, typename Context>
 void BatchNormKernel(const Context &ctx,
                      const DenseTensor &x,
@@ -515,17 +908,63 @@ void BatchNormKernel(const Context &ctx,
 //         static_cast<void *>(saved_variance->template mutable_data<
 //             BatchNormParamType<T>>(ctx.GetPlace()))));
 #else
     // CUDNN PER_ACTIVATION mode only support small batch size
     const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070;
+    const size_t CUDNN_SPATIAL_THRESHOLD = 880801;
     const bool use_native_kernel =
-        (x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD);
+        ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
+         (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD));
     if (use_native_kernel) {
-      const int block = 512;
-      const int max_threads = ctx.GetMaxPhysicalThreadCount();
-      const int max_blocks = std::max(max_threads / block, 1);
-      const int grid = std::min(C, max_blocks);
-      if (compute_format == DataLayout::kNCHW) {
-        BNForwardTraining<T, block, DataLayout::kNCHW>
+      dim3 block;
+      dim3 grid;
+      const int block_size = 512;
+      const int MAX_GRID_SIZE = 128;
+      const int WARP_SIZE = 32;
+
+      // init intermediate storage
+      DenseTensor block_data_tensor;
+      DenseTensor flag_tensor;
+      DenseTensor compute_mean_tensor =
+          phi::Empty<BatchNormParamType<T>, Context>(ctx, {C});
+      DenseTensor compute_inv_var_tensor =
+          phi::Empty<BatchNormParamType<T>, Context>(ctx, {C});
+      BatchNormParamType<T> *block_data_ptr = nullptr;
+      int *flag_ptr = nullptr;
+
+      if (x_dims.size() != 2 && compute_format == DataLayout::kNCHW) {
+        // init block&grid config
+        int block_x =
+            std::min(phi::funcs::details::GetLastPow2(H * W * D), block_size);
+        int block_y = std::min(phi::funcs::details::GetLastPow2(C),
+                               block_size / block_x);
+        if (block_x * block_y != block_size) {
+          block_x =
+              std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16),
+                       block_size / block_y);
+        }
+        int grid_x =
+            std::min((N * H * W * D + block_x * 16 - 1) / (block_x * 16),
+                     MAX_GRID_SIZE);
+        int grid_y = (C + block_y - 1) / block_y;
+
+        block.x = block_x;
+        block.y = block_y;
+        grid.x = grid_x;
+        grid.y = grid_y;
+
+        if (grid.x > 1) {
+          block_data_tensor = phi::Empty<BatchNormParamType<T>, Context>(
+              ctx, {2 * C * grid.x});
+          flag_tensor = phi::Empty<int, Context>(ctx, {grid.y});
+
+          block_data_ptr = block_data_tensor.data<BatchNormParamType<T>>();
+          flag_ptr = flag_tensor.data<int>();
+          funcs::SetConstant<Context, int> set_zero;
+          set_zero(ctx, &flag_tensor, static_cast<int>(0));
+        }
+        BNForwardTraining2DCompStat<T, block_size>
             <<<grid, block, 0, ctx.stream()>>>(
                 transformed_x.template data<T>(),
                 scale.template data<BatchNormParamType<T>>(),
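The new launch configuration keeps the block at block_size = 512 threads, caps the grid's reduction dimension at MAX_GRID_SIZE = 128, and budgets roughly 16 elements per thread before grid-striding; GetLastPow2 keeps the block extents at powers of two, which the shared-memory halving reduction assumes. A host-side restatement of the NCHW sizing arithmetic (GetLastPow2 here is a local stand-in for phi::funcs::details::GetLastPow2; the sizes in main() are made up):

    #include <algorithm>
    #include <cstdio>

    // Stand-in for phi::funcs::details::GetLastPow2: largest power of two <= n.
    static int GetLastPow2(int n) {
      int p = 1;
      while (p * 2 <= n) p *= 2;
      return p;
    }

    int main() {
      const int block_size = 512, MAX_GRID_SIZE = 128;
      int N = 1000000, C = 4, H = 4, W = 1, D = 1;  // example sizes only

      // Same arithmetic as the NCHW branch added in BatchNormKernel.
      int block_x = std::min(GetLastPow2(H * W * D), block_size);
      int block_y = std::min(GetLastPow2(C), block_size / block_x);
      if (block_x * block_y != block_size) {
        block_x =
            std::min(GetLastPow2(N * H * W * D / 16), block_size / block_y);
      }
      int grid_x = std::min((N * H * W * D + block_x * 16 - 1) / (block_x * 16),
                            MAX_GRID_SIZE);
      int grid_y = (C + block_y - 1) / block_y;

      std::printf("block = (%d, %d), grid = (%d, %d)\n",
                  block_x, block_y, grid_x, grid_y);
      return 0;
    }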
@@ -539,9 +978,54 @@ void BatchNormKernel(const Context &ctx,
                 mean_out->template data<BatchNormParamType<T>>(),
                 variance_out->template data<BatchNormParamType<T>>(),
                 saved_mean->template data<BatchNormParamType<T>>(),
-                saved_variance->template data<BatchNormParamType<T>>());
+                saved_variance->template data<BatchNormParamType<T>>(),
+                compute_mean_tensor.data<BatchNormParamType<T>>(),
+                compute_inv_var_tensor.data<BatchNormParamType<T>>(),
+                block_data_ptr,
+                flag_ptr);
+
+        BNForwardTraining2DWriteRes<T><<<grid, block, 0, ctx.stream()>>>(
+            transformed_x.template data<T>(),
+            scale.template data<BatchNormParamType<T>>(),
+            bias.template data<BatchNormParamType<T>>(),
+            C,
+            N,
+            H * W * D,
+            transformed_y.template data<T>(),
+            compute_mean_tensor.data<BatchNormParamType<T>>(),
+            compute_inv_var_tensor.data<BatchNormParamType<T>>());
       } else {
-        BNForwardTraining<T, block, DataLayout::kNHWC>
+        // init block&grid config
+        int block_x =
+            std::min(phi::funcs::details::GetLastPow2(C), WARP_SIZE);
+        int block_y =
+            std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16),
+                     block_size / block_x);
+        if (block_x * block_y != block_size) {
+          block_x = std::min(phi::funcs::details::GetLastPow2(C),
+                             block_size / block_y);
+        }
+        int grid_x = (C + block_x - 1) / block_x;
+        int grid_y =
+            std::min((N * H * W * D + block_y * 16 - 1) / (block_y * 16),
+                     MAX_GRID_SIZE);
+
+        block.x = block_x;
+        block.y = block_y;
+        grid.x = grid_x;
+        grid.y = grid_y;
+
+        if (grid.y > 1) {
+          block_data_tensor = phi::Empty<BatchNormParamType<T>, Context>(
+              ctx, {2 * C * grid.y});
+          flag_tensor = phi::Empty<int, Context>(ctx, {grid.x});
+
+          block_data_ptr = block_data_tensor.data<BatchNormParamType<T>>();
+          flag_ptr = flag_tensor.data<int>();
+          funcs::SetConstant<Context, int> set_zero;
+          set_zero(ctx, &flag_tensor, static_cast<int>(0));
+        }
+        BNForwardTraining2DChannelLastCompStat<T, block_size>
             <<<grid, block, 0, ctx.stream()>>>(
                 transformed_x.template data<T>(),
                 scale.template data<BatchNormParamType<T>>(),
@@ -555,7 +1039,23 @@ void BatchNormKernel(const Context &ctx,
                 mean_out->template data<BatchNormParamType<T>>(),
                 variance_out->template data<BatchNormParamType<T>>(),
                 saved_mean->template data<BatchNormParamType<T>>(),
-                saved_variance->template data<BatchNormParamType<T>>());
+                saved_variance->template data<BatchNormParamType<T>>(),
+                compute_mean_tensor.data<BatchNormParamType<T>>(),
+                compute_inv_var_tensor.data<BatchNormParamType<T>>(),
+                block_data_ptr,
+                flag_ptr);
+
+        BNForwardTraining2DChannelLastWriteRes<T>
+            <<<grid, block, 0, ctx.stream()>>>(
+                transformed_x.template data<T>(),
+                scale.template data<BatchNormParamType<T>>(),
+                bias.template data<BatchNormParamType<T>>(),
+                C,
+                N,
+                H * W * D,
+                transformed_y.template data<T>(),
+                compute_mean_tensor.data<BatchNormParamType<T>>(),
+                compute_inv_var_tensor.data<BatchNormParamType<T>>());
       }
     } else {
 #if CUDNN_VERSION_MIN(7, 4, 1)
python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
@@ -82,50 +82,58 @@ class TestBatchNorm(unittest.TestCase):
         self.assertRaises(ValueError, error2d_dataformat)
         self.assertRaises(ValueError, error3d_dataformat)
 
-    def test_eager_api(self):
-        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            shape = [4, 10, 4, 4]
+    def test_large_batch(self):
 
-            def compute_v1(x):
+        def compute_baseline(x):
             with fluid.dygraph.guard(p):
                 bn = fluid.dygraph.BatchNorm(shape[1])
                 #bn = paddle.nn.BatchNorm2D(shape[1])
                 x1 = paddle.to_tensor(x)
                 x1.stop_gradient = False
                 y = bn(x1)
                 y.backward()
                 return y.numpy(), x1.gradient()
 
-            def compute_v2(x):
+        def compute_1d(x):
             with fluid.dygraph.guard(p):
                 with _test_eager_guard():
-                    print("v2")
-                    bn = paddle.nn.BatchNorm2D(shape[1])
+                    bn = paddle.nn.BatchNorm1D(shape[1])
                     x1 = paddle.to_tensor(x)
                     x1.stop_gradient = False
                     y = bn(x1)
                     y.backward()
                     return y.numpy(), x1.gradient()
 
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            # [N, C]
+            shape = [200000, 4]
+            x = np.random.randn(*shape).astype("float32")
+            y1, g1 = compute_baseline(x)
+            y2, g2 = compute_1d(x)
+            self.assertTrue(np.allclose(g1, g2))
+            self.assertTrue(np.allclose(y1, y2))
+
+            # [N, C, L]
+            shape = [1000000, 4, 4]
             x = np.random.randn(*shape).astype("float32")
-            y1, g1 = compute_v1(x)
-            y2, g2 = compute_v2(x)
+            y1, g1 = compute_baseline(x)
+            y2, g2 = compute_1d(x)
             self.assertTrue(np.allclose(g1, g2))
             self.assertTrue(np.allclose(y1, y2))
 
-    def test_eager_api_1d(self):
+    def test_eager_api(self):
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
         for p in places:
-            shape = [200000, 4]
+            shape = [4, 10, 4, 4]
 
             def compute_v1(x):
                 with fluid.dygraph.guard(p):
                     bn = fluid.dygraph.BatchNorm(shape[1])
                     #bn = paddle.nn.BatchNorm2D(shape[1])
                     x1 = paddle.to_tensor(x)
                     x1.stop_gradient = False
                     y = bn(x1)
@@ -135,7 +143,8 @@ class TestBatchNorm(unittest.TestCase):
             def compute_v2(x):
                 with fluid.dygraph.guard(p):
                     with _test_eager_guard():
-                        bn = paddle.nn.BatchNorm1D(shape[1])
+                        print("v2")
+                        bn = paddle.nn.BatchNorm2D(shape[1])
                         x1 = paddle.to_tensor(x)
                         x1.stop_gradient = False
                         y = bn(x1)