Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
5e813b53
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
5e813b53
编写于
12月 01, 2019
作者:
J
Jie Fang
提交者:
gongweibao
12月 01, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
nhwc optimization for batchnorm (#21090)
上级
fce24315
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
461 addition
and
77 deletion
+461
-77
paddle/fluid/framework/grad_op_desc_maker.h
paddle/fluid/framework/grad_op_desc_maker.h
+4
-0
paddle/fluid/imperative/dygraph_grad_maker.h
paddle/fluid/imperative/dygraph_grad_maker.h
+6
-0
paddle/fluid/operators/batch_norm_op.cc
paddle/fluid/operators/batch_norm_op.cc
+41
-19
paddle/fluid/operators/batch_norm_op.cu
paddle/fluid/operators/batch_norm_op.cu
+249
-35
paddle/fluid/operators/batch_norm_op.h
paddle/fluid/operators/batch_norm_op.h
+93
-6
paddle/fluid/platform/dynload/cudnn.cc
paddle/fluid/platform/dynload/cudnn.cc
+4
-0
paddle/fluid/platform/dynload/cudnn.h
paddle/fluid/platform/dynload/cudnn.h
+9
-0
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+24
-10
python/paddle/fluid/tests/unittests/test_batch_norm_op.py
python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+31
-7
未找到文件。
paddle/fluid/framework/grad_op_desc_maker.h
浏览文件 @
5e813b53
...
...
@@ -141,6 +141,10 @@ class GradOpDescMakerBase {
return
(
fwd_op_
.
Inputs
().
count
(
name
)
>
0
);
}
bool
HasOutput
(
const
std
::
string
&
name
)
const
{
return
(
fwd_op_
.
Outputs
().
count
(
name
)
>
0
);
}
private:
const
OpDesc
&
fwd_op_
;
const
std
::
unordered_set
<
std
::
string
>&
no_grad_set_
;
...
...
paddle/fluid/imperative/dygraph_grad_maker.h
浏览文件 @
5e813b53
...
...
@@ -107,6 +107,12 @@ class GradOpBaseMakerBase {
return
it
!=
var_base_map_in_
.
end
();
}
bool
HasOutput
(
const
std
::
string
name
)
const
{
auto
it
=
var_base_map_out_
.
find
(
name
);
return
it
!=
var_base_map_out_
.
end
();
}
private:
std
::
vector
<
std
::
shared_ptr
<
VarBase
>>
GetVarBaseList
(
const
std
::
string
&
name
,
bool
is_grad
,
...
...
paddle/fluid/operators/batch_norm_op.cc
浏览文件 @
5e813b53
...
...
@@ -25,27 +25,42 @@ namespace paddle {
namespace
operators
{
void
BatchNormOp
::
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) of ConvOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Scale"
),
"Input(Scale) of ConvOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Bias"
),
"Input(Bias) of ConvOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Mean"
),
"Input(Mean) of ConvOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Variance"
),
"Input(Variance) of ConvOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Y"
),
"Output(Y) of ConvOp should not be null."
);
PADDLE_ENFORCE_EQ
(
ctx
->
HasInput
(
"X"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Input(X) of BatchNormOp should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasInput
(
"Scale"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Input(Scale) of BatchNormOp should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasInput
(
"Bias"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Input(Bias) of BatchNormOp should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasInput
(
"Mean"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Input(Mean) of BatchNormOp should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasInput
(
"Variance"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Input(Variance) of BatchNormOp should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasOutput
(
"Y"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Output(Y) of BatchNormOp should not be null."
));
bool
is_test
=
ctx
->
Attrs
().
Get
<
bool
>
(
"is_test"
);
if
(
!
is_test
)
{
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"MeanOut"
),
"Output(MeanOut) of ConvOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"VarianceOut"
),
"Output(VarianceOut) of ConvOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"SavedMean"
),
"Output(SavedMean) of ConvOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"SavedVariance"
),
"Output(SavedVariance) of ConvOp should not be null."
);
PADDLE_ENFORCE_EQ
(
ctx
->
HasOutput
(
"MeanOut"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Output(MeanOut) of BatchNormOp should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasOutput
(
"VarianceOut"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Output(VarianceOut) of BatchNormOp should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasOutput
(
"SavedMean"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Output(SavedMean) of BatchNormOp should not be null."
));
PADDLE_ENFORCE_EQ
(
ctx
->
HasOutput
(
"SavedVariance"
),
true
,
platform
::
errors
::
InvalidArgument
(
"Output(SavedVariance) of BatchNormOp should not be null."
));
}
// make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
...
...
@@ -200,6 +215,10 @@ void BatchNormOpMaker::Make() {
"Variance of the current mini batch, "
"will apply to output when training"
)
.
AsIntermediate
();
AddOutput
(
"ReserveSpace"
,
"Reserve GPU space for triggering the new semi-persistent "
"NHWC kernel"
)
.
AsDispensable
();
AddAttr
<
bool
>
(
"use_mkldnn"
,
"(bool, default false) Only used in mkldnn kernel"
)
.
SetDefault
(
false
);
...
...
@@ -643,6 +662,9 @@ std::unique_ptr<T> BatchNormGradMaker<T>::Apply() const {
op
->
SetInput
(
"Bias"
,
this
->
Input
(
"Bias"
));
op
->
SetInput
(
"SavedMean"
,
this
->
Output
(
"SavedMean"
));
op
->
SetInput
(
"SavedVariance"
,
this
->
Output
(
"SavedVariance"
));
if
(
this
->
HasOutput
(
"ReserveSpace"
))
{
op
->
SetInput
(
"ReserveSpace"
,
this
->
Output
(
"ReserveSpace"
));
}
// used when setting use_global_stats True during training
if
(
boost
::
get
<
bool
>
(
this
->
GetAttr
(
"use_global_stats"
)))
{
...
...
paddle/fluid/operators/batch_norm_op.cu
浏览文件 @
5e813b53
...
...
@@ -56,12 +56,39 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
const
auto
&
x_dims
=
x
->
dims
();
PADDLE_ENFORCE
(
x_dims
.
size
()
>=
2
&&
x_dims
.
size
()
<=
5
,
"The Input dim size should be between 2 and 5"
);
int
N
,
C
,
H
,
W
,
D
;
ExtractNCWHD
(
x_dims
,
data_layout
,
&
N
,
&
C
,
&
H
,
&
W
,
&
D
);
auto
*
y
=
ctx
.
Output
<
Tensor
>
(
"Y"
);
y
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
int
N
,
C
,
H
,
W
,
D
;
ExtractNCWHD
(
x_dims
,
data_layout
,
&
N
,
&
C
,
&
H
,
&
W
,
&
D
);
auto
dtype
=
platform
::
CudnnDataType
<
T
>::
type
;
const
bool
fast_nhwc_batch_norm
=
is_test
||
(
dtype
==
CUDNN_DATA_HALF
&&
FLAGS_cudnn_batchnorm_spatial_persistent
);
auto
compute_format
=
fast_nhwc_batch_norm
&&
data_layout
==
DataLayout
::
kNHWC
?
DataLayout
::
kNHWC
:
DataLayout
::
kNCHW
;
Tensor
transformed_x
(
x
->
type
());
Tensor
transformed_y
(
y
->
type
());
if
(
data_layout
==
DataLayout
::
kNHWC
&&
compute_format
==
DataLayout
::
kNCHW
&&
x_dims
.
size
()
>
2
)
{
VLOG
(
3
)
<<
"Transform input tensor from NHWC to NCHW."
;
ResizeToChannelFirst
<
platform
::
CUDADeviceContext
,
T
>
(
ctx
,
x
,
&
transformed_x
);
TransToChannelFirst
<
platform
::
CUDADeviceContext
,
T
>
(
ctx
,
x
,
&
transformed_x
);
ResizeToChannelFirst
<
platform
::
CUDADeviceContext
,
T
>
(
ctx
,
y
,
&
transformed_y
);
}
else
{
transformed_x
.
ShareDataWith
(
*
x
);
transformed_y
.
ShareDataWith
(
*
y
);
}
// ------------------- cudnn descriptors ---------------------
cudnnTensorDescriptor_t
data_desc_
;
cudnnTensorDescriptor_t
bn_param_desc_
;
...
...
@@ -90,7 +117,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
VLOG
(
3
)
<<
"Setting descriptors."
;
std
::
vector
<
int
>
dims
;
std
::
vector
<
int
>
strides
;
if
(
data_layou
t
==
DataLayout
::
kNCHW
)
{
if
(
compute_forma
t
==
DataLayout
::
kNCHW
)
{
dims
=
{
N
,
C
,
H
,
W
,
D
};
strides
=
{
C
*
H
*
W
*
D
,
H
*
W
*
D
,
W
*
D
,
D
,
1
};
}
else
{
...
...
@@ -126,8 +153,9 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
handle
,
// Note: PERSISTENT not implemented for inference
CUDNN_BATCHNORM_SPATIAL
,
CudnnDataType
<
T
>::
kOne
(),
CudnnDataType
<
T
>::
kZero
(),
data_desc_
,
x
->
template
data
<
T
>(),
data_desc_
,
y
->
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
CudnnDataType
<
T
>::
kZero
(),
data_desc_
,
transformed_x
.
template
data
<
T
>(),
data_desc_
,
transformed_y
.
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
bn_param_desc_
,
scale
->
template
data
<
BatchNormParamType
<
T
>
>
(),
bias
->
template
data
<
BatchNormParamType
<
T
>
>
(),
est_mean
->
template
data
<
BatchNormParamType
<
T
>
>
(),
...
...
@@ -167,23 +195,102 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
}
else
{
double
this_factor
=
1.
-
momentum
;
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnBatchNormalizationForwardTraining
(
handle
,
mode_
,
CudnnDataType
<
T
>::
kOne
(),
CudnnDataType
<
T
>::
kZero
(),
data_desc_
,
x
->
template
data
<
T
>(),
data_desc_
,
y
->
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
bn_param_desc_
,
scale
->
template
data
<
BatchNormParamType
<
T
>
>
(),
bias
->
template
data
<
BatchNormParamType
<
T
>
>
(),
this_factor
,
mean_out
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
variance_out
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
epsilon
,
saved_mean
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
saved_variance
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
())));
bool
called
=
false
;
#if CUDNN_VERSION_MIN(7, 4, 1)
if
(
compute_format
==
DataLayout
::
kNHWC
)
{
called
=
true
;
size_t
workspace_size
=
0
;
size_t
reserve_space_size
=
0
;
void
*
reserve_space_ptr
=
nullptr
;
void
*
workspace_ptr
=
nullptr
;
Tensor
workspace_tensor
;
// Create reserve space and workspace for batch norm.
// Create tensor for each batchnorm op, it will be used in the
// backward. Thus this tensor shouldn't be temp.
auto
*
reserve_space
=
ctx
.
Output
<
Tensor
>
(
"ReserveSpace"
);
PADDLE_ENFORCE_NOT_NULL
(
reserve_space
,
platform
::
errors
::
NotFound
(
"The argument ReserveSpace of batch_norm op is not found."
));
// --------------- cudnn batchnorm workspace ---------------
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize
(
/*handle=*/
handle
,
/*mode=*/
mode_
,
/*bnIps=*/
CUDNN_BATCHNORM_OPS_BN
,
/*xDesc=*/
data_desc_
,
/*zDesc=*/
nullptr
,
/*yDesc=*/
data_desc_
,
/*bnScaleBiasMeanVarDesc=*/
bn_param_desc_
,
/*activationDesc=*/
nullptr
,
/*sizeInBytes=*/
&
workspace_size
));
// -------------- cudnn batchnorm reserve space --------------
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetBatchNormalizationTrainingExReserveSpaceSize
(
/*handle=*/
handle
,
/*mode=*/
mode_
,
/*bnOps=*/
CUDNN_BATCHNORM_OPS_BN
,
/*activationDesc=*/
nullptr
,
/*xDesc=*/
data_desc_
,
/*sizeInBytes=*/
&
reserve_space_size
));
reserve_space_ptr
=
reserve_space
->
mutable_data
(
ctx
.
GetPlace
(),
transformed_x
.
type
(),
reserve_space_size
);
workspace_ptr
=
workspace_tensor
.
mutable_data
(
ctx
.
GetPlace
(),
transformed_x
.
type
(),
workspace_size
);
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnBatchNormalizationForwardTrainingEx
(
handle
,
mode_
,
CUDNN_BATCHNORM_OPS_BN
,
CudnnDataType
<
T
>::
kOne
(),
CudnnDataType
<
T
>::
kZero
(),
data_desc_
,
transformed_x
.
template
data
<
T
>(),
nullptr
,
nullptr
,
data_desc_
,
transformed_y
.
template
data
<
T
>(),
bn_param_desc_
,
scale
->
template
data
<
BatchNormParamType
<
T
>
>
(),
bias
->
template
data
<
BatchNormParamType
<
T
>
>
(),
this_factor
,
mean_out
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
variance_out
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
epsilon
,
saved_mean
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
saved_variance
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
nullptr
,
workspace_ptr
,
workspace_size
,
reserve_space_ptr
,
reserve_space_size
));
}
#endif
if
(
!
called
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnBatchNormalizationForwardTraining
(
handle
,
mode_
,
CudnnDataType
<
T
>::
kOne
(),
CudnnDataType
<
T
>::
kZero
(),
data_desc_
,
transformed_x
.
template
data
<
T
>(),
data_desc_
,
transformed_y
.
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
bn_param_desc_
,
scale
->
template
data
<
BatchNormParamType
<
T
>
>
(),
bias
->
template
data
<
BatchNormParamType
<
T
>
>
(),
this_factor
,
mean_out
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
variance_out
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
epsilon
,
saved_mean
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
saved_variance
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
())));
}
}
}
if
(
data_layout
==
DataLayout
::
kNHWC
&&
compute_format
==
DataLayout
::
kNCHW
&&
x_dims
.
size
()
>
2
)
{
VLOG
(
3
)
<<
"Transform batchnorm output from NCHW to NHWC"
;
TransToChannelLast
<
paddle
::
platform
::
CUDADeviceContext
,
T
>
(
ctx
,
&
transformed_y
,
y
);
}
// clean when exit.
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnDestroyTensorDescriptor
(
data_desc_
));
CUDNN_ENFORCE
(
...
...
@@ -337,9 +444,41 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
PADDLE_ENFORCE_EQ
(
scale
->
dims
().
size
(),
1UL
);
PADDLE_ENFORCE_EQ
(
scale
->
dims
()[
0
],
C
);
auto
dtype
=
platform
::
CudnnDataType
<
T
>::
type
;
const
auto
*
reserve_space
=
ctx
.
Input
<
Tensor
>
(
"ReserveSpace"
);
const
bool
fast_nhwc_batch_norm
=
dtype
==
CUDNN_DATA_HALF
&&
FLAGS_cudnn_batchnorm_spatial_persistent
&&
reserve_space
!=
nullptr
;
auto
compute_format
=
fast_nhwc_batch_norm
&&
data_layout
==
DataLayout
::
kNHWC
?
DataLayout
::
kNHWC
:
DataLayout
::
kNCHW
;
Tensor
transformed_x
(
x
->
type
());
Tensor
transformed_d_y
(
d_y
->
type
());
Tensor
transformed_d_x
(
d_x
->
type
());
if
(
data_layout
==
DataLayout
::
kNHWC
&&
compute_format
==
DataLayout
::
kNCHW
)
{
VLOG
(
3
)
<<
"Transform input tensor from NHWC to NCHW."
;
ResizeToChannelFirst
<
platform
::
CUDADeviceContext
,
T
>
(
ctx
,
x
,
&
transformed_x
);
TransToChannelFirst
<
platform
::
CUDADeviceContext
,
T
>
(
ctx
,
x
,
&
transformed_x
);
ResizeToChannelFirst
<
platform
::
CUDADeviceContext
,
T
>
(
ctx
,
d_y
,
&
transformed_d_y
);
TransToChannelFirst
<
platform
::
CUDADeviceContext
,
T
>
(
ctx
,
d_y
,
&
transformed_d_y
);
ResizeToChannelFirst
<
platform
::
CUDADeviceContext
,
T
>
(
ctx
,
d_x
,
&
transformed_d_x
);
}
else
{
transformed_x
.
ShareDataWith
(
*
x
);
transformed_d_y
.
ShareDataWith
(
*
d_y
);
transformed_d_x
.
ShareDataWith
(
*
d_x
);
}
std
::
vector
<
int
>
dims
;
std
::
vector
<
int
>
strides
;
if
(
data_layou
t
==
DataLayout
::
kNCHW
)
{
if
(
compute_forma
t
==
DataLayout
::
kNCHW
)
{
dims
=
{
N
,
C
,
H
,
W
,
D
};
strides
=
{
C
*
H
*
W
*
D
,
H
*
W
*
D
,
W
*
D
,
D
,
1
};
}
else
{
...
...
@@ -348,7 +487,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
}
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
const
int
num
=
x
->
numel
();
const
int
num
=
transformed_x
.
numel
();
const
int
block
=
512
;
int
max_threads
=
dev_ctx
.
GetMaxPhysicalThreadCount
();
const
int
max_blocks
=
std
::
max
(
max_threads
/
block
,
1
);
...
...
@@ -404,20 +543,95 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
saved_var
->
template
data
<
BatchNormParamType
<
T
>
>
();
if
(
d_scale
&&
d_bias
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnBatchNormalizationBackward
(
dev_ctx
.
cudnn_handle
(),
mode_
,
CudnnDataType
<
T
>::
kOne
(),
CudnnDataType
<
T
>::
kZero
(),
CudnnDataType
<
T
>::
kOne
(),
CudnnDataType
<
T
>::
kZero
(),
data_desc_
,
x
->
template
data
<
T
>(),
data_desc_
,
d_y
->
template
data
<
T
>(),
data_desc_
,
d_x
->
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
bn_param_desc_
,
scale
->
template
data
<
BatchNormParamType
<
T
>
>
(),
d_scale
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
d_bias
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
epsilon
,
saved_mean_data
,
saved_var_data
));
bool
called
=
false
;
#if CUDNN_VERSION_MIN(7, 4, 1)
if
(
compute_format
==
DataLayout
::
kNHWC
)
{
called
=
true
;
size_t
workspace_size
=
0
;
void
*
workspace_ptr
=
nullptr
;
Tensor
workspace_tensor
;
auto
reserve_space_size
=
reserve_space
->
memory_size
();
// --------------- cudnn batchnorm workspace ---------------
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnGetBatchNormalizationBackwardExWorkspaceSize
(
/*handle=*/
dev_ctx
.
cudnn_handle
(),
/*mode=*/
mode_
,
/*bnIps=*/
CUDNN_BATCHNORM_OPS_BN
,
/*xDesc=*/
data_desc_
,
/*yDesc=*/
data_desc_
,
/*dyDesc=*/
data_desc_
,
/*dzDesc=*/
nullptr
,
/*dxDesc=*/
data_desc_
,
/*bnScaleBiasMeanVarDesc=*/
bn_param_desc_
,
/*activationDesc=*/
nullptr
,
/*sizeInBytes=*/
&
workspace_size
));
workspace_ptr
=
workspace_tensor
.
mutable_data
(
ctx
.
GetPlace
(),
transformed_x
.
type
(),
workspace_size
);
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnBatchNormalizationBackwardEx
(
/*handle=*/
dev_ctx
.
cudnn_handle
(),
/*mode=*/
mode_
,
/*bnOps=*/
CUDNN_BATCHNORM_OPS_BN
,
/*alphaDataDiff=*/
CudnnDataType
<
T
>::
kOne
(),
/*betaDataDiff=*/
CudnnDataType
<
T
>::
kZero
(),
/*alphaParamDiff=*/
CudnnDataType
<
T
>::
kOne
(),
/*betaParamDiff=*/
CudnnDataType
<
T
>::
kZero
(),
/*xDesc=*/
data_desc_
,
/*xData=*/
transformed_x
.
template
data
<
T
>(),
/*yDesc=*/
nullptr
,
/*yData=*/
nullptr
,
/*dyDesc=*/
data_desc_
,
/*dyData=*/
transformed_d_y
.
template
data
<
T
>(),
/*dzDesc=*/
nullptr
,
/*dzData=*/
nullptr
,
/*dxDesc=*/
data_desc_
,
/*dxData=*/
transformed_d_x
.
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
/*dBnScaleBiasDesc=*/
bn_param_desc_
,
/*bnScaleData=*/
scale
->
template
data
<
BatchNormParamType
<
T
>
>
(),
/*bnBiasData=*/
nullptr
,
/*dBnScaleData=*/
d_scale
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
/*dBnBiasData=*/
d_bias
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
/*epsilon=*/
epsilon
,
/*savedMean=*/
saved_mean_data
,
/*savedInvVariance=*/
saved_var_data
,
/*activationDesc=*/
nullptr
,
/*workspace=*/
workspace_ptr
,
/*workSpaceSizeInBytes=*/
workspace_size
,
/*reserveSpace=*/
const_cast
<
T
*>
(
reserve_space
->
template
data
<
T
>()),
/*reserveSpaceSizeInBytes=*/
reserve_space_size
));
}
#endif
if
(
!
called
)
{
CUDNN_ENFORCE
(
platform
::
dynload
::
cudnnBatchNormalizationBackward
(
dev_ctx
.
cudnn_handle
(),
mode_
,
CudnnDataType
<
T
>::
kOne
(),
CudnnDataType
<
T
>::
kZero
(),
CudnnDataType
<
T
>::
kOne
(),
CudnnDataType
<
T
>::
kZero
(),
data_desc_
,
transformed_x
.
template
data
<
T
>(),
data_desc_
,
transformed_d_y
.
template
data
<
T
>(),
data_desc_
,
transformed_d_x
.
template
mutable_data
<
T
>(
ctx
.
GetPlace
()),
bn_param_desc_
,
scale
->
template
data
<
BatchNormParamType
<
T
>
>
(),
d_scale
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
d_bias
->
template
mutable_data
<
BatchNormParamType
<
T
>
>
(
ctx
.
GetPlace
()),
epsilon
,
saved_mean_data
,
saved_var_data
));
}
if
(
data_layout
==
DataLayout
::
kNHWC
&&
compute_format
==
DataLayout
::
kNCHW
)
{
VLOG
(
3
)
<<
"Transform batchnorm output from NCHW to NHWC"
;
TransToChannelLast
<
paddle
::
platform
::
CUDADeviceContext
,
T
>
(
ctx
,
&
transformed_d_x
,
d_x
);
}
}
else
{
if
(
data_layout
==
framework
::
DataLayout
::
kNCHW
)
{
if
(
compute_format
==
DataLayout
::
kNCHW
)
{
if
(
d_x
)
{
BNBackwardData
<
T
,
block
,
framework
::
DataLayout
::
kNCHW
><<<
grid2
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
...
...
@@ -450,7 +664,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
const
auto
*
running_var_data
=
running_var
->
template
data
<
BatchNormParamType
<
T
>
>
();
if
(
data_layout
==
framework
::
DataLayout
::
kNCHW
)
{
if
(
compute_format
==
DataLayout
::
kNCHW
)
{
if
(
d_x
)
{
KeBNBackwardData
<
T
,
framework
::
DataLayout
::
kNCHW
><<<
grid1
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
...
...
paddle/fluid/operators/batch_norm_op.h
浏览文件 @
5e813b53
...
...
@@ -16,8 +16,10 @@ limitations under the License. */
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/norm_utils.h"
namespace
paddle
{
...
...
@@ -39,24 +41,109 @@ template <typename T>
using
ConstEigenVectorArrayMap
=
Eigen
::
Map
<
const
Eigen
::
Array
<
T
,
Eigen
::
Dynamic
,
1
>>
;
template
<
typename
DeviceContext
,
typename
T
>
inline
void
ResizeToChannelFirst
(
const
framework
::
ExecutionContext
&
context
,
const
Tensor
*
input
,
Tensor
*
transformed_input
)
{
int
dim
=
input
->
dims
().
size
()
-
2
;
if
(
dim
==
3
)
{
// input
transformed_input
->
Resize
(
input
->
dims
());
auto
in_dims_vec
=
framework
::
vectorize
(
input
->
dims
());
in_dims_vec
[
1
]
=
input
->
dims
()[
4
];
in_dims_vec
[
2
]
=
input
->
dims
()[
1
];
in_dims_vec
[
3
]
=
input
->
dims
()[
2
];
in_dims_vec
[
4
]
=
input
->
dims
()[
3
];
transformed_input
->
Resize
(
framework
::
make_ddim
(
in_dims_vec
));
transformed_input
->
mutable_data
<
T
>
(
context
.
GetPlace
());
}
else
if
(
dim
==
2
)
{
// input
transformed_input
->
Resize
(
input
->
dims
());
auto
in_dims_vec
=
framework
::
vectorize
(
input
->
dims
());
in_dims_vec
[
1
]
=
input
->
dims
()[
3
];
in_dims_vec
[
2
]
=
input
->
dims
()[
1
];
in_dims_vec
[
3
]
=
input
->
dims
()[
2
];
transformed_input
->
Resize
(
framework
::
make_ddim
(
in_dims_vec
));
transformed_input
->
mutable_data
<
T
>
(
context
.
GetPlace
());
}
else
if
(
dim
==
1
)
{
transformed_input
->
Resize
(
input
->
dims
());
auto
in_dims_vec
=
framework
::
vectorize
(
input
->
dims
());
in_dims_vec
[
1
]
=
input
->
dims
()[
2
];
in_dims_vec
[
2
]
=
input
->
dims
()[
1
];
transformed_input
->
Resize
(
framework
::
make_ddim
(
in_dims_vec
));
transformed_input
->
mutable_data
<
T
>
(
context
.
GetPlace
());
}
}
template
<
typename
DeviceContext
,
typename
T
>
inline
void
TransToChannelFirst
(
const
framework
::
ExecutionContext
&
context
,
const
Tensor
*
input
,
Tensor
*
transformed_input
)
{
int
dim
=
input
->
dims
().
size
()
-
2
;
if
(
dim
==
3
)
{
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
std
::
vector
<
int
>
axis
{
0
,
4
,
1
,
2
,
3
};
math
::
Transpose
<
DeviceContext
,
T
,
5
>
trans5
;
trans5
(
dev_ctx
,
*
input
,
transformed_input
,
axis
);
}
else
if
(
dim
==
2
)
{
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
std
::
vector
<
int
>
axis
{
0
,
3
,
1
,
2
};
math
::
Transpose
<
DeviceContext
,
T
,
4
>
trans4
;
trans4
(
dev_ctx
,
*
input
,
transformed_input
,
axis
);
}
else
if
(
dim
==
1
)
{
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
std
::
vector
<
int
>
axis
{
0
,
2
,
1
};
math
::
Transpose
<
DeviceContext
,
T
,
3
>
trans3
;
trans3
(
dev_ctx
,
*
input
,
transformed_input
,
axis
);
}
}
template
<
typename
DeviceContext
,
typename
T
>
inline
void
TransToChannelLast
(
const
framework
::
ExecutionContext
&
context
,
const
Tensor
*
input
,
Tensor
*
transformed_input
)
{
int
dim
=
input
->
dims
().
size
()
-
2
;
if
(
dim
==
3
)
{
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
std
::
vector
<
int
>
axis
{
0
,
2
,
3
,
4
,
1
};
math
::
Transpose
<
DeviceContext
,
T
,
5
>
trans5
;
trans5
(
dev_ctx
,
*
input
,
transformed_input
,
axis
);
}
else
if
(
dim
==
2
)
{
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
std
::
vector
<
int
>
axis
{
0
,
2
,
3
,
1
};
math
::
Transpose
<
DeviceContext
,
T
,
4
>
trans4
;
trans4
(
dev_ctx
,
*
input
,
transformed_input
,
axis
);
}
else
if
(
dim
==
1
)
{
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
std
::
vector
<
int
>
axis
{
0
,
2
,
1
};
math
::
Transpose
<
DeviceContext
,
T
,
3
>
trans3
;
trans3
(
dev_ctx
,
*
input
,
transformed_input
,
axis
);
}
}
class
BatchNormOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
;
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
;
const
framework
::
ExecutionContext
&
ctx
)
const
override
;
};
class
BatchNormGradOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
;
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
;
const
framework
::
ExecutionContext
&
ctx
)
const
override
;
};
class
BatchNormOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
...
...
@@ -85,13 +172,13 @@ class BatchNormOpInferVarType
template
<
typename
DeviceContext
,
typename
T
>
class
BatchNormKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
;
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
;
};
template
<
typename
DeviceContext
,
typename
T
>
class
BatchNormGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
;
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
;
};
}
// namespace operators
...
...
paddle/fluid/platform/dynload/cudnn.cc
浏览文件 @
5e813b53
...
...
@@ -46,6 +46,10 @@ CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R7
(
DEFINE_WRAP
);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7
CUDNN_DNN_ROUTINE_EACH_AFTER_R7
(
DEFINE_WRAP
);
#endif
#ifdef PADDLE_USE_DSO
bool
HasCUDNN
()
{
std
::
call_once
(
cudnn_dso_flag
,
...
...
paddle/fluid/platform/dynload/cudnn.h
浏览文件 @
5e813b53
...
...
@@ -189,6 +189,15 @@ CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_R7
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
#endif
#if CUDNN_VERSION >= 7401
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \
__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \
__macro(cudnnBatchNormalizationForwardTrainingEx); \
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
#endif
}
// namespace dynload
}
// namespace platform
}
// namespace paddle
python/paddle/fluid/layers/nn.py
浏览文件 @
5e813b53
...
...
@@ -2523,6 +2523,13 @@ def batch_norm(input,
check_type_and_dtype(input, 'input', Variable,
['float16', 'float32', 'float64'], 'batch_norm')
dtype = helper.input_dtype()
has_reserve_space = False
if data_layout == 'NHWC':
flag = os.environ.get('FLAGS_cudnn_batchnorm_spatial_persistent')
if flag is not None and flag.lower() in ['true', '1']:
has_reserve_space = True
# use fp32 for bn parameter
if dtype == core.VarDesc.VarType.FP16:
dtype = core.VarDesc.VarType.FP32
...
...
@@ -2577,6 +2584,11 @@ def batch_norm(input,
saved_variance = helper.create_variable_for_type_inference(
dtype=dtype, stop_gradient=True)
reserve_space = None
if has_reserve_space:
reserve_space = helper.create_variable_for_type_inference(
dtype=core.VarDesc.VarType.FP16, stop_gradient=True)
batch_norm_out = input if in_place else helper.create_variable_for_type_inference(
dtype)
...
...
@@ -2599,17 +2611,19 @@ def batch_norm(input,
inputs['MomemtumTensor'] = momentum
else:
attrs['momentum'] = momentum
outputs = {
"Y": batch_norm_out,
"MeanOut": mean_out,
"VarianceOut": variance_out,
"SavedMean": saved_mean,
"SavedVariance": saved_variance
}
if reserve_space is not None:
outputs["ReserveSpace"] = reserve_space
helper.append_op(
type="batch_norm",
inputs=inputs,
outputs={
"Y": batch_norm_out,
"MeanOut": mean_out,
"VarianceOut": variance_out,
"SavedMean": saved_mean,
"SavedVariance": saved_variance
},
attrs=attrs)
type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
return helper.append_activation(batch_norm_out)
...
...
python/paddle/fluid/tests/unittests/test_batch_norm_op.py
浏览文件 @
5e813b53
...
...
@@ -14,6 +14,7 @@
from
__future__
import
print_function
import
os
import
unittest
import
numpy
as
np
import
paddle.fluid.core
as
core
...
...
@@ -413,16 +414,28 @@ class TestBatchNormOpTraining(unittest.TestCase):
inputs
[
'MomentumTensor'
]
=
block
.
var
(
'momentum_var'
)
else
:
attrs
[
'momentum'
]
=
momentum
outputs
=
{
"Y"
:
block
.
var
(
'y'
),
"MeanOut"
:
block
.
var
(
'mean'
),
# share memory
"VarianceOut"
:
block
.
var
(
'variance'
),
# share memory
"SavedMean"
:
block
.
var
(
'saved_mean'
),
"SavedVariance"
:
block
.
var
(
'saved_variance'
)
}
has_reserve_space
=
False
if
data_format
==
'NHWC'
:
flag
=
os
.
environ
.
get
(
'FLAGS_cudnn_batchnorm_spatial_persistent'
)
if
flag
is
not
None
and
flag
.
lower
()
in
[
'true'
,
'1'
]:
has_reserve_space
=
True
if
has_reserve_space
:
block
.
create_var
(
name
=
"reserve_space"
,
dtype
=
'float16'
)
outputs
[
"ReserveSpace"
]
=
block
.
var
(
'reserve_space'
)
del
os
.
environ
[
'FLAGS_cudnn_batchnorm_spatial_persistent'
]
bn_op
=
block
.
append_op
(
type
=
"batch_norm"
,
inputs
=
inputs
,
outputs
=
{
"Y"
:
block
.
var
(
'y'
),
"MeanOut"
:
block
.
var
(
'mean'
),
# share memory
"VarianceOut"
:
block
.
var
(
'variance'
),
# share memory
"SavedMean"
:
block
.
var
(
'saved_mean'
),
"SavedVariance"
:
block
.
var
(
'saved_variance'
)
},
outputs
=
outputs
,
attrs
=
attrs
)
block
.
create_var
(
name
=
'y@GRAD'
,
dtype
=
'float32'
,
shape
=
y
.
shape
)
...
...
@@ -479,6 +492,17 @@ class TestBatchNormOpTrainingCase1(TestBatchNormOpTraining):
self
.
fetch_list
=
[
'y'
,
'mean'
,
'variance'
,
'x@GRAD'
]
class
TestBatchNormOpTrainingCase2
(
TestBatchNormOpTraining
):
def
init_test_case
(
self
):
self
.
use_global_stats
=
False
self
.
no_grad_set
=
set
()
self
.
fetch_list
=
[
'y'
,
'mean'
,
'variance'
,
'saved_mean'
,
'saved_variance'
,
'x@GRAD'
,
'scale@GRAD'
,
'bias@GRAD'
]
os
.
environ
[
'FLAGS_cudnn_batchnorm_spatial_persistent'
]
=
"1"
class
TestBatchNormOpTrainingMomentumVariable
(
TestBatchNormOpTraining
):
def
init_test_case
(
self
):
self
.
use_momentum_variable
=
True
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录