BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit cc272afb (unverified)
Authored May 26, 2022 by YuanRisheng; committed via GitHub on May 26, 2022
[Phi]Refactor InstanceNormKernel and InstanceNormGradKernel (#42978)
* move instance_norm
* change mutable_data
* fix compile bugs
Parent commit: 8f7f3ac9
Showing 10 changed files with 987 additions and 604 deletions (+987 −604)
paddle/fluid/operators/instance_norm_op.cc              +0    -220
paddle/fluid/operators/instance_norm_op.cu              +0    -384
paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc     +146  -0
paddle/phi/kernels/cpu/instance_norm_kernel.cc          +126  -0
paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu     +319  -0
paddle/phi/kernels/gpu/instance_norm_kernel.cu          +221  -0
paddle/phi/kernels/gpu/instance_norm_utils.h            +73   -0
paddle/phi/kernels/instance_norm_grad_kernel.h          +33   -0
paddle/phi/kernels/instance_norm_kernel.h               +31   -0
paddle/phi/ops/compat/instance_norm_sig.cc              +38   -0
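
For orientation before the diffs: every kernel touched by this commit computes the same instance normalization; only the framework plumbing moves from fluid to phi. Statistics are taken per sample n and per channel c over the spatial positions only, and the SavedMean/SavedVariance outputs hold the per-(n, c) mean and the inverse standard deviation (the CPU kernel stores saved_variance_arr.sqrt().inverse()). Written out in LaTeX from those code paths:

\mu_{n,c} = \frac{1}{m}\sum_{i=1}^{m} x_{n,c,i}, \qquad
\hat\sigma_{n,c} = \Bigl(\frac{1}{m}\sum_{i=1}^{m}(x_{n,c,i}-\mu_{n,c})^2 + \epsilon\Bigr)^{-1/2}, \qquad
y_{n,c,i} = \hat\sigma_{n,c}\,(x_{n,c,i}-\mu_{n,c})\,\gamma_c + \beta_c

where m is the per-channel sample_size (H*W or H*W*D) and \gamma, \beta are the optional Scale and Bias inputs, defaulted to 1 and 0 when absent.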
paddle/fluid/operators/instance_norm_op.cc
@@ -170,104 +170,6 @@ NCHW `[batch, in_channels, in_height, in_width]`
)DOC"
);
}
template
<
typename
T
>
class
InstanceNormKernel
<
platform
::
CPUDeviceContext
,
T
>
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
T
epsilon
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"epsilon"
));
const
auto
*
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
const
auto
&
x_dims
=
x
->
dims
();
const
int
N
=
x_dims
[
0
];
const
int
C
=
x_dims
[
1
];
const
int
NxC
=
N
*
C
;
const
int
sample_size
=
x
->
numel
()
/
N
/
C
;
auto
*
y
=
ctx
.
Output
<
Tensor
>
(
"Y"
);
auto
*
saved_mean
=
ctx
.
Output
<
Tensor
>
(
"SavedMean"
);
auto
*
saved_variance
=
ctx
.
Output
<
Tensor
>
(
"SavedVariance"
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CPUDeviceContext
>();
auto
*
place
=
dev_ctx
.
eigen_device
();
Eigen
::
DSizes
<
int
,
2
>
shape
(
NxC
,
sample_size
);
// Once eigen on Windows is updated, the if branch can be removed.
#ifndef EIGEN_HAS_INDEX_LIST
Eigen
::
DSizes
<
int
,
2
>
bcast
(
1
,
sample_size
);
Eigen
::
DSizes
<
int
,
2
>
C_shape
(
C
,
1
);
Eigen
::
DSizes
<
int
,
2
>
NxC_shape
(
NxC
,
1
);
Eigen
::
DSizes
<
int
,
1
>
rdims
(
1
);
#else
Eigen
::
IndexList
<
Eigen
::
type2index
<
1
>
,
int
>
bcast
;
bcast
.
set
(
1
,
sample_size
);
Eigen
::
IndexList
<
int
,
Eigen
::
type2index
<
1
>>
C_shape
;
C_shape
.
set
(
0
,
C
);
Eigen
::
IndexList
<
int
,
Eigen
::
type2index
<
1
>>
NxC_shape
;
NxC_shape
.
set
(
0
,
NxC
);
Eigen
::
IndexList
<
Eigen
::
type2index
<
1
>>
rdims
;
#endif
phi
::
funcs
::
SetConstant
<
platform
::
CPUDeviceContext
,
T
>
set_constant
;
saved_mean
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
saved_variance
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
set_constant
(
dev_ctx
,
saved_mean
,
static_cast
<
T
>
(
0
));
set_constant
(
dev_ctx
,
saved_variance
,
static_cast
<
T
>
(
0
));
auto
saved_mean_a
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
saved_mean
);
auto
saved_mean_e
=
saved_mean_a
.
reshape
(
NxC_shape
);
auto
saved_variance_a
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
saved_variance
);
auto
saved_variance_e
=
saved_variance_a
.
reshape
(
NxC_shape
);
auto
x_e
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
x
);
auto
x_arr
=
x_e
.
reshape
(
shape
);
saved_mean_e
.
device
(
*
place
)
=
x_arr
.
mean
(
rdims
);
auto
saved_variance_arr
=
(
x_arr
-
saved_mean_e
.
broadcast
(
bcast
)).
square
().
mean
(
rdims
)
+
epsilon
;
saved_variance_e
.
device
(
*
place
)
=
saved_variance_arr
.
sqrt
().
inverse
();
const
auto
*
scale
=
ctx
.
Input
<
Tensor
>
(
"Scale"
);
const
auto
*
bias
=
ctx
.
Input
<
Tensor
>
(
"Bias"
);
Tensor
scale_data
;
Tensor
bias_data
;
if
(
!
scale
)
{
scale_data
.
mutable_data
<
T
>
({
C
},
ctx
.
GetPlace
());
set_constant
(
dev_ctx
,
&
scale_data
,
static_cast
<
T
>
(
1
));
}
if
(
!
bias
)
{
bias_data
.
mutable_data
<
T
>
({
C
},
ctx
.
GetPlace
());
set_constant
(
dev_ctx
,
&
bias_data
,
static_cast
<
T
>
(
0
));
}
auto
scale_e
=
scale
?
framework
::
EigenVector
<
T
>::
Flatten
(
*
scale
)
:
framework
::
EigenVector
<
T
>::
Flatten
(
const_cast
<
const
framework
::
Tensor
&>
(
scale_data
));
auto
scale_arr
=
scale_e
.
reshape
(
C_shape
);
auto
bias_e
=
bias
?
framework
::
EigenVector
<
T
>::
Flatten
(
*
bias
)
:
framework
::
EigenVector
<
T
>::
Flatten
(
const_cast
<
const
framework
::
Tensor
&>
(
bias_data
));
auto
bias_arr
=
bias_e
.
reshape
(
C_shape
);
y
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
y_e
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
y
);
auto
y_arr
=
y_e
.
reshape
(
shape
);
// (x - mean) * inv_std * scale + bias
Eigen
::
DSizes
<
int
,
2
>
bcast_param
(
N
,
sample_size
);
y_arr
.
device
(
*
place
)
=
(
x_arr
-
saved_mean_e
.
broadcast
(
bcast
))
*
saved_variance_e
.
broadcast
(
bcast
)
*
scale_arr
.
broadcast
(
bcast_param
)
+
bias_arr
.
broadcast
(
bcast_param
);
}
};
void
InstanceNormGradOp
::
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
{
OP_INOUT_CHECK
(
ctx
->
HasInput
(
"X"
),
"Input"
,
"X"
,
"InstanceNormGrad"
);
OP_INOUT_CHECK
(
ctx
->
HasInput
(
framework
::
GradVarName
(
"Y"
)),
"Input"
,
@@ -312,120 +214,6 @@ framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType(
      OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}

template <typename T>
class InstanceNormGradKernel<platform::CPUDeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const auto *x = ctx.Input<Tensor>("X");
    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
    const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");

    const auto &x_dims = x->dims();

    const int N = x_dims[0];
    const int C = x_dims[1];
    const int NxC = N * C;
    const int sample_size = x->numel() / N / C;

    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
    d_x->mutable_data<T>(ctx.GetPlace());

    auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
    auto *place = dev_ctx.eigen_device();

    Eigen::DSizes<int, 2> rshape(NxC, sample_size);
    Eigen::DSizes<int, 2> param_shape(N, C);
    Eigen::DSizes<int, 2> shape(NxC, sample_size);
#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::DSizes<int, 1> rdims(0);
    Eigen::DSizes<int, 1> mean_rdims(1);
    Eigen::DSizes<int, 2> bcast(1, sample_size);
    Eigen::DSizes<int, 2> C_shape(C, 1);
    Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
#else
    Eigen::IndexList<Eigen::type2index<0>> rdims;
    Eigen::IndexList<Eigen::type2index<1>> mean_rdims;
    Eigen::IndexList<Eigen::type2index<1>, int> bcast;
    bcast.set(1, sample_size);
    Eigen::IndexList<int, Eigen::type2index<1>> C_shape;
    C_shape.set(0, C);
    Eigen::IndexList<int, Eigen::type2index<1>> NxC_shape;
    NxC_shape.set(0, NxC);
#endif

    phi::funcs::SetConstant<platform::CPUDeviceContext, T> set_constant;

    Tensor scale_data;
    if (!scale) {
      scale_data.mutable_data<T>({C}, ctx.GetPlace());
      set_constant(dev_ctx, &scale_data, static_cast<T>(1));
    }

    auto scale_e = scale
                       ? framework::EigenVector<T>::Flatten(*scale)
                       : framework::EigenVector<T>::Flatten(
                             const_cast<const framework::Tensor &>(scale_data));
    auto mean_e = framework::EigenVector<T>::Flatten(*saved_mean);
    auto inv_var_e = framework::EigenVector<T>::Flatten(*saved_inv_variance);
    auto dy_e = framework::EigenVector<T>::Flatten(*d_y);
    auto x_e = framework::EigenVector<T>::Flatten(*x);

    auto scale_arr = scale_e.reshape(C_shape);
    auto mean_arr = mean_e.reshape(NxC_shape);
    auto inv_var_arr = inv_var_e.reshape(NxC_shape);
    auto dy_arr = dy_e.reshape(shape);
    auto x_arr = x_e.reshape(shape);

    auto tmp = (x_arr - mean_arr.eval().broadcast(bcast)) *
               inv_var_arr.eval().broadcast(bcast);

    // math: d_bias = np.sum(d_y, axis=(n,h,w))
    // math: d_scale = np.sum((X-mean) / inv_std * dy, axis=(n, h,w))
    if (d_scale && d_bias) {
      d_scale->mutable_data<T>(ctx.GetPlace());
      d_bias->mutable_data<T>(ctx.GetPlace());
      set_constant(dev_ctx, d_scale, static_cast<T>(0));
      set_constant(dev_ctx, d_bias, static_cast<T>(0));

      auto d_scale_e = framework::EigenVector<T>::Flatten(*d_scale);
      auto d_scale_data = d_scale_e.reshape(C_shape);
      auto d_bias_e = framework::EigenVector<T>::Flatten(*d_bias);
      auto d_bias_data = d_bias_e.reshape(C_shape);

      d_bias_data.device(*place) =
          dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims);
      d_scale_data.device(*place) =
          (tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims);
    }

    auto dy_mean =
        dy_arr.mean(mean_rdims).reshape(NxC_shape).eval().broadcast(bcast);

    Eigen::DSizes<int, 2> bcast_param(N, sample_size);
    set_constant(dev_ctx, d_x, static_cast<T>(0));
    // math: d_x = scale * inv_var * d_y - scale * inv_var * np.sum(d_y,
    // axis=(h,w))
    //             - scale * (X - mean) * inv_var.pow(3) * np.sum(d_y * (X -
    //             mean),
    //             axis=(h,w))
    auto dx_e = framework::EigenVector<T>::Flatten(*d_x);
    auto dx_arr = dx_e.reshape(shape);
    dx_arr.device(*place) = scale_arr.broadcast(bcast_param) *
                            inv_var_arr.broadcast(bcast) *
                            (dy_arr - dy_mean -
                             tmp *
                                 (dy_arr * tmp)
                                     .mean(mean_rdims)
                                     .reshape(NxC_shape)
                                     .eval()
                                     .broadcast(bcast));
  }
};

void InstanceNormDoubleGradOp::InferShape(
    framework::InferShapeContext *ctx) const {
  OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormDoubleGrad");
@@ -699,14 +487,6 @@ REGISTER_OPERATOR(instance_norm_grad, ops::InstanceNormGradOp,
REGISTER_OPERATOR(instance_norm_grad_grad, ops::InstanceNormDoubleGradOp,
                  ops::InstanceNormDoubleGradOpInplaceInferer);

REGISTER_OP_CPU_KERNEL(
    instance_norm,
    ops::InstanceNormKernel<paddle::platform::CPUDeviceContext, float>,
    ops::InstanceNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
    instance_norm_grad,
    ops::InstanceNormGradKernel<paddle::platform::CPUDeviceContext, float>,
    ops::InstanceNormGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
    instance_norm_grad_grad,
    ops::InstanceNormDoubleGradKernel<paddle::platform::CPUDeviceContext,
    ...
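
The backward formulas that the removed CPU gradient kernel encodes in its comments and Eigen expressions (and which the new phi kernels reimplement unchanged), with \hat x_{n,c,i} = (x_{n,c,i}-\mu_{n,c})\,\hat\sigma_{n,c} and g = \partial L/\partial y:

\frac{\partial L}{\partial \beta_c} = \sum_{n,i} g_{n,c,i}, \qquad
\frac{\partial L}{\partial \gamma_c} = \sum_{n,i} g_{n,c,i}\,\hat x_{n,c,i}

\frac{\partial L}{\partial x_{n,c,i}} = \gamma_c\,\hat\sigma_{n,c}\Bigl(g_{n,c,i} - \operatorname{mean}_i(g_{n,c,\cdot}) - \hat x_{n,c,i}\,\operatorname{mean}_i(g_{n,c,\cdot}\,\hat x_{n,c,\cdot})\Bigr)

This matches the dx_arr expression above: tmp is \hat x, dy_mean is the per-(n, c) mean of g, and the last term comes from (dy_arr * tmp).mean(mean_rdims).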
paddle/fluid/operators/instance_norm_op.cu
@@ -70,181 +70,6 @@ static __global__ void add_param(const T *input, T *output,
  }
}

template <typename T>
class InstanceNormKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx.GetPlace()), true,
        platform::errors::PreconditionNotMet("It must be CUDAPlace."));
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));

    auto *x = ctx.Input<Tensor>("X");
    auto &x_dims = x->dims();
    PADDLE_ENFORCE_GE(x_dims.size(), 2,
                      platform::errors::InvalidArgument(
                          "The `shape` in InstanceNormOp is invalid: "
                          "the size of X's dimensions must greater than "
                          "or equal to 2. But received: "
                          "the size of X's dimensions is [%d]",
                          x_dims.size()));
    PADDLE_ENFORCE_LE(x_dims.size(), 5,
                      platform::errors::InvalidArgument(
                          "The `shape` in InstanceNormOp is invalid: "
                          "the size of X's dimensions must smaller than"
                          "or equal to 5. But received: "
                          "the size of X's dimensions is [%d]",
                          x_dims.size()));
    int N, C, H, W, D;
    ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
    int NxC = N * C;
    Tensor x_tmp;
    x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D});

    auto *y = ctx.Output<Tensor>("Y");
    y->mutable_data<T>(ctx.GetPlace());

#ifdef PADDLE_WITH_HIP
    miopenTensorDescriptor_t data_desc_;
    miopenTensorDescriptor_t in_param_desc_;

    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else
    cudnnTensorDescriptor_t data_desc_;
    cudnnTensorDescriptor_t in_param_desc_;

    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif
    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
      LOG(ERROR) << "Provided epsilon is smaller than "
                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
                 << "CUDNN_BN_MIN_EPSILON instead.";
    }
    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);

    VLOG(3) << "Setting descriptors.";
    std::vector<int> dims;
    std::vector<int> strides;
    dims = {1, NxC, H, W, D};
    strides = {NxC * H * W * D, H * W * D, W * D, D, 1};

    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();

#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
        data_desc_, CudnnDataType<T>::type,
        x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
        const_cast<int *>(strides.data())));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenDeriveBNTensorDescriptor(
            in_param_desc_, data_desc_, miopenBNSpatial));
#else
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
        data_desc_, CudnnDataType<T>::type,
        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnDeriveBNTensorDescriptor(
            in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif

    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *bias = ctx.Input<Tensor>("Bias");

    Tensor scale_tmp =
        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
    scale_tmp.mutable_data<T>(ctx.GetPlace());
    Tensor bias_tmp =
        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
    bias_tmp.mutable_data<T>(ctx.GetPlace());

    const int n = x->numel();
    const int block = 512;
    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
    const int max_blocks = std::max(max_threads / block, 1);
    const int grid = std::min((NxC + block - 1) / block, max_blocks);

    phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_constant;
    if (scale) {
      repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
          scale->data<T>(), scale_tmp.data<T>(), N, C);
    } else {
      set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
    }
    if (bias) {
      repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
          bias->data<T>(), bias_tmp.data<T>(), N, C);
    } else {
      set_constant(dev_ctx, &bias_tmp, static_cast<T>(0));
    }

    auto handle = dev_ctx.cudnn_handle();

    phi::funcs::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
        functor;

    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
    functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
    functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));

#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenBatchNormalizationForwardTraining(
            handle, miopenBNSpatial,
            const_cast<void *>(
                static_cast<const void *>(CudnnDataType<T>::kOne())),
            const_cast<void *>(
                static_cast<const void *>(CudnnDataType<T>::kZero())),
            data_desc_, static_cast<const void *>(x_tmp.template data<T>()),
            data_desc_,
            static_cast<void *>(y->template mutable_data<T>(ctx.GetPlace())),
            in_param_desc_,
            const_cast<void *>(static_cast<const void *>(
                scale_tmp.template data<BatchNormParamType<T>>())),
            const_cast<void *>(static_cast<const void *>(
                bias_tmp.template data<BatchNormParamType<T>>())),
            0, nullptr, nullptr, epsilon,
            static_cast<void *>(
                saved_mean->template mutable_data<BatchNormParamType<T>>(
                    ctx.GetPlace())),
            static_cast<void *>(
                saved_variance->template mutable_data<BatchNormParamType<T>>(
                    ctx.GetPlace()))));

    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnBatchNormalizationForwardTraining(
            handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
            CudnnDataType<T>::kZero(), data_desc_, x_tmp.template data<T>(),
            data_desc_, y->template mutable_data<T>(ctx.GetPlace()),
            in_param_desc_, scale_tmp.template data<BatchNormParamType<T>>(),
            bias_tmp.template data<BatchNormParamType<T>>(), 0, nullptr,
            nullptr, epsilon,
            saved_mean->template mutable_data<BatchNormParamType<T>>(
                ctx.GetPlace()),
            saved_variance->template mutable_data<BatchNormParamType<T>>(
                ctx.GetPlace())));

    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif
  }
};

template <typename T, int BlockDim>
static __global__ void GradComputeDX(const T *dy,
                                     const BatchNormParamType<T> *scale,
@@ -297,203 +122,6 @@ static __global__ void GradComputeDX(const T *dy,
  }
}

template <typename T>
class InstanceNormGradKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx.GetPlace()), true,
        platform::errors::PreconditionNotMet("It must use CUDAPlace."));
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *x = ctx.Input<Tensor>("X");
    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));

    const auto &x_dims = x->dims();

    int N, C, H, W, D;
    ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
    int NxC = N * C;

    Tensor x_tmp, d_y_tmp;
    x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D});
    d_y_tmp.ShareDataWith(*d_y).Resize({1, NxC, H, W, D});

    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));

    d_x->mutable_data<T>(ctx.GetPlace());
    if (d_scale && d_bias) {
      d_scale->mutable_data<T>(ctx.GetPlace());
      d_bias->mutable_data<T>(ctx.GetPlace());
    }
    if (scale) {
      PADDLE_ENFORCE_EQ(
          scale->dims().size(), 1UL,
          platform::errors::InvalidArgument(
              "The `shape` in InstanceNormOp is invalid: "
              "the size of scale's dimensions must be equal to 1. But "
              "received: the size of scale's dimensions"
              "is [%d]",
              scale->dims().size()));
      PADDLE_ENFORCE_EQ(scale->dims()[0], C,
                        platform::errors::InvalidArgument(
                            "The `shape` in InstanceNormOp is invalid: "
                            "the first dimension of scale must be equal to "
                            "Channels([%d]). But received: "
                            "the first dimension of scale is [%d],"
                            "the dimensions of scale is [%s], ",
                            C, scale->dims()[0], scale->dims()));
    }

    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();

    phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_constant;

    const int n = x->numel();
    const int block = 512;
    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
    const int max_blocks = std::max(max_threads / block, 1);
    const int grid = std::min(NxC, max_blocks);
    const int grid1 = (C + block - 1) / block;

    Tensor scale_tmp =
        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
    scale_tmp.mutable_data<T>(ctx.GetPlace());
    Tensor d_scale_tmp =
        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
    Tensor d_bias_tmp =
        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
    if (scale) {
      repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
          scale->data<T>(), scale_tmp.data<T>(), N, C);
    } else {
      set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
    }

    std::vector<int> dims;
    std::vector<int> strides;
    dims = {1, NxC, H, W, D};
    strides = {NxC * H * W * D, H * W * D, W * D, D, 1};

    if ((H * W * D) == 1) {
      framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
      phi::funcs::SetConstant<platform::CUDADeviceContext,
                              BatchNormParamType<T>>
          functor;
      functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
      functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
      return;
    }

#ifdef PADDLE_WITH_HIP
    miopenTensorDescriptor_t data_desc_;
    miopenTensorDescriptor_t in_param_desc_;

    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else
    cudnnTensorDescriptor_t data_desc_;
    cudnnTensorDescriptor_t in_param_desc_;

    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif
    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
      LOG(ERROR) << "Provided epsilon is smaller than "
                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
                 << "CUDNN_BN_MIN_EPSILON instead.";
    }
    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);

#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
        data_desc_, CudnnDataType<T>::type,
        x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
        const_cast<int *>(strides.data())));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenDeriveBNTensorDescriptor(
            in_param_desc_, data_desc_, miopenBNSpatial));
#else
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
        data_desc_, CudnnDataType<T>::type,
        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnDeriveBNTensorDescriptor(
            in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif

    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
    const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
    const auto *saved_mean_data =
        saved_mean->template data<BatchNormParamType<T>>();
    const auto *saved_var_data =
        saved_var->template data<BatchNormParamType<T>>();
    if (d_scale && d_bias) {
#ifdef PADDLE_WITH_HIP
      PADDLE_ENFORCE_GPU_SUCCESS(
          platform::dynload::miopenBatchNormalizationBackward(
              dev_ctx.cudnn_handle(), miopenBNSpatial, CudnnDataType<T>::kOne(),
              CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
              CudnnDataType<T>::kZero(), data_desc_, x_tmp.template data<T>(),
              data_desc_, d_y_tmp.template data<T>(), data_desc_,
              d_x->template mutable_data<T>(ctx.GetPlace()), in_param_desc_,
              scale_tmp.template data<BatchNormParamType<T>>(),
              d_scale_tmp.template mutable_data<BatchNormParamType<T>>(
                  ctx.GetPlace()),
              d_bias_tmp.template mutable_data<BatchNormParamType<T>>(
                  ctx.GetPlace()),
              epsilon, saved_mean_data, saved_var_data));
#else
      PADDLE_ENFORCE_GPU_SUCCESS(
          platform::dynload::cudnnBatchNormalizationBackward(
              dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL,
              CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
              CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(), data_desc_,
              x_tmp.template data<T>(), data_desc_, d_y_tmp.template data<T>(),
              data_desc_, d_x->template mutable_data<T>(ctx.GetPlace()),
              in_param_desc_, scale_tmp.template data<BatchNormParamType<T>>(),
              d_scale_tmp.template mutable_data<BatchNormParamType<T>>(
                  ctx.GetPlace()),
              d_bias_tmp.template mutable_data<BatchNormParamType<T>>(
                  ctx.GetPlace()),
              epsilon, saved_mean_data, saved_var_data));
#endif
    } else {
      if (d_x) {
        GradComputeDX<T, block><<<NxC, block, 0, dev_ctx.stream()>>>(
            d_y->data<T>(), scale_tmp.data<BatchNormParamType<T>>(),
            saved_mean_data, x->data<T>(), saved_var_data, C, H * W * D,
            d_x->data<T>());
      }
    }

    if (d_scale && d_bias) {
      add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
          d_scale_tmp.data<T>(), d_scale->data<T>(), N, C);
      add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
          d_bias_tmp.data<T>(), d_bias->data<T>(), N, C);
    }

#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif
  }
};

static __device__ __forceinline__ float real_sqrt(float x) {
  return 1. / sqrtf(x);
}
@@ -793,22 +421,10 @@ namespace ops = paddle::operators;
namespace plat = paddle::platform;
#ifdef PADDLE_WITH_HIP
// MIOPEN do not support double
REGISTER_OP_CUDA_KERNEL(
    instance_norm, ops::InstanceNormKernel<plat::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
    instance_norm_grad,
    ops::InstanceNormGradKernel<plat::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
    instance_norm_grad_grad,
    ops::InstanceNormDoubleGradKernel<paddle::platform::CUDADeviceContext,
                                      float>);
#else
REGISTER_OP_CUDA_KERNEL(
    instance_norm, ops::InstanceNormKernel<plat::CUDADeviceContext, float>,
    ops::InstanceNormKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
    instance_norm_grad,
    ops::InstanceNormGradKernel<plat::CUDADeviceContext, float>,
    ops::InstanceNormGradKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
    instance_norm_grad_grad,
    ops::InstanceNormDoubleGradKernel<paddle::platform::CUDADeviceContext,
    ...
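
A note on the GPU strategy visible in the removed CUDA kernels (and kept by the new GPU phi kernels below): rather than a bespoke instance-norm implementation, X of shape (N, C, H, W[, D]) is reshaped to (1, N*C, H, W[, D]) and handed to cuDNN/MIOpen batch normalization in spatial mode, relying on the equivalence

\text{InstanceNorm}(x;\gamma,\beta)\ \text{over}\ (N, C, H, W)\ \equiv\ \text{BatchNorm}_{\text{spatial}}(\text{reshape}(x))\ \text{over}\ (1,\ N\cdot C,\ H,\ W)

with repeat_param tiling the length-C Scale/Bias to length N*C beforehand, and add_param folding the per-(n, c) parameter gradients back to length C afterwards.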
paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc
0 → 100644
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/instance_norm_grad_kernel.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/extensions.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {

template <typename T, typename Context>
void InstanceNormGradKernel(const Context& dev_ctx,
                            const DenseTensor& x,
                            const DenseTensor& d_y,
                            paddle::optional<const DenseTensor&> scale,
                            const DenseTensor& saved_mean,
                            const DenseTensor& saved_variance,
                            float epsilon,
                            DenseTensor* d_x,
                            DenseTensor* d_scale,
                            DenseTensor* d_bias) {
  const auto* scale_ptr = scale.get_ptr();

  const auto& x_dims = x.dims();

  const int N = x_dims[0];
  const int C = x_dims[1];
  const int NxC = N * C;
  const int sample_size = x.numel() / N / C;

  dev_ctx.template Alloc<T>(d_x);
  auto* place = dev_ctx.eigen_device();

  Eigen::DSizes<int, 2> rshape(NxC, sample_size);
  Eigen::DSizes<int, 2> param_shape(N, C);
  Eigen::DSizes<int, 2> shape(NxC, sample_size);
#ifndef EIGEN_HAS_INDEX_LIST
  Eigen::DSizes<int, 1> rdims(0);
  Eigen::DSizes<int, 1> mean_rdims(1);
  Eigen::DSizes<int, 2> bcast(1, sample_size);
  Eigen::DSizes<int, 2> C_shape(C, 1);
  Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
#else
  Eigen::IndexList<Eigen::type2index<0>> rdims;
  Eigen::IndexList<Eigen::type2index<1>> mean_rdims;
  Eigen::IndexList<Eigen::type2index<1>, int> bcast;
  bcast.set(1, sample_size);
  Eigen::IndexList<int, Eigen::type2index<1>> C_shape;
  C_shape.set(0, C);
  Eigen::IndexList<int, Eigen::type2index<1>> NxC_shape;
  NxC_shape.set(0, NxC);
#endif

  phi::funcs::SetConstant<CPUContext, T> set_constant;

  DenseTensor scale_data;
  if (!scale_ptr) {
    scale_data.Resize({C});
    dev_ctx.template Alloc<T>(&scale_data);
    set_constant(dev_ctx, &scale_data, static_cast<T>(1));
  }

  auto scale_e =
      scale_ptr
          ? EigenVector<T>::Flatten(*scale_ptr)
          : EigenVector<T>::Flatten(const_cast<const DenseTensor&>(scale_data));
  auto mean_e = EigenVector<T>::Flatten(saved_mean);
  auto inv_var_e = EigenVector<T>::Flatten(saved_variance);
  auto dy_e = EigenVector<T>::Flatten(d_y);
  auto x_e = EigenVector<T>::Flatten(x);

  auto scale_arr = scale_e.reshape(C_shape);
  auto mean_arr = mean_e.reshape(NxC_shape);
  auto inv_var_arr = inv_var_e.reshape(NxC_shape);
  auto dy_arr = dy_e.reshape(shape);
  auto x_arr = x_e.reshape(shape);

  auto tmp = (x_arr - mean_arr.eval().broadcast(bcast)) *
             inv_var_arr.eval().broadcast(bcast);

  // math: d_bias = np.sum(d_y, axis=(n,h,w))
  // math: d_scale = np.sum((X-mean) / inv_std * dy, axis=(n, h,w))
  if (d_scale && d_bias) {
    dev_ctx.template Alloc<T>(d_scale);
    dev_ctx.template Alloc<T>(d_bias);
    set_constant(dev_ctx, d_scale, static_cast<T>(0));
    set_constant(dev_ctx, d_bias, static_cast<T>(0));

    auto d_scale_e = EigenVector<T>::Flatten(*d_scale);
    auto d_scale_data = d_scale_e.reshape(C_shape);
    auto d_bias_e = EigenVector<T>::Flatten(*d_bias);
    auto d_bias_data = d_bias_e.reshape(C_shape);

    d_bias_data.device(*place) =
        dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims);
    d_scale_data.device(*place) =
        (tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims);
  }

  auto dy_mean =
      dy_arr.mean(mean_rdims).reshape(NxC_shape).eval().broadcast(bcast);

  Eigen::DSizes<int, 2> bcast_param(N, sample_size);
  set_constant(dev_ctx, d_x, static_cast<T>(0));
  // math: d_x = scale * inv_var * d_y - scale * inv_var * np.sum(d_y,
  // axis=(h,w))
  //             - scale * (X - mean) * inv_var.pow(3) * np.sum(d_y * (X -
  //             mean),
  //             axis=(h,w))
  auto dx_e = EigenVector<T>::Flatten(*d_x);
  auto dx_arr = dx_e.reshape(shape);
  dx_arr.device(*place) = scale_arr.broadcast(bcast_param) *
                          inv_var_arr.broadcast(bcast) *
                          (dy_arr - dy_mean -
                           tmp *
                               (dy_arr * tmp)
                                   .mean(mean_rdims)
                                   .reshape(NxC_shape)
                                   .eval()
                                   .broadcast(bcast));
}
}  // namespace phi

PD_REGISTER_KERNEL(instance_norm_grad,
                   CPU,
                   ALL_LAYOUT,
                   phi::InstanceNormGradKernel,
                   float,
                   double) {}
paddle/phi/kernels/cpu/instance_norm_kernel.cc
0 → 100644
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/instance_norm_kernel.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/funcs/eigen/extensions.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {

template <typename T, typename Context>
void InstanceNormKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        paddle::optional<const DenseTensor&> scale,
                        paddle::optional<const DenseTensor&> bias,
                        float epsilon_f,
                        DenseTensor* y,
                        DenseTensor* saved_mean,
                        DenseTensor* saved_variance) {
  const auto& x_dims = x.dims();
  T epsilon = static_cast<T>(epsilon_f);
  const int N = x_dims[0];
  const int C = x_dims[1];
  const int NxC = N * C;
  const int sample_size = x.numel() / N / C;
  auto* place = dev_ctx.eigen_device();

  Eigen::DSizes<int, 2> shape(NxC, sample_size);
// Once eigen on Windows is updated, the if branch can be removed.
#ifndef EIGEN_HAS_INDEX_LIST
  Eigen::DSizes<int, 2> bcast(1, sample_size);
  Eigen::DSizes<int, 2> C_shape(C, 1);
  Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
  Eigen::DSizes<int, 1> rdims(1);
#else
  Eigen::IndexList<Eigen::type2index<1>, int> bcast;
  bcast.set(1, sample_size);
  Eigen::IndexList<int, Eigen::type2index<1>> C_shape;
  C_shape.set(0, C);
  Eigen::IndexList<int, Eigen::type2index<1>> NxC_shape;
  NxC_shape.set(0, NxC);
  Eigen::IndexList<Eigen::type2index<1>> rdims;
#endif

  phi::funcs::SetConstant<CPUContext, T> set_constant;
  dev_ctx.template Alloc<T>(saved_mean);
  dev_ctx.template Alloc<T>(saved_variance);
  set_constant(dev_ctx, saved_mean, static_cast<T>(0));
  set_constant(dev_ctx, saved_variance, static_cast<T>(0));

  auto saved_mean_a = EigenVector<T>::Flatten(*saved_mean);
  auto saved_mean_e = saved_mean_a.reshape(NxC_shape);
  auto saved_variance_a = EigenVector<T>::Flatten(*saved_variance);
  auto saved_variance_e = saved_variance_a.reshape(NxC_shape);

  auto x_e = EigenVector<T>::Flatten(x);
  auto x_arr = x_e.reshape(shape);

  saved_mean_e.device(*place) = x_arr.mean(rdims);
  auto saved_variance_arr =
      (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon;

  saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse();

  const auto scale_ptr = scale.get_ptr();
  const auto bias_ptr = bias.get_ptr();

  DenseTensor scale_data;
  DenseTensor bias_data;

  if (!scale_ptr) {
    scale_data.Resize({C});
    dev_ctx.template Alloc<T>(&scale_data);
    set_constant(dev_ctx, &scale_data, static_cast<T>(1));
  }
  if (!bias_ptr) {
    bias_data.Resize({C});
    dev_ctx.template Alloc<T>(&bias_data);
    set_constant(dev_ctx, &bias_data, static_cast<T>(0));
  }
  auto scale_e =
      scale_ptr
          ? EigenVector<T>::Flatten(*scale_ptr)
          : EigenVector<T>::Flatten(const_cast<const DenseTensor&>(scale_data));
  auto scale_arr = scale_e.reshape(C_shape);
  auto bias_e =
      bias_ptr
          ? EigenVector<T>::Flatten(*bias_ptr)
          : EigenVector<T>::Flatten(const_cast<const DenseTensor&>(bias_data));
  auto bias_arr = bias_e.reshape(C_shape);

  dev_ctx.template Alloc<T>(y);
  auto y_e = EigenVector<T>::Flatten(*y);
  auto y_arr = y_e.reshape(shape);

  // (x - mean) * inv_std * scale + bias
  Eigen::DSizes<int, 2> bcast_param(N, sample_size);
  y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) *
                             saved_variance_e.broadcast(bcast) *
                             scale_arr.broadcast(bcast_param) +
                         bias_arr.broadcast(bcast_param);
}
}  // namespace phi

PD_REGISTER_KERNEL(
    instance_norm, CPU, ALL_LAYOUT, phi::InstanceNormKernel, float, double) {}
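
To make the Eigen expressions above easier to check, here is a minimal standalone C++ sketch of the same per-(n, c) computation, written against plain std::vector rather than the Paddle API; the function and variable names are illustrative only and not part of this codebase.

// instance_norm_ref: reference sketch of the forward pass computed by the
// phi CPU kernel above. x and y are NCHW-ordered buffers of length
// N * C * sample_size; saved_inv_std holds 1 / sqrt(var + epsilon), matching
// what the kernel stores in SavedVariance.
#include <cmath>
#include <cstdio>
#include <vector>

void instance_norm_ref(const std::vector<float>& x, int N, int C,
                       int sample_size, float epsilon,
                       const std::vector<float>& scale,  // size C
                       const std::vector<float>& bias,   // size C
                       std::vector<float>* y,            // size N*C*sample_size
                       std::vector<float>* saved_mean,   // size N*C
                       std::vector<float>* saved_inv_std) {  // size N*C
  for (int nc = 0; nc < N * C; ++nc) {
    const float* xs = x.data() + nc * sample_size;
    // mean over the spatial positions of this (n, c) slice
    float mean = 0.f;
    for (int i = 0; i < sample_size; ++i) mean += xs[i];
    mean /= sample_size;
    // biased variance, then the inverse standard deviation
    float var = 0.f;
    for (int i = 0; i < sample_size; ++i) var += (xs[i] - mean) * (xs[i] - mean);
    var /= sample_size;
    const float inv_std = 1.f / std::sqrt(var + epsilon);
    (*saved_mean)[nc] = mean;
    (*saved_inv_std)[nc] = inv_std;
    // y = (x - mean) * inv_std * scale + bias, scale/bias indexed by channel
    const int c = nc % C;
    float* ys = y->data() + nc * sample_size;
    for (int i = 0; i < sample_size; ++i)
      ys[i] = (xs[i] - mean) * inv_std * scale[c] + bias[c];
  }
}

int main() {
  // Tiny smoke test: N = 1, C = 1, one 2x2 spatial slice.
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f};
  std::vector<float> y(4), mean(1), inv_std(1);
  instance_norm_ref(x, 1, 1, 4, 1e-5f, {1.f}, {0.f}, &y, &mean, &inv_std);
  std::printf("mean=%f inv_std=%f y0=%f\n", mean[0], inv_std[0], y[0]);
  return 0;
}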
paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu
0 → 100644
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/instance_norm_grad_kernel.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/gpu/instance_norm_utils.h"
namespace phi {

template <typename T, int BlockDim>
static __global__ void GradComputeDX(const T *dy,
                                     const BatchNormParamType<T> *scale,
                                     const BatchNormParamType<T> *mean,
                                     const T *x,
                                     const BatchNormParamType<T> *variance,
                                     const int C,
                                     const int sample_size,
                                     T *dx) {
  int beg_idx = blockIdx.x * sample_size + threadIdx.x;
  int end_idx = (blockIdx.x + 1) * sample_size;
  int ncid = blockIdx.x;
  int c = ncid % C;

  BatchNormParamType<T> mean_val = mean[ncid];
  BatchNormParamType<T> inv_var_val = variance[ncid];

  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
  __shared__ typename BlockReduce::TempStorage dy_storage;
  __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage;
  __shared__ BatchNormParamType<T> dy_sum_val;
  __shared__ BatchNormParamType<T> dy_x_sub_mean_sum_val;

  BatchNormParamType<T> dy_sum = static_cast<BatchNormParamType<T>>(0);
  BatchNormParamType<T> dy_x_sub_mean_sum =
      static_cast<BatchNormParamType<T>>(0);

  for (int i = beg_idx; i < end_idx; i += BlockDim) {
    BatchNormParamType<T> dy_i = static_cast<BatchNormParamType<T>>(dy[i]);
    dy_sum += dy_i;
    dy_x_sub_mean_sum +=
        dy_i * (static_cast<BatchNormParamType<T>>(x[i]) - mean_val);
  }
  dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
  dy_x_sub_mean_sum =
      BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum());

  if (threadIdx.x == 0) {
    dy_sum_val = dy_sum;
    dy_x_sub_mean_sum_val = dy_x_sub_mean_sum;
  }
  __syncthreads();

  for (int i = beg_idx; i < end_idx; i += BlockDim) {
    dx[i] =
        (static_cast<BatchNormParamType<T>>(dy[i]) -
         dy_sum_val / static_cast<BatchNormParamType<T>>(sample_size) -
         (static_cast<BatchNormParamType<T>>(x[i]) - mean_val) *
             dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) *
        scale[c] * inv_var_val;
  }
}

template <typename T, typename Context>
void InstanceNormGradKernel(const Context &dev_ctx,
                            const DenseTensor &x,
                            const DenseTensor &d_y,
                            paddle::optional<const DenseTensor &> scale,
                            const DenseTensor &saved_mean,
                            const DenseTensor &saved_variance,
                            float epsilon_f,
                            DenseTensor *d_x,
                            DenseTensor *d_scale,
                            DenseTensor *d_bias) {
  double epsilon = static_cast<double>(epsilon_f);
  const auto *scale_ptr = scale.get_ptr();

  const auto &x_dims = x.dims();

  int N, C, H, W, D;
  paddle::operators::ExtractNCWHD(
      x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
  int NxC = N * C;

  DenseTensor x_tmp, d_y_tmp;
  x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D});
  d_y_tmp.ShareDataWith(d_y).Resize({1, NxC, H, W, D});

  dev_ctx.template Alloc<T>(d_x);
  if (d_scale && d_bias) {
    dev_ctx.template Alloc<T>(d_scale);
    dev_ctx.template Alloc<T>(d_bias);
  }
  if (scale_ptr) {
    PADDLE_ENFORCE_EQ(
        scale_ptr->dims().size(), 1UL,
        phi::errors::InvalidArgument(
            "The `shape` in InstanceNormOp is invalid: "
            "the size of scale's dimensions must be equal to 1. But "
            "received: the size of scale's dimensions"
            "is [%d]",
            scale_ptr->dims().size()));
    PADDLE_ENFORCE_EQ(scale_ptr->dims()[0], C,
                      phi::errors::InvalidArgument(
                          "The `shape` in InstanceNormOp is invalid: "
                          "the first dimension of scale must be equal to "
                          "Channels([%d]). But received: "
                          "the first dimension of scale is [%d],"
                          "the dimensions of scale is [%s], ",
                          C, scale_ptr->dims()[0], scale_ptr->dims()));
  }

  phi::funcs::SetConstant<GPUContext, T> set_constant;

  const int n = x.numel();
  const int block = 512;
  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
  const int max_blocks = std::max(max_threads / block, 1);
  const int grid = std::min(NxC, max_blocks);
  const int grid1 = (C + block - 1) / block;

  DenseTensor scale_tmp;
  scale_tmp.Resize({NxC});
  dev_ctx.template Alloc<T>(&scale_tmp);

  DenseTensor d_scale_tmp;
  d_scale_tmp.Resize({NxC});
  dev_ctx.template Alloc<T>(&d_scale_tmp);

  DenseTensor d_bias_tmp;
  d_bias_tmp.Resize({NxC});
  dev_ctx.template Alloc<T>(&d_bias_tmp);

  if (scale_ptr) {
    repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
        scale_ptr->data<T>(), scale_tmp.data<T>(), N, C);
  } else {
    set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
  }

  std::vector<int> dims;
  std::vector<int> strides;
  dims = {1, NxC, H, W, D};
  strides = {NxC * H * W * D, H * W * D, W * D, D, 1};

  if ((H * W * D) == 1) {
    phi::Copy(dev_ctx, d_y, dev_ctx.GetPlace(), false, d_x);
    phi::funcs::SetConstant<GPUContext, BatchNormParamType<T>> functor;
    functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
    functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
    return;
  }

#ifdef PADDLE_WITH_HIP
  miopenTensorDescriptor_t data_desc_;
  miopenTensorDescriptor_t in_param_desc_;

  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else
  cudnnTensorDescriptor_t data_desc_;
  cudnnTensorDescriptor_t in_param_desc_;

  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif
  if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
    LOG(ERROR) << "Provided epsilon is smaller than "
               << "CUDNN_BN_MIN_EPSILON. Setting it to "
               << "CUDNN_BN_MIN_EPSILON instead.";
  }
  epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);

#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenSetTensorDescriptor(
          data_desc_, CudnnDataType<T>::type,
          x_dims.size() > 3 ? x_dims.size() : 4,
          const_cast<int *>(dims.data()), const_cast<int *>(strides.data())));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenDeriveBNTensorDescriptor(
          in_param_desc_, data_desc_, miopenBNSpatial));
#else
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnSetTensorNdDescriptor(
          data_desc_, CudnnDataType<T>::type,
          x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
          in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif

  const auto *saved_mean_data =
      saved_mean.template data<BatchNormParamType<T>>();
  const auto *saved_var_data =
      saved_variance.template data<BatchNormParamType<T>>();
  if (d_scale && d_bias) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        paddle::platform::dynload::miopenBatchNormalizationBackward(
            dev_ctx.cudnn_handle(), miopenBNSpatial, CudnnDataType<T>::kOne(),
            CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
            CudnnDataType<T>::kZero(), data_desc_, x_tmp.template data<T>(),
            data_desc_, d_y_tmp.template data<T>(), data_desc_,
            d_x->template data<T>(), in_param_desc_,
            scale_tmp.template data<BatchNormParamType<T>>(),
            d_scale_tmp.template data<BatchNormParamType<T>>(),
            d_bias_tmp.template data<BatchNormParamType<T>>(), epsilon,
            saved_mean_data, saved_var_data));
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
        paddle::platform::dynload::cudnnBatchNormalizationBackward(
            dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL,
            CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
            CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(), data_desc_,
            x_tmp.template data<T>(), data_desc_, d_y_tmp.template data<T>(),
            data_desc_, d_x->template data<T>(), in_param_desc_,
            scale_tmp.template data<BatchNormParamType<T>>(),
            d_scale_tmp.template data<BatchNormParamType<T>>(),
            d_bias_tmp.template data<BatchNormParamType<T>>(), epsilon,
            saved_mean_data, saved_var_data));
#endif
  } else {
    if (d_x) {
      GradComputeDX<T, block><<<NxC, block, 0, dev_ctx.stream()>>>(
          d_y.data<T>(), scale_tmp.data<BatchNormParamType<T>>(),
          saved_mean_data, x.data<T>(), saved_var_data, C, H * W * D,
          d_x->data<T>());
    }
  }

  if (d_scale && d_bias) {
    add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
        d_scale_tmp.data<T>(), d_scale->data<T>(), N, C);
    add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
        d_bias_tmp.data<T>(), d_bias->data<T>(), N, C);
  }

#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif
}
}  // namespace phi

#ifdef PADDLE_WITH_HIP
// MIOPEN do not support double
PD_REGISTER_KERNEL(
    instance_norm_grad, GPU, ALL_LAYOUT, phi::InstanceNormGradKernel, float) {}
#else
PD_REGISTER_KERNEL(instance_norm_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::InstanceNormGradKernel,
                   float,
                   double) {}
#endif
paddle/phi/kernels/gpu/instance_norm_kernel.cu
0 → 100644
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/instance_norm_kernel.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/gpu/instance_norm_utils.h"
namespace phi {

template <typename T, typename Context>
void InstanceNormKernel(const Context &dev_ctx,
                        const DenseTensor &x,
                        paddle::optional<const DenseTensor &> scale,
                        paddle::optional<const DenseTensor &> bias,
                        float epsilon_f,
                        DenseTensor *y,
                        DenseTensor *saved_mean,
                        DenseTensor *saved_variance) {
  double epsilon = static_cast<double>(epsilon_f);
  auto &x_dims = x.dims();
  PADDLE_ENFORCE_GE(x_dims.size(), 2,
                    phi::errors::InvalidArgument(
                        "The `shape` in InstanceNormOp is invalid: "
                        "the size of X's dimensions must greater than "
                        "or equal to 2. But received: "
                        "the size of X's dimensions is [%d]",
                        x_dims.size()));
  PADDLE_ENFORCE_LE(x_dims.size(), 5,
                    phi::errors::InvalidArgument(
                        "The `shape` in InstanceNormOp is invalid: "
                        "the size of X's dimensions must smaller than"
                        "or equal to 5. But received: "
                        "the size of X's dimensions is [%d]",
                        x_dims.size()));
  int N, C, H, W, D;
  paddle::operators::ExtractNCWHD(
      x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
  int NxC = N * C;
  DenseTensor x_tmp;
  x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D});
  dev_ctx.template Alloc<T>(y);

#ifdef PADDLE_WITH_HIP
  miopenTensorDescriptor_t data_desc_;
  miopenTensorDescriptor_t in_param_desc_;

  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else
  cudnnTensorDescriptor_t data_desc_;
  cudnnTensorDescriptor_t in_param_desc_;

  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif
  if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
    LOG(ERROR) << "Provided epsilon is smaller than "
               << "CUDNN_BN_MIN_EPSILON. Setting it to "
               << "CUDNN_BN_MIN_EPSILON instead.";
  }
  epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);

  VLOG(3) << "Setting descriptors.";
  std::vector<int> dims;
  std::vector<int> strides;
  dims = {1, NxC, H, W, D};
  strides = {NxC * H * W * D, H * W * D, W * D, D, 1};

#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenSetTensorDescriptor(
          data_desc_, CudnnDataType<T>::type,
          x_dims.size() > 3 ? x_dims.size() : 4,
          const_cast<int *>(dims.data()), const_cast<int *>(strides.data())));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenDeriveBNTensorDescriptor(
          in_param_desc_, data_desc_, miopenBNSpatial));
#else
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnSetTensorNdDescriptor(
          data_desc_, CudnnDataType<T>::type,
          x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
          in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif

  const auto scale_ptr = scale.get_ptr();
  const auto bias_ptr = bias.get_ptr();

  DenseTensor scale_tmp;
  scale_tmp.Resize({NxC});
  dev_ctx.template Alloc<T>(&scale_tmp);
  DenseTensor bias_tmp;
  bias_tmp.Resize({NxC});
  dev_ctx.template Alloc<T>(&bias_tmp);

  const int n = x.numel();
  const int block = 512;
  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
  const int max_blocks = std::max(max_threads / block, 1);
  const int grid = std::min((NxC + block - 1) / block, max_blocks);

  phi::funcs::SetConstant<GPUContext, T> set_constant;
  if (scale_ptr) {
    repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
        scale_ptr->data<T>(), scale_tmp.data<T>(), N, C);
  } else {
    set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
  }
  if (bias_ptr) {
    repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
        bias_ptr->data<T>(), bias_tmp.data<T>(), N, C);
  } else {
    set_constant(dev_ctx, &bias_tmp, static_cast<T>(0));
  }

  auto handle = dev_ctx.cudnn_handle();

  phi::funcs::SetConstant<GPUContext, BatchNormParamType<T>> functor;
  dev_ctx.template Alloc<BatchNormParamType<T>>(saved_mean);
  dev_ctx.template Alloc<BatchNormParamType<T>>(saved_variance);
  functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
  functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));

#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenBatchNormalizationForwardTraining(
          handle, miopenBNSpatial,
          const_cast<void *>(
              static_cast<const void *>(CudnnDataType<T>::kOne())),
          const_cast<void *>(
              static_cast<const void *>(CudnnDataType<T>::kZero())),
          data_desc_, static_cast<const void *>(x_tmp.template data<T>()),
          data_desc_, static_cast<void *>(y->template data<T>()),
          in_param_desc_,
          const_cast<void *>(static_cast<const void *>(
              scale_tmp.template data<BatchNormParamType<T>>())),
          const_cast<void *>(static_cast<const void *>(
              bias_tmp.template data<BatchNormParamType<T>>())),
          0, nullptr, nullptr, epsilon,
          static_cast<void *>(
              saved_mean->template data<BatchNormParamType<T>>()),
          static_cast<void *>(
              saved_variance->template data<BatchNormParamType<T>>())));

  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnBatchNormalizationForwardTraining(
          handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
          CudnnDataType<T>::kZero(), data_desc_, x_tmp.template data<T>(),
          data_desc_, y->template data<T>(), in_param_desc_,
          scale_tmp.template data<BatchNormParamType<T>>(),
          bias_tmp.template data<BatchNormParamType<T>>(), 0, nullptr, nullptr,
          epsilon, saved_mean->template data<BatchNormParamType<T>>(),
          saved_variance->template data<BatchNormParamType<T>>()));

  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif
}
}  // namespace phi

#ifdef PADDLE_WITH_HIP
// MIOPEN do not support double
PD_REGISTER_KERNEL(
    instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float) {}
#else
PD_REGISTER_KERNEL(
    instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float, double) {}
#endif
paddle/phi/kernels/gpu/instance_norm_utils.h
0 → 100644
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cfloat>
#include <string>
#include <vector>
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif

#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"

namespace phi {

template <typename T>
using CudnnDataType = paddle::platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;

template <typename T>
static __global__ void repeat_param(const T *input,
                                    T *output,
                                    const int repeat_num,
                                    const int C) {
  CUDA_KERNEL_LOOP(i, repeat_num * C) {
    int index = i % C;
    output[i] = input[index];
  }
}

template <typename T, int BlockDim, bool AVG>
static __global__ void add_param(const T *input,
                                 T *output,
                                 const int repeat_num,
                                 const int C) {
  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
  __shared__ typename BlockReduce::TempStorage ou_storage;
  for (int i = blockIdx.x; i < C; i += gridDim.x) {
    T ou = static_cast<T>(0);
    for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) {
      const int index = j * C + i;
      ou += static_cast<T>(input[index]);
    }
    ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum());
    if (threadIdx.x == 0) {
      output[i] = ou;
    }
    __syncthreads();

    if (AVG) {
      output[i] /= repeat_num;
    }
  }
}
}  // namespace phi
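
The two helpers above only bridge per-channel parameters and the (1, N*C, H, W, D) layout used for the cuDNN calls. A host-side sketch of the same index math (plain C++, illustrative names, not CUDA and not part of this codebase): repeat_param tiles a length-C vector to N*C, and add_param with AVG = false folds it back by summing over the repeat axis.

#include <cstdio>
#include <vector>

// Mirrors repeat_param: output[i] = input[i % C] for i in [0, N*C).
std::vector<float> repeat_param_ref(const std::vector<float>& in, int N) {
  const int C = static_cast<int>(in.size());
  std::vector<float> out(N * C);
  for (int i = 0; i < N * C; ++i) out[i] = in[i % C];
  return out;
}

// Mirrors add_param (AVG = false): out[c] = sum over j of in[j * C + c].
std::vector<float> add_param_ref(const std::vector<float>& in, int N, int C) {
  std::vector<float> out(C, 0.f);
  for (int j = 0; j < N; ++j)
    for (int i = 0; i < C; ++i) out[i] += in[j * C + i];
  return out;
}

int main() {
  std::vector<float> gamma = {10.f, 20.f};         // C = 2 per-channel params
  auto tiled = repeat_param_ref(gamma, /*N=*/3);   // {10, 20, 10, 20, 10, 20}
  auto folded = add_param_ref(tiled, /*N=*/3, 2);  // {30, 60}
  std::printf("%f %f\n", folded[0], folded[1]);
  return 0;
}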
paddle/phi/kernels/instance_norm_grad_kernel.h
0 → 100644
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {

template <typename T, typename Context>
void InstanceNormGradKernel(const Context& dev_ctx,
                            const DenseTensor& x,
                            const DenseTensor& y_grad,
                            paddle::optional<const DenseTensor&> scale,
                            const DenseTensor& saved_mean,
                            const DenseTensor& saved_variance,
                            float epsilon,
                            DenseTensor* x_grad,
                            DenseTensor* scale_grad,
                            DenseTensor* bias_grad);
}  // namespace phi
paddle/phi/kernels/instance_norm_kernel.h
0 → 100644
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {

template <typename T, typename Context>
void InstanceNormKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        paddle::optional<const DenseTensor&> scale,
                        paddle::optional<const DenseTensor&> bias,
                        float epsilon,
                        DenseTensor* y,
                        DenseTensor* saved_mean,
                        DenseTensor* saved_variance);
}  // namespace phi
paddle/phi/ops/compat/instance_norm_sig.cc
0 → 100644
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {

KernelSignature InstanceNormOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  return KernelSignature("instance_norm",
                         {"X", "Scale", "Bias"},
                         {"epsilon"},
                         {"Y", "SavedMean", "SavedVariance"});
}

KernelSignature InstanceNormGradOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  return KernelSignature("instance_norm_grad",
                         {"X", "Y@GRAD", "Scale", "SavedMean", "SavedVariance"},
                         {"epsilon"},
                         {"X@GRAD", "Scale@GRAD", "Bias@GRAD"});
}
}  // namespace phi

PD_REGISTER_ARG_MAPPING_FN(instance_norm, phi::InstanceNormOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(instance_norm_grad,
                           phi::InstanceNormGradOpArgumentMapping);