Commit cc272afb (unverified)
Authored by YuanRisheng on May 26, 2022; committed via GitHub on May 26, 2022.
[Phi]Refactor InstanceNormKernel and InstanceNormGradKernel (#42978)
* move instance_norm
* change mutable_data
* fix compile bugs
Parent commit: 8f7f3ac9
Showing 10 changed files with 987 additions and 604 deletions (+987 -604)
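Reader's note (not part of the commit): the refactored kernels below all implement the same instance-normalization math, summarized here for reference. The symbols N, C, H, W, \gamma (Scale), \beta (Bias) and \epsilon (the "epsilon" attribute) follow the usual NCHW notation and are introduced only for this summary.

\mu_{nc} = \frac{1}{HW}\sum_{h,w} x_{nchw}, \qquad
\hat{\sigma}^{-1}_{nc} = \Big(\frac{1}{HW}\sum_{h,w}\big(x_{nchw}-\mu_{nc}\big)^{2} + \epsilon\Big)^{-1/2},

y_{nchw} = \gamma_c\,\hat{\sigma}^{-1}_{nc}\,(x_{nchw}-\mu_{nc}) + \beta_c,

\frac{\partial L}{\partial \beta_c} = \sum_{n,h,w} dy_{nchw}, \qquad
\frac{\partial L}{\partial \gamma_c} = \sum_{n,h,w} dy_{nchw}\,\hat{\sigma}^{-1}_{nc}\,(x_{nchw}-\mu_{nc}).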
paddle/fluid/operators/instance_norm_op.cc             +0   -220
paddle/fluid/operators/instance_norm_op.cu             +0   -384
paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc    +146 -0
paddle/phi/kernels/cpu/instance_norm_kernel.cc         +126 -0
paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu    +319 -0
paddle/phi/kernels/gpu/instance_norm_kernel.cu         +221 -0
paddle/phi/kernels/gpu/instance_norm_utils.h           +73  -0
paddle/phi/kernels/instance_norm_grad_kernel.h         +33  -0
paddle/phi/kernels/instance_norm_kernel.h              +31  -0
paddle/phi/ops/compat/instance_norm_sig.cc             +38  -0
paddle/fluid/operators/instance_norm_op.cc

...
@@ -170,104 +170,6 @@ NCHW `[batch, in_channels, in_height, in_width]`
)DOC");
}

template <typename T>
class InstanceNormKernel<platform::CPUDeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));

    const auto *x = ctx.Input<Tensor>("X");
    const auto &x_dims = x->dims();

    const int N = x_dims[0];
    const int C = x_dims[1];
    const int NxC = N * C;

    const int sample_size = x->numel() / N / C;

    auto *y = ctx.Output<Tensor>("Y");
    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");

    auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
    auto *place = dev_ctx.eigen_device();

    Eigen::DSizes<int, 2> shape(NxC, sample_size);
// Once eigen on Windows is updated, the if branch can be removed.
#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::DSizes<int, 2> bcast(1, sample_size);
    Eigen::DSizes<int, 2> C_shape(C, 1);
    Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
    Eigen::DSizes<int, 1> rdims(1);
#else
    Eigen::IndexList<Eigen::type2index<1>, int> bcast;
    bcast.set(1, sample_size);
    Eigen::IndexList<int, Eigen::type2index<1>> C_shape;
    C_shape.set(0, C);
    Eigen::IndexList<int, Eigen::type2index<1>> NxC_shape;
    NxC_shape.set(0, NxC);
    Eigen::IndexList<Eigen::type2index<1>> rdims;
#endif

    phi::funcs::SetConstant<platform::CPUDeviceContext, T> set_constant;

    saved_mean->mutable_data<T>(ctx.GetPlace());
    saved_variance->mutable_data<T>(ctx.GetPlace());
    set_constant(dev_ctx, saved_mean, static_cast<T>(0));
    set_constant(dev_ctx, saved_variance, static_cast<T>(0));

    auto saved_mean_a = framework::EigenVector<T>::Flatten(*saved_mean);
    auto saved_mean_e = saved_mean_a.reshape(NxC_shape);
    auto saved_variance_a = framework::EigenVector<T>::Flatten(*saved_variance);
    auto saved_variance_e = saved_variance_a.reshape(NxC_shape);

    auto x_e = framework::EigenVector<T>::Flatten(*x);
    auto x_arr = x_e.reshape(shape);

    saved_mean_e.device(*place) = x_arr.mean(rdims);
    auto saved_variance_arr =
        (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon;

    saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse();

    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *bias = ctx.Input<Tensor>("Bias");

    Tensor scale_data;
    Tensor bias_data;
    if (!scale) {
      scale_data.mutable_data<T>({C}, ctx.GetPlace());
      set_constant(dev_ctx, &scale_data, static_cast<T>(1));
    }
    if (!bias) {
      bias_data.mutable_data<T>({C}, ctx.GetPlace());
      set_constant(dev_ctx, &bias_data, static_cast<T>(0));
    }
    auto scale_e =
        scale ? framework::EigenVector<T>::Flatten(*scale)
              : framework::EigenVector<T>::Flatten(
                    const_cast<const framework::Tensor &>(scale_data));
    auto scale_arr = scale_e.reshape(C_shape);
    auto bias_e =
        bias ? framework::EigenVector<T>::Flatten(*bias)
             : framework::EigenVector<T>::Flatten(
                   const_cast<const framework::Tensor &>(bias_data));
    auto bias_arr = bias_e.reshape(C_shape);

    y->mutable_data<T>(ctx.GetPlace());
    auto y_e = framework::EigenVector<T>::Flatten(*y);
    auto y_arr = y_e.reshape(shape);

    // (x - mean) * inv_std * scale + bias
    Eigen::DSizes<int, 2> bcast_param(N, sample_size);
    y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) *
                               saved_variance_e.broadcast(bcast) *
                               scale_arr.broadcast(bcast_param) +
                           bias_arr.broadcast(bcast_param);
  }
};

void InstanceNormGradOp::InferShape(framework::InferShapeContext *ctx) const {
  OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormGrad");
  OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input",
...
@@ -312,120 +214,6 @@ framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType(
      OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}

template <typename T>
class InstanceNormGradKernel<platform::CPUDeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const auto *x = ctx.Input<Tensor>("X");
    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
    const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");

    const auto &x_dims = x->dims();

    const int N = x_dims[0];
    const int C = x_dims[1];
    const int NxC = N * C;
    const int sample_size = x->numel() / N / C;

    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
    d_x->mutable_data<T>(ctx.GetPlace());

    auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
    auto *place = dev_ctx.eigen_device();

    Eigen::DSizes<int, 2> rshape(NxC, sample_size);
    Eigen::DSizes<int, 2> param_shape(N, C);
    Eigen::DSizes<int, 2> shape(NxC, sample_size);
#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::DSizes<int, 1> rdims(0);
    Eigen::DSizes<int, 1> mean_rdims(1);
    Eigen::DSizes<int, 2> bcast(1, sample_size);
    Eigen::DSizes<int, 2> C_shape(C, 1);
    Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
#else
    Eigen::IndexList<Eigen::type2index<0>> rdims;
    Eigen::IndexList<Eigen::type2index<1>> mean_rdims;
    Eigen::IndexList<Eigen::type2index<1>, int> bcast;
    bcast.set(1, sample_size);
    Eigen::IndexList<int, Eigen::type2index<1>> C_shape;
    C_shape.set(0, C);
    Eigen::IndexList<int, Eigen::type2index<1>> NxC_shape;
    NxC_shape.set(0, NxC);
#endif

    phi::funcs::SetConstant<platform::CPUDeviceContext, T> set_constant;

    Tensor scale_data;
    if (!scale) {
      scale_data.mutable_data<T>({C}, ctx.GetPlace());
      set_constant(dev_ctx, &scale_data, static_cast<T>(1));
    }

    auto scale_e =
        scale ? framework::EigenVector<T>::Flatten(*scale)
              : framework::EigenVector<T>::Flatten(
                    const_cast<const framework::Tensor &>(scale_data));
    auto mean_e = framework::EigenVector<T>::Flatten(*saved_mean);
    auto inv_var_e = framework::EigenVector<T>::Flatten(*saved_inv_variance);
    auto dy_e = framework::EigenVector<T>::Flatten(*d_y);
    auto x_e = framework::EigenVector<T>::Flatten(*x);

    auto scale_arr = scale_e.reshape(C_shape);
    auto mean_arr = mean_e.reshape(NxC_shape);
    auto inv_var_arr = inv_var_e.reshape(NxC_shape);
    auto dy_arr = dy_e.reshape(shape);
    auto x_arr = x_e.reshape(shape);

    auto tmp = (x_arr - mean_arr.eval().broadcast(bcast)) *
               inv_var_arr.eval().broadcast(bcast);

    // math: d_bias = np.sum(d_y, axis=(n,h,w))
    // math: d_scale = np.sum((X-mean) / inv_std * dy, axis=(n, h,w))
    if (d_scale && d_bias) {
      d_scale->mutable_data<T>(ctx.GetPlace());
      d_bias->mutable_data<T>(ctx.GetPlace());
      set_constant(dev_ctx, d_scale, static_cast<T>(0));
      set_constant(dev_ctx, d_bias, static_cast<T>(0));

      auto d_scale_e = framework::EigenVector<T>::Flatten(*d_scale);
      auto d_scale_data = d_scale_e.reshape(C_shape);
      auto d_bias_e = framework::EigenVector<T>::Flatten(*d_bias);
      auto d_bias_data = d_bias_e.reshape(C_shape);

      d_bias_data.device(*place) =
          dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims);
      d_scale_data.device(*place) =
          (tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims);
    }

    auto dy_mean =
        dy_arr.mean(mean_rdims).reshape(NxC_shape).eval().broadcast(bcast);

    Eigen::DSizes<int, 2> bcast_param(N, sample_size);
    set_constant(dev_ctx, d_x, static_cast<T>(0));
    // math: d_x = scale * inv_var * d_y - scale * inv_var * np.sum(d_y,
    // axis=(h,w))
    //             - scale * (X - mean) * inv_var.pow(3) * np.sum(d_y * (X -
    //             mean),
    //             axis=(h,w))
    auto dx_e = framework::EigenVector<T>::Flatten(*d_x);
    auto dx_arr = dx_e.reshape(shape);

    dx_arr.device(*place) = scale_arr.broadcast(bcast_param) *
                            inv_var_arr.broadcast(bcast) *
                            (dy_arr - dy_mean -
                             tmp *
                                 (dy_arr * tmp)
                                     .mean(mean_rdims)
                                     .reshape(NxC_shape)
                                     .eval()
                                     .broadcast(bcast));
  }
};

void InstanceNormDoubleGradOp::InferShape(
    framework::InferShapeContext *ctx) const {
  OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormDoubleGrad");
...
@@ -699,14 +487,6 @@ REGISTER_OPERATOR(instance_norm_grad, ops::InstanceNormGradOp,
REGISTER_OPERATOR(instance_norm_grad_grad, ops::InstanceNormDoubleGradOp,
                  ops::InstanceNormDoubleGradOpInplaceInferer);

REGISTER_OP_CPU_KERNEL(
    instance_norm,
    ops::InstanceNormKernel<paddle::platform::CPUDeviceContext, float>,
    ops::InstanceNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
    instance_norm_grad,
    ops::InstanceNormGradKernel<paddle::platform::CPUDeviceContext, float>,
    ops::InstanceNormGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
    instance_norm_grad_grad,
    ops::InstanceNormDoubleGradKernel<paddle::platform::CPUDeviceContext,
...
paddle/fluid/operators/instance_norm_op.cu

...
@@ -70,181 +70,6 @@ static __global__ void add_param(const T *input, T *output,
  }
}

template <typename T>
class InstanceNormKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
                      platform::errors::PreconditionNotMet(
                          "It must be CUDAPlace."));
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));

    auto *x = ctx.Input<Tensor>("X");
    auto &x_dims = x->dims();
    PADDLE_ENFORCE_GE(x_dims.size(), 2,
                      platform::errors::InvalidArgument(
                          "The `shape` in InstanceNormOp is invalid: "
                          "the size of X's dimensions must greater than "
                          "or equal to 2. But received: "
                          "the size of X's dimensions is [%d]",
                          x_dims.size()));
    PADDLE_ENFORCE_LE(x_dims.size(), 5,
                      platform::errors::InvalidArgument(
                          "The `shape` in InstanceNormOp is invalid: "
                          "the size of X's dimensions must smaller than"
                          "or equal to 5. But received: "
                          "the size of X's dimensions is [%d]",
                          x_dims.size()));
    int N, C, H, W, D;
    ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
    int NxC = N * C;
    Tensor x_tmp;
    x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D});

    auto *y = ctx.Output<Tensor>("Y");
    y->mutable_data<T>(ctx.GetPlace());

#ifdef PADDLE_WITH_HIP
    miopenTensorDescriptor_t data_desc_;
    miopenTensorDescriptor_t in_param_desc_;

    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else
    cudnnTensorDescriptor_t data_desc_;
    cudnnTensorDescriptor_t in_param_desc_;

    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif
    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
      LOG(ERROR) << "Provided epsilon is smaller than "
                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
                 << "CUDNN_BN_MIN_EPSILON instead.";
    }
    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);

    VLOG(3) << "Setting descriptors.";
    std::vector<int> dims;
    std::vector<int> strides;
    dims = {1, NxC, H, W, D};
    strides = {NxC * H * W * D, H * W * D, W * D, D, 1};

    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();

#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
        data_desc_, CudnnDataType<T>::type,
        x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
        const_cast<int *>(strides.data())));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenDeriveBNTensorDescriptor(
            in_param_desc_, data_desc_, miopenBNSpatial));
#else
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
        data_desc_, CudnnDataType<T>::type,
        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnDeriveBNTensorDescriptor(
            in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif

    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *bias = ctx.Input<Tensor>("Bias");

    Tensor scale_tmp =
        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
    scale_tmp.mutable_data<T>(ctx.GetPlace());
    Tensor bias_tmp =
        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
    bias_tmp.mutable_data<T>(ctx.GetPlace());

    const int n = x->numel();
    const int block = 512;
    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
    const int max_blocks = std::max(max_threads / block, 1);
    const int grid = std::min((NxC + block - 1) / block, max_blocks);

    phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_constant;
    if (scale) {
      repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
          scale->data<T>(), scale_tmp.data<T>(), N, C);
    } else {
      set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
    }
    if (bias) {
      repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
          bias->data<T>(), bias_tmp.data<T>(), N, C);
    } else {
      set_constant(dev_ctx, &bias_tmp, static_cast<T>(0));
    }

    auto handle = dev_ctx.cudnn_handle();

    phi::funcs::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
        functor;

    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
    functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
    functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));

#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenBatchNormalizationForwardTraining(
            handle, miopenBNSpatial,
            const_cast<void *>(
                static_cast<const void *>(CudnnDataType<T>::kOne())),
            const_cast<void *>(
                static_cast<const void *>(CudnnDataType<T>::kZero())),
            data_desc_, static_cast<const void *>(x_tmp.template data<T>()),
            data_desc_,
            static_cast<void *>(y->template mutable_data<T>(ctx.GetPlace())),
            in_param_desc_,
            const_cast<void *>(static_cast<const void *>(
                scale_tmp.template data<BatchNormParamType<T>>())),
            const_cast<void *>(static_cast<const void *>(
                bias_tmp.template data<BatchNormParamType<T>>())),
            0, nullptr, nullptr, epsilon,
            static_cast<void *>(
                saved_mean->template mutable_data<BatchNormParamType<T>>(
                    ctx.GetPlace())),
            static_cast<void *>(
                saved_variance->template mutable_data<BatchNormParamType<T>>(
                    ctx.GetPlace()))));

    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnBatchNormalizationForwardTraining(
            handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
            CudnnDataType<T>::kZero(), data_desc_, x_tmp.template data<T>(),
            data_desc_, y->template mutable_data<T>(ctx.GetPlace()),
            in_param_desc_, scale_tmp.template data<BatchNormParamType<T>>(),
            bias_tmp.template data<BatchNormParamType<T>>(), 0, nullptr,
            nullptr, epsilon,
            saved_mean->template mutable_data<BatchNormParamType<T>>(
                ctx.GetPlace()),
            saved_variance->template mutable_data<BatchNormParamType<T>>(
                ctx.GetPlace())));

    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif
  }
};

template <typename T, int BlockDim>
static __global__ void GradComputeDX(const T *dy,
                                     const BatchNormParamType<T> *scale,
...
@@ -297,203 +122,6 @@ static __global__ void GradComputeDX(const T *dy,
  }
}

template <typename T>
class InstanceNormGradKernel<platform::CUDADeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx.GetPlace()), true,
        platform::errors::PreconditionNotMet("It must use CUDAPlace."));
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *x = ctx.Input<Tensor>("X");
    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));

    const auto &x_dims = x->dims();

    int N, C, H, W, D;
    ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
    int NxC = N * C;

    Tensor x_tmp, d_y_tmp;
    x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D});
    d_y_tmp.ShareDataWith(*d_y).Resize({1, NxC, H, W, D});

    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
    d_x->mutable_data<T>(ctx.GetPlace());
    if (d_scale && d_bias) {
      d_scale->mutable_data<T>(ctx.GetPlace());
      d_bias->mutable_data<T>(ctx.GetPlace());
    }
    if (scale) {
      PADDLE_ENFORCE_EQ(
          scale->dims().size(), 1UL,
          platform::errors::InvalidArgument(
              "The `shape` in InstanceNormOp is invalid: "
              "the size of scale's dimensions must be equal to 1. But "
              "received: the size of scale's dimensions"
              "is [%d]",
              scale->dims().size()));
      PADDLE_ENFORCE_EQ(scale->dims()[0], C,
                        platform::errors::InvalidArgument(
                            "The `shape` in InstanceNormOp is invalid: "
                            "the first dimension of scale must be equal to "
                            "Channels([%d]). But received: "
                            "the first dimension of scale is [%d],"
                            "the dimensions of scale is [%s], ",
                            C, scale->dims()[0], scale->dims()));
    }

    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();

    phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_constant;

    const int n = x->numel();
    const int block = 512;
    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
    const int max_blocks = std::max(max_threads / block, 1);
    const int grid = std::min(NxC, max_blocks);
    const int grid1 = (C + block - 1) / block;

    Tensor scale_tmp =
        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
    scale_tmp.mutable_data<T>(ctx.GetPlace());
    Tensor d_scale_tmp =
        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
    Tensor d_bias_tmp =
        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
    if (scale) {
      repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
          scale->data<T>(), scale_tmp.data<T>(), N, C);
    } else {
      set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
    }

    std::vector<int> dims;
    std::vector<int> strides;
    dims = {1, NxC, H, W, D};
    strides = {NxC * H * W * D, H * W * D, W * D, D, 1};

    if ((H * W * D) == 1) {
      framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
      phi::funcs::SetConstant<platform::CUDADeviceContext,
                              BatchNormParamType<T>>
          functor;
      functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
      functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
      return;
    }

#ifdef PADDLE_WITH_HIP
    miopenTensorDescriptor_t data_desc_;
    miopenTensorDescriptor_t in_param_desc_;

    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else
    cudnnTensorDescriptor_t data_desc_;
    cudnnTensorDescriptor_t in_param_desc_;

    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif
    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
      LOG(ERROR) << "Provided epsilon is smaller than "
                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
                 << "CUDNN_BN_MIN_EPSILON instead.";
    }
    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);

#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor(
        data_desc_, CudnnDataType<T>::type,
        x_dims.size() > 3 ? x_dims.size() : 4, const_cast<int *>(dims.data()),
        const_cast<int *>(strides.data())));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenDeriveBNTensorDescriptor(
            in_param_desc_, data_desc_, miopenBNSpatial));
#else
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
        data_desc_, CudnnDataType<T>::type,
        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnDeriveBNTensorDescriptor(
            in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif

    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
    const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
    const auto *saved_mean_data =
        saved_mean->template data<BatchNormParamType<T>>();
    const auto *saved_var_data =
        saved_var->template data<BatchNormParamType<T>>();
    if (d_scale && d_bias) {
#ifdef PADDLE_WITH_HIP
      PADDLE_ENFORCE_GPU_SUCCESS(
          platform::dynload::miopenBatchNormalizationBackward(
              dev_ctx.cudnn_handle(), miopenBNSpatial, CudnnDataType<T>::kOne(),
              CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
              CudnnDataType<T>::kZero(), data_desc_, x_tmp.template data<T>(),
              data_desc_, d_y_tmp.template data<T>(), data_desc_,
              d_x->template mutable_data<T>(ctx.GetPlace()), in_param_desc_,
              scale_tmp.template data<BatchNormParamType<T>>(),
              d_scale_tmp.template mutable_data<BatchNormParamType<T>>(
                  ctx.GetPlace()),
              d_bias_tmp.template mutable_data<BatchNormParamType<T>>(
                  ctx.GetPlace()),
              epsilon, saved_mean_data, saved_var_data));
#else
      PADDLE_ENFORCE_GPU_SUCCESS(
          platform::dynload::cudnnBatchNormalizationBackward(
              dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL,
              CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
              CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(), data_desc_,
              x_tmp.template data<T>(), data_desc_, d_y_tmp.template data<T>(),
              data_desc_, d_x->template mutable_data<T>(ctx.GetPlace()),
              in_param_desc_, scale_tmp.template data<BatchNormParamType<T>>(),
              d_scale_tmp.template mutable_data<BatchNormParamType<T>>(
                  ctx.GetPlace()),
              d_bias_tmp.template mutable_data<BatchNormParamType<T>>(
                  ctx.GetPlace()),
              epsilon, saved_mean_data, saved_var_data));
#endif
    } else {
      if (d_x) {
        GradComputeDX<T, block><<<NxC, block, 0, dev_ctx.stream()>>>(
            d_y->data<T>(), scale_tmp.data<BatchNormParamType<T>>(),
            saved_mean_data, x->data<T>(), saved_var_data, C, H * W * D,
            d_x->data<T>());
      }
    }

    if (d_scale && d_bias) {
      add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
          d_scale_tmp.data<T>(), d_scale->data<T>(), N, C);
      add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
          d_bias_tmp.data<T>(), d_bias->data<T>(), N, C);
    }

#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif
  }
};

static __device__ __forceinline__ float real_sqrt(float x) {
  return 1. / sqrtf(x);
}
...
@@ -793,22 +421,10 @@ namespace ops = paddle::operators;
namespace plat = paddle::platform;
#ifdef PADDLE_WITH_HIP
// MIOPEN do not support double
REGISTER_OP_CUDA_KERNEL(
    instance_norm, ops::InstanceNormKernel<plat::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
    instance_norm_grad,
    ops::InstanceNormGradKernel<plat::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
    instance_norm_grad_grad,
    ops::InstanceNormDoubleGradKernel<paddle::platform::CUDADeviceContext,
                                      float>);
#else
REGISTER_OP_CUDA_KERNEL(
    instance_norm, ops::InstanceNormKernel<plat::CUDADeviceContext, float>,
    ops::InstanceNormKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
    instance_norm_grad,
    ops::InstanceNormGradKernel<plat::CUDADeviceContext, float>,
    ops::InstanceNormGradKernel<plat::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
    instance_norm_grad_grad,
    ops::InstanceNormDoubleGradKernel<paddle::platform::CUDADeviceContext,
...
paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc
0 → 100644

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/instance_norm_grad_kernel.h"

#include <memory>
#include <string>
#include <unordered_map>

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/extensions.h"
#include "paddle/phi/kernels/funcs/math_function.h"

namespace phi {

template <typename T, typename Context>
void InstanceNormGradKernel(const Context& dev_ctx,
                            const DenseTensor& x,
                            const DenseTensor& d_y,
                            paddle::optional<const DenseTensor&> scale,
                            const DenseTensor& saved_mean,
                            const DenseTensor& saved_variance,
                            float epsilon,
                            DenseTensor* d_x,
                            DenseTensor* d_scale,
                            DenseTensor* d_bias) {
  const auto* scale_ptr = scale.get_ptr();

  const auto& x_dims = x.dims();

  const int N = x_dims[0];
  const int C = x_dims[1];
  const int NxC = N * C;
  const int sample_size = x.numel() / N / C;

  dev_ctx.template Alloc<T>(d_x);
  auto* place = dev_ctx.eigen_device();

  Eigen::DSizes<int, 2> rshape(NxC, sample_size);
  Eigen::DSizes<int, 2> param_shape(N, C);
  Eigen::DSizes<int, 2> shape(NxC, sample_size);
#ifndef EIGEN_HAS_INDEX_LIST
  Eigen::DSizes<int, 1> rdims(0);
  Eigen::DSizes<int, 1> mean_rdims(1);
  Eigen::DSizes<int, 2> bcast(1, sample_size);
  Eigen::DSizes<int, 2> C_shape(C, 1);
  Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
#else
  Eigen::IndexList<Eigen::type2index<0>> rdims;
  Eigen::IndexList<Eigen::type2index<1>> mean_rdims;
  Eigen::IndexList<Eigen::type2index<1>, int> bcast;
  bcast.set(1, sample_size);
  Eigen::IndexList<int, Eigen::type2index<1>> C_shape;
  C_shape.set(0, C);
  Eigen::IndexList<int, Eigen::type2index<1>> NxC_shape;
  NxC_shape.set(0, NxC);
#endif

  phi::funcs::SetConstant<CPUContext, T> set_constant;

  DenseTensor scale_data;
  if (!scale_ptr) {
    scale_data.Resize({C});
    dev_ctx.template Alloc<T>(&scale_data);
    set_constant(dev_ctx, &scale_data, static_cast<T>(1));
  }

  auto scale_e =
      scale_ptr
          ? EigenVector<T>::Flatten(*scale_ptr)
          : EigenVector<T>::Flatten(const_cast<const DenseTensor&>(scale_data));
  auto mean_e = EigenVector<T>::Flatten(saved_mean);
  auto inv_var_e = EigenVector<T>::Flatten(saved_variance);
  auto dy_e = EigenVector<T>::Flatten(d_y);
  auto x_e = EigenVector<T>::Flatten(x);

  auto scale_arr = scale_e.reshape(C_shape);
  auto mean_arr = mean_e.reshape(NxC_shape);
  auto inv_var_arr = inv_var_e.reshape(NxC_shape);
  auto dy_arr = dy_e.reshape(shape);
  auto x_arr = x_e.reshape(shape);

  auto tmp = (x_arr - mean_arr.eval().broadcast(bcast)) *
             inv_var_arr.eval().broadcast(bcast);

  // math: d_bias = np.sum(d_y, axis=(n,h,w))
  // math: d_scale = np.sum((X-mean) / inv_std * dy, axis=(n, h,w))
  if (d_scale && d_bias) {
    dev_ctx.template Alloc<T>(d_scale);
    dev_ctx.template Alloc<T>(d_bias);
    set_constant(dev_ctx, d_scale, static_cast<T>(0));
    set_constant(dev_ctx, d_bias, static_cast<T>(0));

    auto d_scale_e = EigenVector<T>::Flatten(*d_scale);
    auto d_scale_data = d_scale_e.reshape(C_shape);
    auto d_bias_e = EigenVector<T>::Flatten(*d_bias);
    auto d_bias_data = d_bias_e.reshape(C_shape);

    d_bias_data.device(*place) =
        dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims);
    d_scale_data.device(*place) =
        (tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims);
  }

  auto dy_mean =
      dy_arr.mean(mean_rdims).reshape(NxC_shape).eval().broadcast(bcast);

  Eigen::DSizes<int, 2> bcast_param(N, sample_size);
  set_constant(dev_ctx, d_x, static_cast<T>(0));
  // math: d_x = scale * inv_var * d_y - scale * inv_var * np.sum(d_y,
  // axis=(h,w))
  //             - scale * (X - mean) * inv_var.pow(3) * np.sum(d_y * (X -
  //             mean),
  //             axis=(h,w))
  auto dx_e = EigenVector<T>::Flatten(*d_x);
  auto dx_arr = dx_e.reshape(shape);

  dx_arr.device(*place) = scale_arr.broadcast(bcast_param) *
                          inv_var_arr.broadcast(bcast) *
                          (dy_arr - dy_mean -
                           tmp *
                               (dy_arr * tmp)
                                   .mean(mean_rdims)
                                   .reshape(NxC_shape)
                                   .eval()
                                   .broadcast(bcast));
}

}  // namespace phi

PD_REGISTER_KERNEL(instance_norm_grad,
                   CPU,
                   ALL_LAYOUT,
                   phi::InstanceNormGradKernel,
                   float,
                   double) {}
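Reader's note (not part of the diff): written out from the d_x expression in the kernel above, with t_{nchw} = (x_{nchw} - \mu_{nc})\,\hat{\sigma}^{-1}_{nc}, the input gradient it computes is

\frac{\partial L}{\partial x_{nchw}} = \gamma_c\,\hat{\sigma}^{-1}_{nc}
\Big(dy_{nchw} - \frac{1}{HW}\sum_{h',w'} dy_{nch'w'}
- t_{nchw}\,\frac{1}{HW}\sum_{h',w'} dy_{nch'w'}\,t_{nch'w'}\Big).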
paddle/phi/kernels/cpu/instance_norm_kernel.cc
0 → 100644

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/instance_norm_kernel.h"

#include <memory>
#include <string>
#include <unordered_map>

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/funcs/eigen/extensions.h"
#include "paddle/phi/kernels/funcs/math_function.h"

namespace phi {

template <typename T, typename Context>
void InstanceNormKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        paddle::optional<const DenseTensor&> scale,
                        paddle::optional<const DenseTensor&> bias,
                        float epsilon_f,
                        DenseTensor* y,
                        DenseTensor* saved_mean,
                        DenseTensor* saved_variance) {
  const auto& x_dims = x.dims();
  T epsilon = static_cast<T>(epsilon_f);

  const int N = x_dims[0];
  const int C = x_dims[1];
  const int NxC = N * C;

  const int sample_size = x.numel() / N / C;

  auto* place = dev_ctx.eigen_device();

  Eigen::DSizes<int, 2> shape(NxC, sample_size);
// Once eigen on Windows is updated, the if branch can be removed.
#ifndef EIGEN_HAS_INDEX_LIST
  Eigen::DSizes<int, 2> bcast(1, sample_size);
  Eigen::DSizes<int, 2> C_shape(C, 1);
  Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
  Eigen::DSizes<int, 1> rdims(1);
#else
  Eigen::IndexList<Eigen::type2index<1>, int> bcast;
  bcast.set(1, sample_size);
  Eigen::IndexList<int, Eigen::type2index<1>> C_shape;
  C_shape.set(0, C);
  Eigen::IndexList<int, Eigen::type2index<1>> NxC_shape;
  NxC_shape.set(0, NxC);
  Eigen::IndexList<Eigen::type2index<1>> rdims;
#endif

  phi::funcs::SetConstant<CPUContext, T> set_constant;

  dev_ctx.template Alloc<T>(saved_mean);
  dev_ctx.template Alloc<T>(saved_variance);
  set_constant(dev_ctx, saved_mean, static_cast<T>(0));
  set_constant(dev_ctx, saved_variance, static_cast<T>(0));

  auto saved_mean_a = EigenVector<T>::Flatten(*saved_mean);
  auto saved_mean_e = saved_mean_a.reshape(NxC_shape);
  auto saved_variance_a = EigenVector<T>::Flatten(*saved_variance);
  auto saved_variance_e = saved_variance_a.reshape(NxC_shape);

  auto x_e = EigenVector<T>::Flatten(x);
  auto x_arr = x_e.reshape(shape);

  saved_mean_e.device(*place) = x_arr.mean(rdims);
  auto saved_variance_arr =
      (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon;

  saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse();

  const auto scale_ptr = scale.get_ptr();
  const auto bias_ptr = bias.get_ptr();

  DenseTensor scale_data;
  DenseTensor bias_data;
  if (!scale_ptr) {
    scale_data.Resize({C});
    dev_ctx.template Alloc<T>(&scale_data);
    set_constant(dev_ctx, &scale_data, static_cast<T>(1));
  }
  if (!bias_ptr) {
    bias_data.Resize({C});
    dev_ctx.template Alloc<T>(&bias_data);
    set_constant(dev_ctx, &bias_data, static_cast<T>(0));
  }
  auto scale_e =
      scale_ptr
          ? EigenVector<T>::Flatten(*scale_ptr)
          : EigenVector<T>::Flatten(const_cast<const DenseTensor&>(scale_data));
  auto scale_arr = scale_e.reshape(C_shape);
  auto bias_e =
      bias_ptr
          ? EigenVector<T>::Flatten(*bias_ptr)
          : EigenVector<T>::Flatten(const_cast<const DenseTensor&>(bias_data));
  auto bias_arr = bias_e.reshape(C_shape);

  dev_ctx.template Alloc<T>(y);
  auto y_e = EigenVector<T>::Flatten(*y);
  auto y_arr = y_e.reshape(shape);

  // (x - mean) * inv_std * scale + bias
  Eigen::DSizes<int, 2> bcast_param(N, sample_size);
  y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) *
                             saved_variance_e.broadcast(bcast) *
                             scale_arr.broadcast(bcast_param) +
                         bias_arr.broadcast(bcast_param);
}

}  // namespace phi

PD_REGISTER_KERNEL(
    instance_norm, CPU, ALL_LAYOUT, phi::InstanceNormKernel, float, double) {}
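Reader's note (not part of the commit): a minimal standalone C++ sketch of the same forward computation, handy for sanity-checking the Eigen expression above. Every name in it (InstanceNormReference and its parameters) is illustrative only; it assumes a densely packed NCHW (or NCDHW) float buffer with sample_size spatial elements per (n, c) slice.

#include <cmath>
#include <cstddef>
#include <vector>

// Plain reference of y = (x - mean) * inv_std * scale + bias, per (n, c).
void InstanceNormReference(const std::vector<float>& x,      // N*C*sample_size
                           const std::vector<float>& scale,  // C values (gamma)
                           const std::vector<float>& bias,   // C values (beta)
                           int N, int C, int sample_size, float epsilon,
                           std::vector<float>* y) {
  y->resize(x.size());
  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      const std::size_t base =
          (static_cast<std::size_t>(n) * C + c) * sample_size;
      // Mean over the spatial positions of this (n, c) slice.
      float mean = 0.f;
      for (int i = 0; i < sample_size; ++i) mean += x[base + i];
      mean /= sample_size;
      // Biased variance plus epsilon, then the inverse standard deviation.
      float var = 0.f;
      for (int i = 0; i < sample_size; ++i) {
        const float d = x[base + i] - mean;
        var += d * d;
      }
      const float inv_std = 1.f / std::sqrt(var / sample_size + epsilon);
      // Normalize, then apply the per-channel affine transform.
      for (int i = 0; i < sample_size; ++i) {
        (*y)[base + i] = (x[base + i] - mean) * inv_std * scale[c] + bias[c];
      }
    }
  }
}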
paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu
0 → 100644

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/instance_norm_grad_kernel.h"

#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/gpu/instance_norm_utils.h"

namespace phi {

template <typename T, int BlockDim>
static __global__ void GradComputeDX(const T* dy,
                                     const BatchNormParamType<T>* scale,
                                     const BatchNormParamType<T>* mean,
                                     const T* x,
                                     const BatchNormParamType<T>* variance,
                                     const int C,
                                     const int sample_size,
                                     T* dx) {
  int beg_idx = blockIdx.x * sample_size + threadIdx.x;
  int end_idx = (blockIdx.x + 1) * sample_size;
  int ncid = blockIdx.x;
  int c = ncid % C;

  BatchNormParamType<T> mean_val = mean[ncid];
  BatchNormParamType<T> inv_var_val = variance[ncid];

  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
  __shared__ typename BlockReduce::TempStorage dy_storage;
  __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage;
  __shared__ BatchNormParamType<T> dy_sum_val;
  __shared__ BatchNormParamType<T> dy_x_sub_mean_sum_val;

  BatchNormParamType<T> dy_sum = static_cast<BatchNormParamType<T>>(0);
  BatchNormParamType<T> dy_x_sub_mean_sum =
      static_cast<BatchNormParamType<T>>(0);

  for (int i = beg_idx; i < end_idx; i += BlockDim) {
    BatchNormParamType<T> dy_i = static_cast<BatchNormParamType<T>>(dy[i]);
    dy_sum += dy_i;
    dy_x_sub_mean_sum +=
        dy_i * (static_cast<BatchNormParamType<T>>(x[i]) - mean_val);
  }
  dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
  dy_x_sub_mean_sum =
      BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum());

  if (threadIdx.x == 0) {
    dy_sum_val = dy_sum;
    dy_x_sub_mean_sum_val = dy_x_sub_mean_sum;
  }
  __syncthreads();

  for (int i = beg_idx; i < end_idx; i += BlockDim) {
    dx[i] =
        (static_cast<BatchNormParamType<T>>(dy[i]) -
         dy_sum_val / static_cast<BatchNormParamType<T>>(sample_size) -
         (static_cast<BatchNormParamType<T>>(x[i]) - mean_val) *
             dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) *
        scale[c] * inv_var_val;
  }
}

template <typename T, typename Context>
void InstanceNormGradKernel(const Context& dev_ctx,
                            const DenseTensor& x,
                            const DenseTensor& d_y,
                            paddle::optional<const DenseTensor&> scale,
                            const DenseTensor& saved_mean,
                            const DenseTensor& saved_variance,
                            float epsilon_f,
                            DenseTensor* d_x,
                            DenseTensor* d_scale,
                            DenseTensor* d_bias) {
  double epsilon = static_cast<double>(epsilon_f);
  const auto* scale_ptr = scale.get_ptr();

  const auto& x_dims = x.dims();

  int N, C, H, W, D;
  paddle::operators::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W,
                                  &D);
  int NxC = N * C;

  DenseTensor x_tmp, d_y_tmp;
  x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D});
  d_y_tmp.ShareDataWith(d_y).Resize({1, NxC, H, W, D});

  dev_ctx.template Alloc<T>(d_x);
  if (d_scale && d_bias) {
    dev_ctx.template Alloc<T>(d_scale);
    dev_ctx.template Alloc<T>(d_bias);
  }
  if (scale_ptr) {
    PADDLE_ENFORCE_EQ(
        scale_ptr->dims().size(), 1UL,
        phi::errors::InvalidArgument(
            "The `shape` in InstanceNormOp is invalid: "
            "the size of scale's dimensions must be equal to 1. But "
            "received: the size of scale's dimensions"
            "is [%d]",
            scale_ptr->dims().size()));
    PADDLE_ENFORCE_EQ(scale_ptr->dims()[0], C,
                      phi::errors::InvalidArgument(
                          "The `shape` in InstanceNormOp is invalid: "
                          "the first dimension of scale must be equal to "
                          "Channels([%d]). But received: "
                          "the first dimension of scale is [%d],"
                          "the dimensions of scale is [%s], ",
                          C, scale_ptr->dims()[0], scale_ptr->dims()));
  }

  phi::funcs::SetConstant<GPUContext, T> set_constant;

  const int n = x.numel();
  const int block = 512;
  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
  const int max_blocks = std::max(max_threads / block, 1);
  const int grid = std::min(NxC, max_blocks);
  const int grid1 = (C + block - 1) / block;

  DenseTensor scale_tmp;
  scale_tmp.Resize({NxC});
  dev_ctx.template Alloc<T>(&scale_tmp);

  DenseTensor d_scale_tmp;
  d_scale_tmp.Resize({NxC});
  dev_ctx.template Alloc<T>(&d_scale_tmp);

  DenseTensor d_bias_tmp;
  d_bias_tmp.Resize({NxC});
  dev_ctx.template Alloc<T>(&d_bias_tmp);

  if (scale_ptr) {
    repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
        scale_ptr->data<T>(), scale_tmp.data<T>(), N, C);
  } else {
    set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
  }

  std::vector<int> dims;
  std::vector<int> strides;
  dims = {1, NxC, H, W, D};
  strides = {NxC * H * W * D, H * W * D, W * D, D, 1};

  if ((H * W * D) == 1) {
    phi::Copy(dev_ctx, d_y, dev_ctx.GetPlace(), false, d_x);
    phi::funcs::SetConstant<GPUContext, BatchNormParamType<T>> functor;
    functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
    functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
    return;
  }

#ifdef PADDLE_WITH_HIP
  miopenTensorDescriptor_t data_desc_;
  miopenTensorDescriptor_t in_param_desc_;

  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else
  cudnnTensorDescriptor_t data_desc_;
  cudnnTensorDescriptor_t in_param_desc_;

  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif
  if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
    LOG(ERROR) << "Provided epsilon is smaller than "
               << "CUDNN_BN_MIN_EPSILON. Setting it to "
               << "CUDNN_BN_MIN_EPSILON instead.";
  }
  epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);

#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenSetTensorDescriptor(
          data_desc_, CudnnDataType<T>::type,
          x_dims.size() > 3 ? x_dims.size() : 4,
          const_cast<int*>(dims.data()), const_cast<int*>(strides.data())));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenDeriveBNTensorDescriptor(
          in_param_desc_, data_desc_, miopenBNSpatial));
#else
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnSetTensorNdDescriptor(
          data_desc_, CudnnDataType<T>::type,
          x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
          in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif

  const auto* saved_mean_data =
      saved_mean.template data<BatchNormParamType<T>>();
  const auto* saved_var_data =
      saved_variance.template data<BatchNormParamType<T>>();
  if (d_scale && d_bias) {
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        paddle::platform::dynload::miopenBatchNormalizationBackward(
            dev_ctx.cudnn_handle(), miopenBNSpatial, CudnnDataType<T>::kOne(),
            CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
            CudnnDataType<T>::kZero(), data_desc_, x_tmp.template data<T>(),
            data_desc_, d_y_tmp.template data<T>(), data_desc_,
            d_x->template data<T>(), in_param_desc_,
            scale_tmp.template data<BatchNormParamType<T>>(),
            d_scale_tmp.template data<BatchNormParamType<T>>(),
            d_bias_tmp.template data<BatchNormParamType<T>>(), epsilon,
            saved_mean_data, saved_var_data));
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
        paddle::platform::dynload::cudnnBatchNormalizationBackward(
            dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL,
            CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
            CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(), data_desc_,
            x_tmp.template data<T>(), data_desc_, d_y_tmp.template data<T>(),
            data_desc_, d_x->template data<T>(), in_param_desc_,
            scale_tmp.template data<BatchNormParamType<T>>(),
            d_scale_tmp.template data<BatchNormParamType<T>>(),
            d_bias_tmp.template data<BatchNormParamType<T>>(), epsilon,
            saved_mean_data, saved_var_data));
#endif
  } else {
    if (d_x) {
      GradComputeDX<T, block><<<NxC, block, 0, dev_ctx.stream()>>>(
          d_y.data<T>(), scale_tmp.data<BatchNormParamType<T>>(),
          saved_mean_data, x.data<T>(), saved_var_data, C, H * W * D,
          d_x->data<T>());
    }
  }

  if (d_scale && d_bias) {
    add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
        d_scale_tmp.data<T>(), d_scale->data<T>(), N, C);
    add_param<T, block, false><<<grid1, block, 0, dev_ctx.stream()>>>(
        d_bias_tmp.data<T>(), d_bias->data<T>(), N, C);
  }

#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif
}

}  // namespace phi

#ifdef PADDLE_WITH_HIP
// MIOPEN do not support double
PD_REGISTER_KERNEL(instance_norm_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::InstanceNormGradKernel,
                   float) {}
#else
PD_REGISTER_KERNEL(instance_norm_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::InstanceNormGradKernel,
                   float,
                   double) {}
#endif
paddle/phi/kernels/gpu/instance_norm_kernel.cu
0 → 100644

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/instance_norm_kernel.h"

#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/gpu/instance_norm_utils.h"

namespace phi {

template <typename T, typename Context>
void InstanceNormKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        paddle::optional<const DenseTensor&> scale,
                        paddle::optional<const DenseTensor&> bias,
                        float epsilon_f,
                        DenseTensor* y,
                        DenseTensor* saved_mean,
                        DenseTensor* saved_variance) {
  double epsilon = static_cast<double>(epsilon_f);
  auto& x_dims = x.dims();
  PADDLE_ENFORCE_GE(x_dims.size(), 2,
                    phi::errors::InvalidArgument(
                        "The `shape` in InstanceNormOp is invalid: "
                        "the size of X's dimensions must greater than "
                        "or equal to 2. But received: "
                        "the size of X's dimensions is [%d]",
                        x_dims.size()));
  PADDLE_ENFORCE_LE(x_dims.size(), 5,
                    phi::errors::InvalidArgument(
                        "The `shape` in InstanceNormOp is invalid: "
                        "the size of X's dimensions must smaller than"
                        "or equal to 5. But received: "
                        "the size of X's dimensions is [%d]",
                        x_dims.size()));
  int N, C, H, W, D;
  paddle::operators::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W,
                                  &D);
  int NxC = N * C;
  DenseTensor x_tmp;
  x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D});
  dev_ctx.template Alloc<T>(y);

#ifdef PADDLE_WITH_HIP
  miopenTensorDescriptor_t data_desc_;
  miopenTensorDescriptor_t in_param_desc_;

  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else
  cudnnTensorDescriptor_t data_desc_;
  cudnnTensorDescriptor_t in_param_desc_;

  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif
  if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
    LOG(ERROR) << "Provided epsilon is smaller than "
               << "CUDNN_BN_MIN_EPSILON. Setting it to "
               << "CUDNN_BN_MIN_EPSILON instead.";
  }
  epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);

  VLOG(3) << "Setting descriptors.";
  std::vector<int> dims;
  std::vector<int> strides;
  dims = {1, NxC, H, W, D};
  strides = {NxC * H * W * D, H * W * D, W * D, D, 1};

#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenSetTensorDescriptor(
          data_desc_, CudnnDataType<T>::type,
          x_dims.size() > 3 ? x_dims.size() : 4,
          const_cast<int*>(dims.data()), const_cast<int*>(strides.data())));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenDeriveBNTensorDescriptor(
          in_param_desc_, data_desc_, miopenBNSpatial));
#else
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnSetTensorNdDescriptor(
          data_desc_, CudnnDataType<T>::type,
          x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
          in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif

  const auto scale_ptr = scale.get_ptr();
  const auto bias_ptr = bias.get_ptr();

  DenseTensor scale_tmp;
  scale_tmp.Resize({NxC});
  dev_ctx.template Alloc<T>(&scale_tmp);
  DenseTensor bias_tmp;
  bias_tmp.Resize({NxC});
  dev_ctx.template Alloc<T>(&bias_tmp);

  const int n = x.numel();
  const int block = 512;
  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
  const int max_blocks = std::max(max_threads / block, 1);
  const int grid = std::min((NxC + block - 1) / block, max_blocks);

  phi::funcs::SetConstant<GPUContext, T> set_constant;
  if (scale_ptr) {
    repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
        scale_ptr->data<T>(), scale_tmp.data<T>(), N, C);
  } else {
    set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
  }
  if (bias_ptr) {
    repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
        bias_ptr->data<T>(), bias_tmp.data<T>(), N, C);
  } else {
    set_constant(dev_ctx, &bias_tmp, static_cast<T>(0));
  }

  auto handle = dev_ctx.cudnn_handle();

  phi::funcs::SetConstant<GPUContext, BatchNormParamType<T>> functor;
  dev_ctx.template Alloc<BatchNormParamType<T>>(saved_mean);
  dev_ctx.template Alloc<BatchNormParamType<T>>(saved_variance);
  functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
  functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));

#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenBatchNormalizationForwardTraining(
          handle, miopenBNSpatial,
          const_cast<void*>(
              static_cast<const void*>(CudnnDataType<T>::kOne())),
          const_cast<void*>(
              static_cast<const void*>(CudnnDataType<T>::kZero())),
          data_desc_, static_cast<const void*>(x_tmp.template data<T>()),
          data_desc_, static_cast<void*>(y->template data<T>()),
          in_param_desc_,
          const_cast<void*>(static_cast<const void*>(
              scale_tmp.template data<BatchNormParamType<T>>())),
          const_cast<void*>(static_cast<const void*>(
              bias_tmp.template data<BatchNormParamType<T>>())),
          0, nullptr, nullptr, epsilon,
          static_cast<void*>(
              saved_mean->template data<BatchNormParamType<T>>()),
          static_cast<void*>(
              saved_variance->template data<BatchNormParamType<T>>())));

  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnBatchNormalizationForwardTraining(
          handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
          CudnnDataType<T>::kZero(), data_desc_, x_tmp.template data<T>(),
          data_desc_, y->template data<T>(), in_param_desc_,
          scale_tmp.template data<BatchNormParamType<T>>(),
          bias_tmp.template data<BatchNormParamType<T>>(), 0, nullptr, nullptr,
          epsilon, saved_mean->template data<BatchNormParamType<T>>(),
          saved_variance->template data<BatchNormParamType<T>>()));

  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
  PADDLE_ENFORCE_GPU_SUCCESS(
      paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif
}

}  // namespace phi

#ifdef PADDLE_WITH_HIP
// MIOPEN do not support double
PD_REGISTER_KERNEL(
    instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float) {}
#else
PD_REGISTER_KERNEL(
    instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float, double) {}
#endif
paddle/phi/kernels/gpu/instance_norm_utils.h
0 → 100644

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <cfloat>
#include <string>
#include <vector>
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif

#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"

namespace phi {

template <typename T>
using CudnnDataType = paddle::platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;

template <typename T>
static __global__ void repeat_param(const T *input,
                                    T *output,
                                    const int repeat_num,
                                    const int C) {
  CUDA_KERNEL_LOOP(i, repeat_num * C) {
    int index = i % C;
    output[i] = input[index];
  }
}

template <typename T, int BlockDim, bool AVG>
static __global__ void add_param(const T *input,
                                 T *output,
                                 const int repeat_num,
                                 const int C) {
  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
  __shared__ typename BlockReduce::TempStorage ou_storage;
  for (int i = blockIdx.x; i < C; i += gridDim.x) {
    T ou = static_cast<T>(0);
    for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) {
      const int index = j * C + i;
      ou += static_cast<T>(input[index]);
    }
    ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum());
    if (threadIdx.x == 0) {
      output[i] = ou;
    }
    __syncthreads();

    if (AVG) {
      output[i] /= repeat_num;
    }
  }
}
}  // namespace phi
paddle/phi/kernels/instance_norm_grad_kernel.h
0 → 100644

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/phi/core/dense_tensor.h"

namespace phi {

template <typename T, typename Context>
void InstanceNormGradKernel(const Context& dev_ctx,
                            const DenseTensor& x,
                            const DenseTensor& y_grad,
                            paddle::optional<const DenseTensor&> scale,
                            const DenseTensor& saved_mean,
                            const DenseTensor& saved_variance,
                            float epsilon,
                            DenseTensor* x_grad,
                            DenseTensor* scale_grad,
                            DenseTensor* bias_grad);
}  // namespace phi
paddle/phi/kernels/instance_norm_kernel.h
0 → 100644

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/phi/core/dense_tensor.h"

namespace phi {

template <typename T, typename Context>
void InstanceNormKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        paddle::optional<const DenseTensor&> scale,
                        paddle::optional<const DenseTensor&> bias,
                        float epsilon,
                        DenseTensor* y,
                        DenseTensor* saved_mean,
                        DenseTensor* saved_variance);
}  // namespace phi
paddle/phi/ops/compat/instance_norm_sig.cc
0 → 100644

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/core/compat/op_utils.h"

namespace phi {

KernelSignature InstanceNormOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  return KernelSignature("instance_norm",
                         {"X", "Scale", "Bias"},
                         {"epsilon"},
                         {"Y", "SavedMean", "SavedVariance"});
}

KernelSignature InstanceNormGradOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  return KernelSignature("instance_norm_grad",
                         {"X", "Y@GRAD", "Scale", "SavedMean", "SavedVariance"},
                         {"epsilon"},
                         {"X@GRAD", "Scale@GRAD", "Bias@GRAD"});
}

}  // namespace phi

PD_REGISTER_ARG_MAPPING_FN(instance_norm, phi::InstanceNormOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(instance_norm_grad,
                           phi::InstanceNormGradOpArgumentMapping);