Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
34fd65cf
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
34fd65cf
编写于
11月 25, 2022
作者:
W
Wang Bojun
提交者:
GitHub
11月 25, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Group norm fp16 support (#48222)
* group norm fp16 support
上级
9a227ee7
变更
11
隐藏空白更改
内联
并排
Showing
11 changed file
with
637 addition
and
352 deletion
+637
-352
paddle/fluid/inference/tensorrt/convert/group_norm_op.cc
paddle/fluid/inference/tensorrt/convert/group_norm_op.cc
+7
-3
paddle/fluid/inference/tensorrt/op_teller.cc
paddle/fluid/inference/tensorrt/op_teller.cc
+0
-9
paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu
...e/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu
+163
-78
paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.h
...le/fluid/inference/tensorrt/plugin/group_norm_op_plugin.h
+73
-33
paddle/phi/kernels/gpu/group_norm_grad_kernel.cu
paddle/phi/kernels/gpu/group_norm_grad_kernel.cu
+130
-109
paddle/phi/kernels/gpu/group_norm_kernel.cu
paddle/phi/kernels/gpu/group_norm_kernel.cu
+94
-74
paddle/phi/kernels/gpu/group_norm_utils.h
paddle/phi/kernels/gpu/group_norm_utils.h
+39
-25
paddle/phi/kernels/group_norm_kernel.h
paddle/phi/kernels/group_norm_kernel.h
+4
-5
python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py
...sts/unittests/ir/inference/test_trt_convert_group_norm.py
+15
-16
python/paddle/fluid/tests/unittests/test_group_norm_op.py
python/paddle/fluid/tests/unittests/test_group_norm_op.py
+60
-0
python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
+52
-0
未找到文件。
paddle/fluid/inference/tensorrt/convert/group_norm_op.cc
浏览文件 @
34fd65cf
...
...
@@ -34,7 +34,7 @@ class GroupNormOpConverter : public OpConverter {
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
VLOG
(
3
)
<<
"convert a fluid group_norm op
"
;
VLOG
(
4
)
<<
"convert a fluid group_norm op to tensorrt group_norm plugin
"
;
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
...
...
@@ -61,6 +61,8 @@ class GroupNormOpConverter : public OpConverter {
framework
::
DDim
bias_dims
;
auto
scale_weights
=
GetWeight
(
scale_name
,
&
scale_dims
);
auto
bias_weights
=
GetWeight
(
bias_name
,
&
bias_dims
);
bool
with_fp16
=
engine_
->
WithFp16
()
&&
!
engine_
->
disable_trt_plugin_fp16
();
if
(
engine_
->
with_dynamic_shape
())
{
int
gn_num
=
groups
;
std
::
vector
<
int64_t
>
mean_shape
({
gn_num
});
...
...
@@ -74,7 +76,8 @@ class GroupNormOpConverter : public OpConverter {
epsilon
,
groups
,
mean_shape
,
variance_shape
);
variance_shape
,
with_fp16
);
nvinfer1
::
ILayer
*
groupnorm_layer
=
engine_
->
AddDynamicPlugin
(
&
input_itensor
,
1
,
plugin
);
auto
output_name
=
op_desc
.
Output
(
"Y"
)[
0
];
...
...
@@ -92,7 +95,8 @@ class GroupNormOpConverter : public OpConverter {
epsilon
,
groups
,
mean_shape
,
variance_shape
);
variance_shape
,
with_fp16
);
nvinfer1
::
ILayer
*
groupnorm_layer
=
engine_
->
AddPlugin
(
&
input_itensor
,
1
,
plugin
);
auto
output_name
=
op_desc
.
Output
(
"Y"
)[
0
];
...
...
paddle/fluid/inference/tensorrt/op_teller.cc
浏览文件 @
34fd65cf
...
...
@@ -415,15 +415,6 @@ struct SimpleOpTypeSetTeller : public Teller {
<<
layout_str
;
return
false
;
}
auto
*
block
=
desc
.
Block
();
if
(
block
==
nullptr
)
return
false
;
auto
x_var_name
=
desc
.
Input
(
"X"
)[
0
];
auto
*
x_var_desc
=
block
->
FindVar
(
x_var_name
);
auto
dtype
=
x_var_desc
->
GetDataType
();
if
(
dtype
!=
5
)
{
VLOG
(
3
)
<<
"Group norm trt plugin only support float32"
;
return
false
;
}
}
if
(
op_type
==
"concat"
)
{
if
(
!
desc
.
HasAttr
(
"axis"
))
{
...
...
paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.cu
浏览文件 @
34fd65cf
...
...
@@ -25,7 +25,53 @@ namespace tensorrt {
namespace
plugin
{
using
DataLayout
=
phi
::
DataLayout
;
int
GroupNormPlugin
::
initialize
()
TRT_NOEXCEPT
{
return
0
;
}
int
GroupNormPlugin
::
initialize
()
TRT_NOEXCEPT
{
if
(
!
with_fp16_
)
{
// if use fp32
cudaMalloc
(
&
scale_gpu_
,
sizeof
(
float
)
*
scale_
.
size
());
cudaMalloc
(
&
bias_gpu_
,
sizeof
(
float
)
*
bias_
.
size
());
cudaMemcpy
(
scale_gpu_
,
scale_
.
data
(),
scale_
.
size
()
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
cudaMemcpy
(
bias_gpu_
,
bias_
.
data
(),
bias_
.
size
()
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
}
else
{
// if use fp16
std
::
vector
<
half
>
scale_half
(
scale_
.
size
());
std
::
vector
<
half
>
bias_half
(
bias_
.
size
());
for
(
int
i
=
0
;
i
<
scale_
.
size
();
++
i
)
{
scale_half
[
i
]
=
static_cast
<
half
>
(
scale_
[
i
]);
}
for
(
int
i
=
0
;
i
<
bias_
.
size
();
++
i
)
{
bias_half
[
i
]
=
static_cast
<
half
>
(
bias_
[
i
]);
}
cudaMalloc
(
&
scale_gpu_
,
sizeof
(
half
)
*
scale_half
.
size
());
cudaMalloc
(
&
bias_gpu_
,
sizeof
(
half
)
*
bias_half
.
size
());
cudaMemcpy
(
scale_gpu_
,
scale_half
.
data
(),
scale_half
.
size
()
*
sizeof
(
half
),
cudaMemcpyHostToDevice
);
cudaMemcpy
(
bias_gpu_
,
bias_half
.
data
(),
bias_half
.
size
()
*
sizeof
(
half
),
cudaMemcpyHostToDevice
);
}
return
0
;
}
bool
GroupNormPlugin
::
supportsFormat
(
nvinfer1
::
DataType
type
,
nvinfer1
::
PluginFormat
format
)
const
TRT_NOEXCEPT
{
if
(
with_fp16_
)
{
return
((
type
==
nvinfer1
::
DataType
::
kHALF
)
&&
(
format
==
nvinfer1
::
PluginFormat
::
kLINEAR
));
}
else
{
return
((
type
==
nvinfer1
::
DataType
::
kFLOAT
)
&&
(
format
==
nvinfer1
::
PluginFormat
::
kLINEAR
));
}
}
nvinfer1
::
Dims
GroupNormPlugin
::
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
inputDims
,
int
nbInputs
)
TRT_NOEXCEPT
{
...
...
@@ -70,48 +116,48 @@ int GroupNormPlugin::enqueue(int batch_size,
"but got channel number:%d, bias's size:%d."
,
C
,
bias_
.
size
()));
int
device_id
;
cudaGetDevice
(
&
device_id
)
;
const
float
*
input
=
static_cast
<
const
float
*>
(
inputs
[
0
]
);
float
*
output
=
static_cast
<
float
*>
(
outputs
[
0
]);
scale_t
.
Resize
(
phi
::
make_ddim
({
C
})
);
bias_t
.
Resize
(
phi
::
make_ddim
({
C
})
);
mean_t
.
Resize
(
phi
::
make_ddim
(
mean_shape_
));
variance_t
.
Resize
(
phi
::
make_ddim
(
variance_shape_
));
float
*
scale_d
=
scale_t
.
mutable_data
<
float
>
(
platform
::
CUDAPlace
(
device_id
));
float
*
bias_d
=
bias_t
.
mutable_data
<
float
>
(
platform
::
CUDAPlace
(
device_id
));
float
*
mean_d
=
mean_t
.
mutable_data
<
float
>
(
platform
::
CUDAPlace
(
device_id
));
float
*
variance_d
=
variance_t
.
mutable_data
<
float
>
(
platform
::
CUDAPlace
(
device_id
));
phi
::
DenseTensor
temp_variance_t
;
temp_variance_t
.
Resize
(
phi
::
make_ddim
(
variance_shape_
));
float
*
temp_variance_d
=
temp_variance_t
.
mutable_data
<
float
>
(
platform
::
CUDAPlace
(
device_id
)
);
cudaMemcpyAsync
(
scale_d
,
scale_
.
data
(),
sizeof
(
float
)
*
C
,
cudaMemcpyHostToDevice
,
stream
)
;
cudaMemcpyAsync
(
bias_d
,
bias_
.
data
(),
sizeof
(
float
)
*
C
,
cudaMemcpyHostToDevice
,
stream
);
phi
::
GroupNormDirectCUDAFunctor
<
float
>
group_norm
;
group_norm
(
stream
,
input
,
input_shape
,
bias_d
,
scale_d
,
mean_d
,
temp_variance
_d
,
groups_
,
eps_
,
output
,
mean_d
,
variance_d
,
DataLayout
::
kNCHW
);
float
*
mean_d
=
static_cast
<
float
*>
(
workspace
);
float
*
variance_d
=
mean_d
+
input_shape
[
0
]
*
groups_
;
float
*
temp_variance_d
=
variance_d
+
input_shape
[
0
]
*
groups_
;
auto
input_type
=
getDataType
(
);
if
(
input_type
==
nvinfer1
::
DataType
::
kFLOAT
)
{
VLOG
(
1
)
<<
"TRT Plugin DataType selected. GroupNorm-->fp32"
;
const
float
*
input
=
static_cast
<
const
float
*>
(
inputs
[
0
]
);
float
*
output
=
static_cast
<
float
*>
(
outputs
[
0
]
);
phi
::
GroupNormDirectCUDAFunctor
<
float
>
group_norm
;
group_norm
(
stream
,
input
,
input_shape
,
reinterpret_cast
<
float
*>
(
bias_gpu_
),
reinterpret_cast
<
float
*>
(
scale_gpu_
),
temp_variance_d
,
groups_
,
eps_
,
output
,
mean_d
,
variance_d
,
DataLayout
::
kNCHW
);
}
else
if
(
input_type
==
nvinfer1
::
DataType
::
kHALF
)
{
VLOG
(
1
)
<<
"TRT Plugin DataType selected. GroupNorm-->fp16"
;
const
half
*
input
=
static_cast
<
const
half
*>
(
inputs
[
0
]);
half
*
output
=
static_cast
<
half
*>
(
outputs
[
0
]);
phi
::
GroupNormDirectCUDAFunctor
<
half
,
float
>
group_norm
;
group_norm
(
stream
,
input
,
input_shape
,
reinterpret_cast
<
const
half
*>
(
bias_gpu_
)
,
reinterpret_cast
<
const
half
*>
(
scale_gpu_
)
,
temp_variance_d
,
groups_
,
eps_
,
output
,
mean
_d
,
variance_d
,
DataLayout
::
kNCHW
);
}
else
{
PADDLE_THROW
(
platform
::
errors
::
Fatal
(
"The GroupNorm TRT Plugin's input type should be float or half."
));
}
return
cudaGetLastError
()
!=
cudaSuccess
;
}
nvinfer1
::
DimsExprs
GroupNormPluginDynamic
::
getOutputDimensions
(
...
...
@@ -140,8 +186,13 @@ bool GroupNormPluginDynamic::supportsFormatCombination(
nb_inputs
+
nb_outputs
));
const
nvinfer1
::
PluginTensorDesc
&
in
=
in_out
[
pos
];
if
(
pos
==
0
)
{
return
(
in
.
type
==
nvinfer1
::
DataType
::
kFLOAT
)
&&
(
in
.
format
==
nvinfer1
::
TensorFormat
::
kLINEAR
);
if
(
with_fp16_
)
{
return
((
in
.
type
==
nvinfer1
::
DataType
::
kHALF
)
&&
(
in
.
format
==
nvinfer1
::
PluginFormat
::
kLINEAR
));
}
else
{
return
(
in
.
type
==
nvinfer1
::
DataType
::
kFLOAT
)
&&
(
in
.
format
==
nvinfer1
::
TensorFormat
::
kLINEAR
);
}
}
const
nvinfer1
::
PluginTensorDesc
&
prev
=
in_out
[
pos
-
1
];
// output
...
...
@@ -158,8 +209,50 @@ nvinfer1::DataType GroupNormPluginDynamic::getOutputDataType(
"The groupnorm Plugin only has one input, so the "
"index value should be 0, but get %d."
,
index
));
PADDLE_ENFORCE_EQ
((
input_types
[
0
]
==
nvinfer1
::
DataType
::
kFLOAT
||
input_types
[
0
]
==
nvinfer1
::
DataType
::
kHALF
),
true
,
platform
::
errors
::
InvalidArgument
(
"The input type should be half or float"
));
return
input_types
[
0
];
}
int
GroupNormPluginDynamic
::
initialize
()
TRT_NOEXCEPT
{
if
(
with_fp16_
==
false
)
{
// if use fp32
cudaMalloc
(
&
scale_gpu_
,
sizeof
(
float
)
*
scale_
.
size
());
cudaMalloc
(
&
bias_gpu_
,
sizeof
(
float
)
*
bias_
.
size
());
cudaMemcpy
(
scale_gpu_
,
scale_
.
data
(),
scale_
.
size
()
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
cudaMemcpy
(
bias_gpu_
,
bias_
.
data
(),
bias_
.
size
()
*
sizeof
(
float
),
cudaMemcpyHostToDevice
);
}
else
{
// if use fp16
std
::
vector
<
half
>
scale_half
(
scale_
.
size
());
std
::
vector
<
half
>
bias_half
(
bias_
.
size
());
for
(
int
i
=
0
;
i
<
scale_
.
size
();
++
i
)
{
scale_half
[
i
]
=
static_cast
<
half
>
(
scale_
[
i
]);
}
for
(
int
i
=
0
;
i
<
bias_
.
size
();
++
i
)
{
bias_half
[
i
]
=
static_cast
<
half
>
(
bias_
[
i
]);
}
cudaMalloc
(
&
scale_gpu_
,
sizeof
(
half
)
*
scale_
.
size
());
cudaMalloc
(
&
bias_gpu_
,
sizeof
(
half
)
*
bias_
.
size
());
cudaMemcpy
(
scale_gpu_
,
scale_half
.
data
(),
scale_half
.
size
()
*
sizeof
(
half
),
cudaMemcpyHostToDevice
);
cudaMemcpy
(
bias_gpu_
,
bias_half
.
data
(),
bias_half
.
size
()
*
sizeof
(
half
),
cudaMemcpyHostToDevice
);
}
return
0
;
}
int
GroupNormPluginDynamic
::
enqueue
(
const
nvinfer1
::
PluginTensorDesc
*
input_desc
,
...
...
@@ -202,46 +295,38 @@ int GroupNormPluginDynamic::enqueue(
C
,
bias_
.
size
()));
int
device_id
;
cudaGetDevice
(
&
device_id
);
float
*
mean_d
=
static_cast
<
float
*>
(
workspace
);
float
*
variance_d
=
mean_d
+
input_shape
[
0
]
*
groups_
;
float
*
temp_variance_d
=
variance_d
+
input_shape
[
0
]
*
groups_
;
auto
input_type
=
input_desc
[
0
].
type
;
if
(
input_type
==
nvinfer1
::
DataType
::
kFLOAT
)
{
const
float
*
input
=
static_cast
<
const
float
*>
(
inputs
[
0
]);
VLOG
(
1
)
<<
"TRT Plugin DataType selected. GroupNorm-->fp32"
;
const
float
*
input
=
reinterpret_cast
<
const
float
*>
(
inputs
[
0
]);
float
*
output
=
static_cast
<
float
*>
(
outputs
[
0
]);
scale_t
.
Resize
(
phi
::
make_ddim
({
C
}));
bias_t
.
Resize
(
phi
::
make_ddim
({
C
}));
mean_t
.
Resize
(
phi
::
make_ddim
(
batched_mean_shape
));
variance_t
.
Resize
(
phi
::
make_ddim
(
batched_variance_shape
));
float
*
scale_d
=
scale_t
.
mutable_data
<
float
>
(
platform
::
CUDAPlace
(
device_id
));
float
*
bias_d
=
bias_t
.
mutable_data
<
float
>
(
platform
::
CUDAPlace
(
device_id
));
float
*
mean_d
=
mean_t
.
mutable_data
<
float
>
(
platform
::
CUDAPlace
(
device_id
));
float
*
variance_d
=
variance_t
.
mutable_data
<
float
>
(
platform
::
CUDAPlace
(
device_id
));
phi
::
DenseTensor
temp_variance_t
;
temp_variance_t
.
Resize
(
phi
::
make_ddim
(
batched_variance_shape
));
float
*
temp_variance_d
=
temp_variance_t
.
mutable_data
<
float
>
(
platform
::
CUDAPlace
(
device_id
));
cudaMemcpyAsync
(
scale_d
,
scale_
.
data
(),
sizeof
(
float
)
*
C
,
cudaMemcpyHostToDevice
,
stream
);
cudaMemcpyAsync
(
bias_d
,
bias_
.
data
(),
sizeof
(
float
)
*
C
,
cudaMemcpyHostToDevice
,
stream
);
phi
::
GroupNormDirectCUDAFunctor
<
float
>
group_norm
;
phi
::
GroupNormDirectCUDAFunctor
<
float
,
float
>
group_norm
;
group_norm
(
stream
,
input
,
input_shape
,
bias_d
,
scale_d
,
reinterpret_cast
<
float
*>
(
bias_gpu_
),
reinterpret_cast
<
float
*>
(
scale_gpu_
),
temp_variance_d
,
groups
,
eps
,
output
,
mean_d
,
variance_d
,
DataLayout
::
kNCHW
);
}
else
if
(
input_type
==
nvinfer1
::
DataType
::
kHALF
)
{
VLOG
(
1
)
<<
"TRT Plugin DataType selected. GroupNorm-->fp16"
;
const
half
*
input
=
reinterpret_cast
<
const
half
*>
(
inputs
[
0
]);
half
*
output
=
static_cast
<
half
*>
(
outputs
[
0
]);
phi
::
GroupNormDirectCUDAFunctor
<
half
,
float
>
group_norm
;
group_norm
(
stream
,
input
,
input_shape
,
reinterpret_cast
<
half
*>
(
bias_gpu_
),
reinterpret_cast
<
half
*>
(
scale_gpu_
),
temp_variance_d
,
groups
,
eps
,
...
...
paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.h
浏览文件 @
34fd65cf
...
...
@@ -32,7 +32,7 @@ class GroupNormPlugin : public PluginTensorRT {
return
getBaseSerializationSize
()
+
SerializedSize
(
scale_
)
+
SerializedSize
(
bias_
)
+
SerializedSize
(
eps_
)
+
SerializedSize
(
groups_
)
+
SerializedSize
(
mean_shape_
)
+
SerializedSize
(
variance_shape_
);
SerializedSize
(
variance_shape_
)
+
SerializedSize
(
with_fp16_
)
;
}
void
serialize
(
void
*
buffer
)
const
TRT_NOEXCEPT
override
{
serializeBase
(
buffer
);
...
...
@@ -42,6 +42,7 @@ class GroupNormPlugin : public PluginTensorRT {
SerializeValue
(
&
buffer
,
groups_
);
SerializeValue
(
&
buffer
,
mean_shape_
);
SerializeValue
(
&
buffer
,
variance_shape_
);
SerializeValue
(
&
buffer
,
with_fp16_
);
}
GroupNormPlugin
(
const
float
*
scale
,
...
...
@@ -51,11 +52,13 @@ class GroupNormPlugin : public PluginTensorRT {
float
eps
,
int
groups
,
std
::
vector
<
int64_t
>
mean_shape
,
std
::
vector
<
int64_t
>
variance_shape
)
std
::
vector
<
int64_t
>
variance_shape
,
bool
with_fp16
)
:
groups_
(
groups
),
eps_
(
eps
),
mean_shape_
(
mean_shape
),
variance_shape_
(
variance_shape
)
{
variance_shape_
(
variance_shape
),
with_fp16_
(
with_fp16
)
{
scale_
.
resize
(
scale_num
);
bias_
.
resize
(
bias_num
);
std
::
copy
(
scale
,
scale
+
scale_num
,
scale_
.
data
());
...
...
@@ -69,22 +72,33 @@ class GroupNormPlugin : public PluginTensorRT {
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
groups_
);
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
mean_shape_
);
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
variance_shape_
);
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
with_fp16_
);
}
~
GroupNormPlugin
()
{}
int
initialize
()
TRT_NOEXCEPT
override
;
GroupNormPlugin
*
clone
()
const
TRT_NOEXCEPT
override
{
return
new
GroupNormPlugin
(
scale_
.
data
(),
scale_
.
size
(),
bias_
.
data
(),
bias_
.
size
(),
eps_
,
groups_
,
mean_shape_
,
variance_shape_
);
auto
*
ptr
=
new
GroupNormPlugin
(
scale_
.
data
(),
scale_
.
size
(),
bias_
.
data
(),
bias_
.
size
(),
eps_
,
groups_
,
mean_shape_
,
variance_shape_
,
with_fp16_
);
ptr
->
scale_gpu_
=
scale_gpu_
;
ptr
->
bias_gpu_
=
bias_gpu_
;
return
ptr
;
}
const
char
*
getPluginType
()
const
TRT_NOEXCEPT
override
{
return
"groupnorm_plugin"
;
}
size_t
getWorkspaceSize
(
int
max_batch_size
)
const
TRT_NOEXCEPT
{
return
3
*
max_batch_size
*
groups_
;
}
bool
supportsFormat
(
nvinfer1
::
DataType
type
,
nvinfer1
::
PluginFormat
format
)
const
TRT_NOEXCEPT
override
;
int
getNbOutputs
()
const
TRT_NOEXCEPT
override
{
return
1
;
}
nvinfer1
::
Dims
getOutputDimensions
(
int
index
,
const
nvinfer1
::
Dims
*
inputs
,
...
...
@@ -101,18 +115,27 @@ class GroupNormPlugin : public PluginTensorRT {
#endif
void
*
workspace
,
cudaStream_t
stream
)
TRT_NOEXCEPT
override
;
void
terminate
()
TRT_NOEXCEPT
override
{
if
(
bias_gpu_
)
{
cudaFree
(
bias_gpu_
);
bias_gpu_
=
nullptr
;
}
if
(
scale_gpu_
)
{
cudaFree
(
scale_gpu_
);
scale_gpu_
=
nullptr
;
}
};
private:
std
::
vector
<
float
>
scale_
;
std
::
vector
<
float
>
bias_
;
phi
::
DenseTensor
scale_t
;
phi
::
DenseTensor
bias_t
;
phi
::
DenseTensor
mean_t
;
phi
::
DenseTensor
variance_t
;
void
*
scale_gpu_
;
void
*
bias_gpu_
;
int
groups_
;
float
eps_
;
std
::
vector
<
int64_t
>
mean_shape_
;
std
::
vector
<
int64_t
>
variance_shape_
;
bool
with_fp16_
;
};
class
GroupNormPluginCreator
:
public
TensorRTPluginCreator
{
public:
...
...
@@ -138,11 +161,13 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT {
float
eps
,
int
groups
,
std
::
vector
<
int64_t
>
mean_shape
,
std
::
vector
<
int64_t
>
variance_shape
)
std
::
vector
<
int64_t
>
variance_shape
,
bool
with_fp16
)
:
groups_
(
groups
),
eps_
(
eps
),
mean_shape_
(
mean_shape
),
variance_shape_
(
variance_shape
)
{
variance_shape_
(
variance_shape
),
with_fp16_
(
with_fp16
)
{
scale_
.
resize
(
scale_num
);
bias_
.
resize
(
bias_num
);
std
::
copy
(
scale
,
scale
+
scale_num
,
scale_
.
data
());
...
...
@@ -156,28 +181,34 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT {
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
groups_
);
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
mean_shape_
);
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
variance_shape_
);
DeserializeValue
(
&
serialData
,
&
serialLength
,
&
with_fp16_
);
}
nvinfer1
::
IPluginV2DynamicExt
*
clone
()
const
TRT_NOEXCEPT
override
{
return
new
GroupNormPluginDynamic
(
scale_
.
data
(),
scale_
.
size
(),
bias_
.
data
(),
bias_
.
size
(),
eps_
,
groups_
,
mean_shape_
,
variance_shape_
);
auto
*
ptr
=
new
GroupNormPluginDynamic
(
scale_
.
data
(),
scale_
.
size
(),
bias_
.
data
(),
bias_
.
size
(),
eps_
,
groups_
,
mean_shape_
,
variance_shape_
,
with_fp16_
);
ptr
->
scale_gpu_
=
scale_gpu_
;
ptr
->
bias_gpu_
=
bias_gpu_
;
return
ptr
;
}
const
char
*
getPluginType
()
const
TRT_NOEXCEPT
override
{
return
"groupnorm_plugin_dynamic"
;
}
int
getNbOutputs
()
const
TRT_NOEXCEPT
override
{
return
1
;
}
int
initialize
()
TRT_NOEXCEPT
override
{
return
0
;
}
int
initialize
()
TRT_NOEXCEPT
override
;
size_t
getSerializationSize
()
const
TRT_NOEXCEPT
override
{
return
SerializedSize
(
scale_
)
+
SerializedSize
(
bias_
)
+
SerializedSize
(
eps_
)
+
SerializedSize
(
groups_
)
+
SerializedSize
(
mean_shape_
)
+
SerializedSize
(
variance_shape_
);
SerializedSize
(
mean_shape_
)
+
SerializedSize
(
variance_shape_
)
+
SerializedSize
(
with_fp16_
);
}
void
serialize
(
void
*
buffer
)
const
TRT_NOEXCEPT
override
{
SerializeValue
(
&
buffer
,
scale_
);
...
...
@@ -186,6 +217,7 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT {
SerializeValue
(
&
buffer
,
groups_
);
SerializeValue
(
&
buffer
,
mean_shape_
);
SerializeValue
(
&
buffer
,
variance_shape_
);
SerializeValue
(
&
buffer
,
with_fp16_
);
}
nvinfer1
::
DimsExprs
getOutputDimensions
(
int
output_index
,
...
...
@@ -208,7 +240,7 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT {
int
nbInputs
,
const
nvinfer1
::
PluginTensorDesc
*
outputs
,
int
nbOutputs
)
const
TRT_NOEXCEPT
override
{
return
0
;
return
3
*
inputs
[
0
].
dims
.
d
[
0
]
*
groups_
*
sizeof
(
float
)
;
}
int
enqueue
(
const
nvinfer1
::
PluginTensorDesc
*
inputDesc
,
const
nvinfer1
::
PluginTensorDesc
*
outputDesc
,
...
...
@@ -222,19 +254,27 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT {
TRT_NOEXCEPT
override
;
void
destroy
()
TRT_NOEXCEPT
override
{
delete
this
;
}
// void terminate() TRT_NOEXCEPT override;
void
terminate
()
TRT_NOEXCEPT
override
{
if
(
bias_gpu_
)
{
cudaFree
(
bias_gpu_
);
bias_gpu_
=
nullptr
;
}
if
(
scale_gpu_
)
{
cudaFree
(
scale_gpu_
);
scale_gpu_
=
nullptr
;
}
};
private:
std
::
vector
<
float
>
scale_
;
std
::
vector
<
float
>
bias_
;
phi
::
DenseTensor
scale_t
;
phi
::
DenseTensor
bias_t
;
phi
::
DenseTensor
mean_t
;
phi
::
DenseTensor
variance_t
;
void
*
scale_gpu_
=
nullptr
;
void
*
bias_gpu_
=
nullptr
;
int
groups_
;
float
eps_
;
std
::
vector
<
int64_t
>
mean_shape_
;
std
::
vector
<
int64_t
>
variance_shape_
;
bool
with_fp16_
;
};
class
GroupNormPluginDynamicCreator
:
public
TensorRTPluginCreator
{
public:
...
...
paddle/phi/kernels/gpu/group_norm_grad_kernel.cu
浏览文件 @
34fd65cf
...
...
@@ -22,7 +22,7 @@
namespace
phi
{
template
<
typename
T
,
int
flags
>
template
<
typename
T
,
typename
AccT
,
int
flags
>
__global__
void
GroupNormBackwardGetMeanAndVar
(
const
T
*
x
,
const
T
*
scale
,
const
T
*
bias
,
...
...
@@ -33,9 +33,9 @@ __global__ void GroupNormBackwardGetMeanAndVar(const T* x,
int
imsize
,
int
groups
,
int
group_size
,
T
epsilon
,
T
*
d_mean
,
T
*
d_var
,
float
epsilon
,
Acc
T
*
d_mean
,
Acc
T
*
d_var
,
T
*
d_scale
,
T
*
d_bias
)
{
int
gid
=
blockIdx
.
y
;
...
...
@@ -45,29 +45,35 @@ __global__ void GroupNormBackwardGetMeanAndVar(const T* x,
int
number
=
min
(
group_size
,
static_cast
<
int
>
(
C
-
gid
*
group_size
));
int
ccid
=
gid
*
group_size
+
cid
;
if
(
ccid
>=
C
)
return
;
T
x_scale
=
(
flags
&
kHasScale
)
?
scale
[
ccid
]
:
1
;
T
x_bias
=
(
flags
&
kHasBias
)
?
bias
[
ccid
]
:
0
;
T
x_scale_inv
=
0
;
if
(
x_scale
!=
0
)
x_scale_inv
=
1.0
/
x_scale
;
T
d_mean_data
=
0
,
d_var_data
=
0
,
d_scale_data
=
0
,
d_bias_data
=
0
;
T
x_scale
=
(
flags
&
kHasScale
)
?
scale
[
ccid
]
:
static_cast
<
T
>
(
1
);
T
x_bias
=
(
flags
&
kHasBias
)
?
bias
[
ccid
]
:
static_cast
<
T
>
(
0
);
T
x_scale_inv
=
static_cast
<
T
>
(
0
);
if
(
x_scale
!=
static_cast
<
T
>
(
0
))
x_scale_inv
=
static_cast
<
T
>
(
1.0
)
/
x_scale
;
AccT
d_mean_data
=
static_cast
<
AccT
>
(
0
);
AccT
d_var_data
=
static_cast
<
AccT
>
(
0
);
T
d_scale_data
=
static_cast
<
T
>
(
0
);
T
d_bias_data
=
static_cast
<
T
>
(
0
);
for
(
int
imid
=
threadIdx
.
x
;
imid
<
imsize
;
imid
+=
blockDim
.
x
)
{
T
val
,
dval
;
Acc
T
val
,
dval
;
int
hid
=
imid
/
W
;
int
wid
=
imid
%
W
;
val
=
x
[(
bid
*
H
+
hid
)
*
W
*
C
+
wid
*
C
+
ccid
]
-
x_bias
;
dval
=
d_y
[(
bid
*
H
+
hid
)
*
W
*
C
+
wid
*
C
+
ccid
];
val
=
static_cast
<
AccT
>
(
x
[(
bid
*
H
+
hid
)
*
W
*
C
+
wid
*
C
+
ccid
])
-
static_cast
<
AccT
>
(
x_bias
);
dval
=
static_cast
<
AccT
>
(
d_y
[(
bid
*
H
+
hid
)
*
W
*
C
+
wid
*
C
+
ccid
]);
d_var_data
+=
val
*
dval
;
d_mean_data
+=
dval
*
x_scale
;
d_mean_data
+=
dval
*
static_cast
<
AccT
>
(
x_scale
)
;
val
=
val
*
x_scale_inv
;
d_bias_data
+=
dval
;
d_scale_data
+=
val
*
dval
;
val
=
val
*
static_cast
<
AccT
>
(
x_scale_inv
)
;
d_bias_data
+=
static_cast
<
T
>
(
dval
)
;
d_scale_data
+=
static_cast
<
T
>
(
val
*
dval
)
;
}
CudaAtomicAddWithWarp
(
&
(
d_mean
[
bid
*
groups
+
gid
]),
d_mean_data
);
CudaAtomicAddWithWarp
(
&
(
d_var
[
bid
*
groups
+
gid
]),
d_var_data
);
CudaAtomicAddWithWarp
(
&
(
d_mean
[
bid
*
groups
+
gid
]),
static_cast
<
AccT
>
(
d_mean_data
));
CudaAtomicAddWithWarp
(
&
(
d_var
[
bid
*
groups
+
gid
]),
static_cast
<
AccT
>
(
d_var_data
));
if
(
flags
&
kHasScale
)
{
#if CUDA_VERSION >= 11070
...
...
@@ -85,22 +91,24 @@ __global__ void GroupNormBackwardGetMeanAndVar(const T* x,
}
}
template
<
typename
T
,
int
flags
>
template
<
typename
T
,
typename
AccT
,
int
flags
>
__global__
void
GroupNormBackward
(
const
T
*
x
,
const
T
*
d_y
,
const
T
*
scale
,
const
T
*
bias
,
const
T
*
var
,
const
T
*
d_mean
,
const
T
*
d_var
,
const
Acc
T
*
var
,
const
Acc
T
*
d_mean
,
const
Acc
T
*
d_var
,
int
N
,
int
C
,
int
W
,
int
imsize
,
int
groups
,
int
group_size
,
T
epsilon
,
float
epsilon
,
T
*
d_x
)
{
// using AccT = typename kps::details::MPTypeTrait<T>::Type;
int
gid
=
blockIdx
.
y
;
int
cid
=
blockIdx
.
x
;
int
bid
=
blockIdx
.
z
;
...
...
@@ -108,132 +116,138 @@ __global__ void GroupNormBackward(const T* x,
int
number
=
min
(
group_size
,
static_cast
<
int
>
(
C
-
gid
*
group_size
));
int
ccid
=
gid
*
group_size
+
cid
;
if
(
ccid
>=
C
)
return
;
T
x_var
=
var
[
bid
*
groups
+
gid
];
T
d_x_mean
=
d_mean
[
bid
*
groups
+
gid
];
T
d_x_var
=
d_var
[
bid
*
groups
+
gid
];
T
x_var_inv
=
1.0
/
sqrt
(
x_var
+
epsilon
);
T
number_inv
=
1.0
/
(
number
*
imsize
);
T
x_scale
=
(
flags
&
kHasScale
)
?
scale
[
ccid
]
:
1
;
T
x_bias
=
(
flags
&
kHasBias
)
?
bias
[
ccid
]
:
0
;
T
x_scale_inv
=
0
;
if
(
x_scale
!=
0
)
x_scale_inv
=
1.0
/
x_scale
;
AccT
x_var
=
var
[
bid
*
groups
+
gid
];
AccT
d_x_mean
=
static_cast
<
AccT
>
(
d_mean
[
bid
*
groups
+
gid
]);
AccT
d_x_var
=
static_cast
<
AccT
>
(
d_var
[
bid
*
groups
+
gid
]);
AccT
x_var_inv
=
static_cast
<
AccT
>
(
1.0
)
/
sqrt
((
x_var
)
+
epsilon
);
AccT
number_inv
=
static_cast
<
AccT
>
(
1.0
)
/
static_cast
<
AccT
>
((
number
*
imsize
));
AccT
x_scale
=
(
flags
&
kHasScale
)
?
static_cast
<
AccT
>
(
scale
[
ccid
])
:
static_cast
<
AccT
>
(
1
);
AccT
x_bias
=
(
flags
&
kHasBias
)
?
static_cast
<
AccT
>
(
bias
[
ccid
])
:
static_cast
<
AccT
>
(
0
);
AccT
x_scale_inv
=
static_cast
<
T
>
(
0
);
if
(
x_scale
!=
static_cast
<
AccT
>
(
0
))
x_scale_inv
=
static_cast
<
AccT
>
(
1.0
)
/
x_scale
;
for
(
int
imid
=
threadIdx
.
x
;
imid
<
imsize
;
imid
+=
blockDim
.
x
)
{
int
hid
=
imid
/
W
;
int
wid
=
imid
%
W
;
T
tmp
=
x
[(
bid
*
H
+
hid
)
*
W
*
C
+
wid
*
C
+
ccid
]
;
T
v_y
=
(
tmp
-
x_bias
)
*
x_scale_inv
;
T
dly
=
d_y
[(
bid
*
H
+
hid
)
*
W
*
C
+
wid
*
C
+
ccid
]
;
AccT
tmp
=
static_cast
<
AccT
>
(
x
[(
bid
*
H
+
hid
)
*
W
*
C
+
wid
*
C
+
ccid
])
;
Acc
T
v_y
=
(
tmp
-
x_bias
)
*
x_scale_inv
;
AccT
dly
=
static_cast
<
AccT
>
(
d_y
[(
bid
*
H
+
hid
)
*
W
*
C
+
wid
*
C
+
ccid
])
;
d_x
[(
bid
*
H
+
hid
)
*
W
*
C
+
wid
*
C
+
ccid
]
=
x_var_inv
*
(
dly
*
x_scale
-
number_inv
*
d_x_var
*
v_y
-
number_inv
*
d_x_mean
);
static_cast
<
T
>
(
x_var_inv
*
((
dly
)
*
(
x_scale
)
-
number_inv
*
d_x_var
*
(
v_y
)
-
number_inv
*
d_x_mean
)
);
}
}
template
<
typename
T
>
template
<
typename
T
,
typename
AccT
>
__global__
void
ScalarGetDsDbCUDAKernel
(
int
imsize
,
const
T
*
x
,
const
T
*
dy
,
T
*
ds
,
T
*
db
)
{
int
imsize
,
const
T
*
x
,
const
T
*
dy
,
AccT
*
ds
,
Acc
T
*
db
)
{
const
int
nc
=
blockIdx
.
x
;
T
ds_sum
=
0
;
T
db_sum
=
0
;
Acc
T
ds_sum
=
0
;
Acc
T
db_sum
=
0
;
for
(
int
i
=
threadIdx
.
x
;
i
<
imsize
;
i
+=
blockDim
.
x
)
{
const
int
index
=
nc
*
imsize
+
i
;
ds_sum
+=
dy
[
index
]
*
x
[
index
]
;
db_sum
+=
dy
[
index
]
;
ds_sum
+=
static_cast
<
AccT
>
(
dy
[
index
])
*
static_cast
<
AccT
>
(
x
[
index
])
;
db_sum
+=
static_cast
<
AccT
>
(
dy
[
index
])
;
}
ReduceMeanAndVar
<
T
>
(
db
,
ds
,
db_sum
,
ds_sum
,
1
);
ReduceMeanAndVar
<
Acc
T
>
(
db
,
ds
,
db_sum
,
ds_sum
,
1
);
}
template
<
typename
T
>
template
<
typename
T
,
typename
AccT
>
__global__
void
GetScaleBiasGradientCUDAKernel
(
int
N
,
int
C
,
int
group
,
T
epsilon
,
const
T
*
mean
,
const
T
*
var
,
const
T
*
ds
,
const
T
*
db
,
float
epsilon
,
const
Acc
T
*
mean
,
const
Acc
T
*
var
,
const
Acc
T
*
ds
,
const
Acc
T
*
db
,
T
*
d_scale
,
T
*
d_bias
)
{
const
int
c
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
c
<
C
)
{
const
int
G
=
group
;
const
int
D
=
C
/
G
;
T
sum1
=
0
;
T
sum2
=
0
;
AccT
sum1
=
static_cast
<
AccT
>
(
0
)
;
AccT
sum2
=
static_cast
<
AccT
>
(
0
)
;
for
(
int
n
=
0
;
n
<
N
;
++
n
)
{
const
int
nc
=
n
*
C
+
c
;
const
int
ng
=
n
*
G
+
c
/
D
;
sum1
+=
(
d_scale
==
nullptr
)
?
T
(
0
)
:
((
ds
[
nc
]
-
db
[
nc
]
*
static_cast
<
T
>
(
mean
[
ng
]))
*
static_cast
<
T
>
(
rsqrt
(
var
[
ng
]
+
epsilon
)));
sum2
+=
(
d_bias
==
nullptr
)
?
T
(
0
)
:
db
[
nc
];
sum1
+=
(
d_scale
==
nullptr
)
?
AccT
(
0
)
:
((
ds
[
nc
]
-
db
[
nc
]
*
(
mean
[
ng
]))
*
(
rsqrt
((
var
[
ng
])
+
epsilon
)));
sum2
+=
(
d_bias
==
nullptr
)
?
Acc
T
(
0
)
:
db
[
nc
];
}
if
(
d_scale
!=
nullptr
)
{
d_scale
[
c
]
=
s
um1
;
d_scale
[
c
]
=
s
tatic_cast
<
T
>
(
sum1
)
;
}
if
(
d_bias
!=
nullptr
)
{
d_bias
[
c
]
=
s
um2
;
d_bias
[
c
]
=
s
tatic_cast
<
T
>
(
sum2
)
;
}
}
}
template
<
typename
T
,
int
BlockDim
>
template
<
typename
T
,
typename
AccT
,
int
BlockDim
>
__global__
void
GetBackwardParamsCUDAKernel
(
int
imsize
,
int
groups
,
int
group_size
,
T
epsilon
,
const
T
*
mean
,
const
T
*
var
,
float
epsilon
,
const
Acc
T
*
mean
,
const
Acc
T
*
var
,
const
T
*
scale
,
const
T
*
ds
,
const
T
*
db
,
T
*
p1
,
T
*
p2
,
T
*
p3
)
{
const
Acc
T
*
ds
,
const
Acc
T
*
db
,
Acc
T
*
p1
,
Acc
T
*
p2
,
Acc
T
*
p3
)
{
const
int
n
=
blockIdx
.
x
;
const
int
g
=
blockIdx
.
y
;
const
int
ng
=
n
*
groups
+
g
;
T
sum1
=
0
;
T
sum2
=
0
;
T
var_inv
=
rsqrt
(
var
[
ng
]
+
epsilon
);
Acc
T
sum1
=
0
;
Acc
T
sum2
=
0
;
AccT
var_inv
=
rsqrt
(
static_cast
<
AccT
>
(
var
[
ng
])
+
epsilon
);
for
(
int64_t
i
=
threadIdx
.
x
;
i
<
group_size
;
i
+=
blockDim
.
x
)
{
const
int64_t
index
=
ng
*
group_size
+
i
;
const
int64_t
c
=
g
*
group_size
+
i
;
const
T
scale_v
=
scale
==
nullptr
?
T
(
1
)
:
static_cast
<
T
>
(
scale
[
c
]);
sum1
+=
ds
[
index
]
*
scale_v
;
sum2
+=
db
[
index
]
*
scale_v
;
const
T
scale_c
=
scale
==
nullptr
?
T
(
0
)
:
static_cast
<
T
>
(
scale
[
c
]);
p1
[
index
]
=
scale_c
*
var_inv
;
const
AccT
scale_v
=
scale
==
nullptr
?
static_cast
<
AccT
>
(
1
)
:
static_cast
<
AccT
>
(
scale
[
c
]);
sum1
+=
static_cast
<
AccT
>
(
ds
[
index
])
*
scale_v
;
sum2
+=
static_cast
<
AccT
>
(
db
[
index
])
*
scale_v
;
const
AccT
scale_c
=
scale
==
nullptr
?
static_cast
<
AccT
>
(
0
)
:
static_cast
<
T
>
(
scale
[
c
]);
p1
[
index
]
=
static_cast
<
AccT
>
(
scale_c
)
*
var_inv
;
}
typedef
cub
::
BlockReduce
<
T
,
BlockDim
>
BlockReduce
;
typedef
cub
::
BlockReduce
<
Acc
T
,
BlockDim
>
BlockReduce
;
__shared__
typename
BlockReduce
::
TempStorage
ds_storage
;
__shared__
typename
BlockReduce
::
TempStorage
db_storage
;
sum1
=
BlockReduce
(
ds_storage
).
Reduce
(
sum1
,
cub
::
Sum
());
sum2
=
BlockReduce
(
db_storage
).
Reduce
(
sum2
,
cub
::
Sum
());
if
(
threadIdx
.
x
==
0
)
{
const
T
s
=
T
(
1
)
/
static_cast
<
T
>
(
group_size
*
imsize
);
const
T
x
=
(
sum2
*
static_cast
<
T
>
(
mean
[
ng
])
-
sum1
)
*
static_cast
<
T
>
(
var_inv
)
*
static_cast
<
T
>
(
var_inv
)
*
static_cast
<
T
>
(
var_inv
)
*
s
;
const
AccT
s
=
static_cast
<
AccT
>
(
1
)
/
static_cast
<
AccT
>
(
group_size
*
imsize
);
const
AccT
x
=
(
sum2
*
static_cast
<
AccT
>
(
mean
[
ng
])
-
sum1
)
*
(
var_inv
)
*
(
var_inv
)
*
(
var_inv
)
*
s
;
p2
[
ng
]
=
x
;
p3
[
ng
]
=
-
x
*
static_cast
<
T
>
(
mean
[
ng
])
-
sum2
*
static_cast
<
T
>
(
var_inv
)
*
s
;
p3
[
ng
]
=
-
x
*
(
mean
[
ng
])
-
(
sum2
*
var_inv
)
*
s
;
}
}
template
<
typename
T
>
template
<
typename
T
,
typename
AccT
>
__global__
void
GetXGradientCUDAKernel
(
int
imsize
,
int
C
,
int
group_size
,
int
groups
,
T
*
p1
,
T
*
p2
,
T
*
p3
,
Acc
T
*
p1
,
Acc
T
*
p2
,
Acc
T
*
p3
,
const
T
*
x
,
const
T
*
dy
,
T
*
dx
)
{
...
...
@@ -245,7 +259,8 @@ __global__ void GetXGradientCUDAKernel(int imsize,
int
nc
=
gid
*
group_size
+
cid
;
for
(
int
imid
=
threadIdx
.
x
;
imid
<
imsize
;
imid
+=
blockDim
.
x
)
{
int
index
=
(
bid
*
C
+
nc
)
*
imsize
+
imid
;
dx
[
index
]
=
p1
[
ccid
]
*
dy
[
index
]
+
p2
[
ng
]
*
x
[
index
]
+
p3
[
ng
];
dx
[
index
]
=
static_cast
<
T
>
(
p1
[
ccid
]
*
static_cast
<
AccT
>
(
dy
[
index
])
+
p2
[
ng
]
*
static_cast
<
AccT
>
(
x
[
index
])
+
p3
[
ng
]);
}
}
...
...
@@ -264,6 +279,7 @@ void GroupNormGradKernel(const Context& dev_ctx,
DenseTensor
*
d_x
,
DenseTensor
*
d_scale
,
DenseTensor
*
d_bias
)
{
using
AccT
=
typename
kps
::
details
::
MPTypeTrait
<
T
>::
Type
;
const
DataLayout
data_layout
=
phi
::
StringToDataLayout
(
data_layout_str
);
const
auto
scale_ptr
=
scale
.
get_ptr
();
const
auto
bias_ptr
=
bias
.
get_ptr
();
...
...
@@ -277,20 +293,20 @@ void GroupNormGradKernel(const Context& dev_ctx,
dev_ctx
.
template
Alloc
<
T
>(
d_x
);
phi
::
funcs
::
SetConstant
<
GPUContext
,
T
>
set_zero
;
phi
::
funcs
::
SetConstant
<
GPUContext
,
AccT
>
set_zero_AccT
;
DenseTensor
ds
,
db
;
ds
.
Resize
({
x_dims
[
0
],
C
});
T
*
ds_data
=
dev_ctx
.
template
Alloc
<
T
>(
&
ds
);
AccT
*
ds_data
=
dev_ctx
.
template
Alloc
<
Acc
T
>(
&
ds
);
db
.
Resize
({
x_dims
[
0
],
C
});
T
*
db_data
=
dev_ctx
.
template
Alloc
<
T
>(
&
db
);
AccT
*
db_data
=
dev_ctx
.
template
Alloc
<
Acc
T
>(
&
db
);
auto
*
y_data
=
y
.
data
<
T
>
();
auto
*
x_data
=
x
.
data
<
T
>
();
T
*
d_x_data
=
nullptr
;
if
(
d_x
)
d_x_data
=
d_x
->
data
<
T
>
();
auto
*
dy_data
=
d_y
.
data
<
T
>
();
auto
*
var_data
=
var
.
data
<
T
>
();
auto
*
mean_data
=
mean
.
data
<
T
>
();
auto
*
var_data
=
var
.
data
<
Acc
T
>
();
auto
*
mean_data
=
mean
.
data
<
Acc
T
>
();
T
*
d_scale_data
=
nullptr
;
if
(
d_scale
)
{
dev_ctx
.
template
Alloc
<
T
>(
d_scale
);
...
...
@@ -338,12 +354,13 @@ void GroupNormGradKernel(const Context& dev_ctx,
}
block_size_nchw
=
std
::
max
(
block_size_nchw
,
kps
::
details
::
kWarpSize
);
dim3
blocks
(
block_size_nchw
);
ScalarGetDsDbCUDAKernel
<
T
><<<
x_dims
[
0
]
*
C
,
blocks
,
0
,
dev_ctx
.
stream
()
>>>
(
imsize
,
x_data
,
dy_data
,
ds_data
,
db_data
);
ScalarGetDsDbCUDAKernel
<
T
,
AccT
>
<<<
x_dims
[
0
]
*
C
,
blocks
,
0
,
dev_ctx
.
stream
()
>>>
(
imsize
,
x_data
,
dy_data
,
ds_data
,
db_data
);
if
(
d_scale
||
d_bias
)
{
const
int
block
=
256
;
GetScaleBiasGradientCUDAKernel
<
T
>
GetScaleBiasGradientCUDAKernel
<
T
,
AccT
>
<<<
(
C
+
block
-
1
)
/
block
,
block
,
0
,
dev_ctx
.
stream
()
>>>
(
x_dims
[
0
],
C
,
...
...
@@ -365,13 +382,13 @@ void GroupNormGradKernel(const Context& dev_ctx,
// p3 = -p2 * mean[ng] - db * scale * var_inv * (1/n);
DenseTensor
p1
,
p2
,
p3
;
p1
.
Resize
({
x_dims
[
0
]
*
C
});
T
*
p1_data
=
dev_ctx
.
template
Alloc
<
T
>(
&
p1
);
AccT
*
p1_data
=
dev_ctx
.
template
Alloc
<
Acc
T
>(
&
p1
);
p2
.
Resize
({
x_dims
[
0
],
groups
});
T
*
p2_data
=
dev_ctx
.
template
Alloc
<
T
>(
&
p2
);
AccT
*
p2_data
=
dev_ctx
.
template
Alloc
<
Acc
T
>(
&
p2
);
p3
.
Resize
({
x_dims
[
0
],
groups
});
T
*
p3_data
=
dev_ctx
.
template
Alloc
<
T
>(
&
p3
);
AccT
*
p3_data
=
dev_ctx
.
template
Alloc
<
Acc
T
>(
&
p3
);
GetBackwardParamsCUDAKernel
<
T
,
block_dims
>
GetBackwardParamsCUDAKernel
<
T
,
AccT
,
block_dims
>
<<<
dim3
(
x_dims
[
0
],
groups
),
block_dims
,
0
,
dev_ctx
.
stream
()
>>>
(
imsize
,
groups
,
...
...
@@ -408,14 +425,14 @@ void GroupNormGradKernel(const Context& dev_ctx,
DenseTensor
temp_var
;
temp_var
.
Resize
(
var
.
dims
());
dev_ctx
.
template
Alloc
<
T
>(
&
temp_var
);
set_zero
(
dev_ctx
,
&
temp_var
,
static_cast
<
T
>
(
0
));
T
*
temp_var_data
=
temp_var
.
data
<
T
>
();
set_zero
_AccT
(
dev_ctx
,
&
temp_var
,
static_cast
<
Acc
T
>
(
0
));
auto
*
temp_var_data
=
temp_var
.
data
<
Acc
T
>
();
DenseTensor
temp_mean
;
temp_mean
.
Resize
(
var
.
dims
());
dev_ctx
.
template
Alloc
<
T
>(
&
temp_mean
);
set_zero
(
dev_ctx
,
&
temp_mean
,
static_cast
<
T
>
(
0
));
T
*
temp_mean_data
=
temp_mean
.
data
<
T
>
();
dev_ctx
.
template
Alloc
<
Acc
T
>(
&
temp_mean
);
set_zero
_AccT
(
dev_ctx
,
&
temp_mean
,
static_cast
<
Acc
T
>
(
0
));
auto
*
temp_mean_data
=
temp_mean
.
data
<
Acc
T
>
();
int
flags
=
(
scale_data
!=
nullptr
)
*
kHasScale
+
(
bias_data
!=
nullptr
)
*
kHasBias
;
...
...
@@ -460,6 +477,10 @@ void GroupNormGradKernel(const Context& dev_ctx,
}
// namespace phi
PD_REGISTER_KERNEL
(
group_norm_grad
,
GPU
,
ALL_LAYOUT
,
phi
::
GroupNormGradKernel
,
float
,
double
)
{
}
PD_REGISTER_KERNEL
(
group_norm_grad
,
GPU
,
ALL_LAYOUT
,
phi
::
GroupNormGradKernel
,
float
,
double
,
phi
::
dtype
::
float16
)
{}
paddle/phi/kernels/gpu/group_norm_kernel.cu
浏览文件 @
34fd65cf
...
...
@@ -22,7 +22,7 @@
namespace
phi
{
template
<
typename
T
>
template
<
typename
T
,
typename
AccT
>
__global__
void
GroupNormForwardGetMeanAndVar
(
const
T
*
x
,
int
N
,
int
C
,
...
...
@@ -30,8 +30,8 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x,
int
imsize
,
int
groups
,
int
group_size
,
T
*
mean
,
T
*
var
)
{
Acc
T
*
mean
,
Acc
T
*
var
)
{
int
gid
=
blockIdx
.
y
;
int
cid
=
blockIdx
.
x
;
int
bid
=
blockIdx
.
z
;
...
...
@@ -39,12 +39,13 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x,
int
number
=
min
(
group_size
,
static_cast
<
int
>
(
C
-
gid
*
group_size
));
int
ccid
=
gid
*
group_size
+
cid
;
if
(
ccid
>=
C
)
return
;
T
x_mean
=
0
,
x_var
=
0
;
AccT
x_mean
=
static_cast
<
AccT
>
(
0
);
AccT
x_var
=
static_cast
<
AccT
>
(
0
);
for
(
int
imid
=
threadIdx
.
x
;
imid
<
imsize
;
imid
+=
blockDim
.
x
)
{
T
val
;
Acc
T
val
;
int
hid
=
imid
/
W
;
int
wid
=
imid
%
W
;
val
=
x
[(
bid
*
H
+
hid
)
*
W
*
C
+
wid
*
C
+
ccid
]
;
val
=
static_cast
<
AccT
>
(
x
[(
bid
*
H
+
hid
)
*
W
*
C
+
wid
*
C
+
ccid
])
;
x_mean
+=
val
;
x_var
+=
val
*
val
;
...
...
@@ -55,10 +56,10 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x,
CudaAtomicAddWithWarp
(
&
var
[
bid
*
groups
+
gid
],
x_var
);
}
template
<
typename
T
,
int
flags
>
template
<
typename
T
,
typename
AccT
,
int
flags
>
__global__
void
GroupNormForward
(
const
T
*
x
,
const
T
*
mean
,
const
T
*
var
,
const
Acc
T
*
mean
,
const
Acc
T
*
var
,
const
T
*
scale
,
const
T
*
bias
,
int
N
,
...
...
@@ -67,9 +68,9 @@ __global__ void GroupNormForward(const T* x,
int
imsize
,
int
groups
,
int
group_size
,
T
epsilon
,
Acc
T
epsilon
,
T
*
y
,
T
*
real_var
,
Acc
T
*
real_var
,
const
DataLayout
data_layout
)
{
int
gid
=
blockIdx
.
y
;
int
cid
=
blockIdx
.
x
;
...
...
@@ -78,35 +79,36 @@ __global__ void GroupNormForward(const T* x,
int
ccid
=
gid
*
group_size
+
cid
;
if
(
ccid
>=
C
)
return
;
auto
ng
=
bid
*
groups
+
gid
;
T
x_mean
=
mean
[
ng
];
T
x_var
=
var
[
ng
];
Acc
T
x_mean
=
mean
[
ng
];
Acc
T
x_var
=
var
[
ng
];
x_var
=
x_var
-
x_mean
*
x_mean
;
T
var_inv
=
rsqrt
(
x_var
+
epsilon
);
AccT
var_inv
=
rsqrt
(
x_var
+
epsilon
);
if
(
cid
==
0
&&
threadIdx
.
x
==
0
)
{
real_var
[
ng
]
=
x_var
;
}
for
(
int
imid
=
threadIdx
.
x
;
imid
<
imsize
;
imid
+=
blockDim
.
x
)
{
T
val
;
Acc
T
val
;
int
hid
,
wid
;
int
index
=
(
bid
*
C
+
ccid
)
*
imsize
+
imid
;
if
(
data_layout
==
DataLayout
::
kNCHW
)
{
val
=
x
[
index
]
;
val
=
static_cast
<
AccT
>
(
x
[
index
])
;
}
else
{
hid
=
imid
/
W
;
wid
=
imid
%
W
;
val
=
x
[(
bid
*
H
+
hid
)
*
W
*
C
+
wid
*
C
+
ccid
]
;
val
=
static_cast
<
AccT
>
(
x
[(
bid
*
H
+
hid
)
*
W
*
C
+
wid
*
C
+
ccid
])
;
}
val
=
(
val
-
x_mean
)
*
var_inv
;
if
(
flags
&
kHasScale
)
{
val
*=
s
cale
[
ccid
]
;
val
*=
s
tatic_cast
<
AccT
>
(
scale
[
ccid
])
;
}
if
(
flags
&
kHasBias
)
{
val
+=
bias
[
ccid
]
;
val
+=
static_cast
<
AccT
>
(
bias
[
ccid
])
;
}
if
(
data_layout
==
DataLayout
::
kNCHW
)
{
y
[
index
]
=
val
;
y
[
index
]
=
static_cast
<
T
>
(
val
)
;
}
else
{
y
[(
bid
*
H
+
hid
)
*
W
*
C
+
wid
*
C
+
ccid
]
=
val
;
y
[(
bid
*
H
+
hid
)
*
W
*
C
+
wid
*
C
+
ccid
]
=
static_cast
<
T
>
(
val
)
;
}
}
}
...
...
@@ -122,6 +124,7 @@ void GroupNormKernel(const Context& dev_ctx,
DenseTensor
*
y
,
DenseTensor
*
mean
,
DenseTensor
*
var
)
{
using
AccT
=
typename
kps
::
details
::
MPTypeTrait
<
T
>::
Type
;
const
DataLayout
data_layout
=
phi
::
StringToDataLayout
(
data_layout_str
);
const
auto
scale_ptr
=
scale
.
get_ptr
();
const
auto
bias_ptr
=
bias
.
get_ptr
();
...
...
@@ -135,17 +138,19 @@ void GroupNormKernel(const Context& dev_ctx,
:
x_dims
[
x_dims
.
size
()
-
2
]);
dev_ctx
.
template
Alloc
<
T
>(
y
);
dev_ctx
.
template
Alloc
<
T
>(
mean
);
dev_ctx
.
template
Alloc
<
T
>(
var
);
phi
::
funcs
::
SetConstant
<
GPUContext
,
T
>
set_zero
;
dev_ctx
.
template
Alloc
<
Acc
T
>(
mean
);
dev_ctx
.
template
Alloc
<
Acc
T
>(
var
);
// temp_var is used to calculate the mean^2
DenseTensor
temp_var
;
temp_var
.
Resize
(
var
->
dims
());
dev_ctx
.
template
Alloc
<
T
>(
&
temp_var
);
dev_ctx
.
template
Alloc
<
AccT
>(
&
temp_var
);
phi
::
funcs
::
SetConstant
<
GPUContext
,
T
>
set_zero
;
phi
::
funcs
::
SetConstant
<
GPUContext
,
AccT
>
set_zero_AccT
;
auto
*
x_data
=
x
.
data
<
T
>
();
auto
*
y_data
=
y
->
data
<
T
>
();
auto
*
mean_data
=
mean
->
data
<
T
>
();
auto
*
var_data
=
var
->
data
<
T
>
();
auto
*
temp_var_data
=
temp_var
.
data
<
T
>
();
auto
*
mean_data
=
mean
->
data
<
Acc
T
>
();
auto
*
var_data
=
var
->
data
<
Acc
T
>
();
auto
*
temp_var_data
=
temp_var
.
data
<
Acc
T
>
();
const
T
*
scale_data
=
nullptr
;
if
(
scale_ptr
)
scale_data
=
scale_ptr
->
data
<
T
>
();
...
...
@@ -172,7 +177,6 @@ void GroupNormKernel(const Context& dev_ctx,
dim3
grid
(
group_size
,
groups
,
x_dims
[
0
]);
dim3
threads
(
block_size
,
1
,
1
);
if
(
data_layout
==
DataLayout
::
kNCHW
)
{
using
AccT
=
typename
kps
::
details
::
MPTypeTrait
<
T
>::
Type
;
constexpr
int
vec_size
=
sizeof
(
float4
)
/
sizeof
(
T
);
int
size
=
group_size
*
imsize
;
const
int
max_num_threads
=
1024
;
...
...
@@ -185,7 +189,7 @@ void GroupNormKernel(const Context& dev_ctx,
dim3
grids
(
x_dims
[
0
]
*
groups
);
dim3
blocks
(
block_size_nchw
);
if
(
size
<
vec_size
*
block_size_nchw
)
{
ScalarGetMeanAndVarNCHW
<
T
><<<
grids
,
blocks
,
0
,
dev_ctx
.
stream
()
>>>
(
ScalarGetMeanAndVarNCHW
<
T
,
AccT
><<<
grids
,
blocks
,
0
,
dev_ctx
.
stream
()
>>>
(
x_data
,
mean_data
,
temp_var_data
,
size
);
}
else
{
VectorizedGetMeanAndVarNCHW
<
T
,
AccT
,
vec_size
>
...
...
@@ -193,9 +197,9 @@ void GroupNormKernel(const Context& dev_ctx,
x_data
,
mean_data
,
temp_var_data
,
size
);
}
}
else
{
set_zero
(
dev_ctx
,
mean
,
static_cast
<
T
>
(
0
));
set_zero
(
dev_ctx
,
&
temp_var
,
static_cast
<
T
>
(
0
));
GroupNormForwardGetMeanAndVar
<
T
>
set_zero
_AccT
(
dev_ctx
,
mean
,
static_cast
<
Acc
T
>
(
0
));
set_zero
_AccT
(
dev_ctx
,
&
temp_var
,
static_cast
<
Acc
T
>
(
0
));
GroupNormForwardGetMeanAndVar
<
T
,
AccT
>
<<<
grid
,
threads
,
0
,
dev_ctx
.
stream
()
>>>
(
x_data
,
x_dims
[
0
],
C
,
...
...
@@ -221,26 +225,26 @@ void GroupNormKernel(const Context& dev_ctx,
imsize
,
groups
,
group_size
,
epsilon
,
static_cast
<
AccT
>
(
epsilon
)
,
y_data
,
var_data
,
data_layout
);
}
template
<
typename
T
>
void
GroupNormDirectCUDAFunctor
<
T
>::
operator
()(
gpuStream_t
stream
,
const
T
*
input
,
std
::
vector
<
int
>
input_shape
,
const
T
*
bias
,
const
T
*
scale
,
T
*
temp_mean
,
T
*
temp_variance
,
int
groups
,
float
eps
,
T
*
output
,
T
*
mean
,
T
*
variance
,
const
DataLayout
data_layout
)
{
template
<
typename
T
,
typename
AccT
>
void
GroupNormDirectCUDAFunctor
<
T
,
AccT
>::
operator
()(
gpuStream_t
stream
,
const
T
*
input
,
std
::
vector
<
int
>
input_shape
,
const
T
*
bias
,
const
T
*
scale
,
Acc
T
*
temp_variance
,
int
groups
,
float
eps
,
T
*
output
,
Acc
T
*
mean
,
Acc
T
*
variance
,
const
DataLayout
data_layout
)
{
const
auto
input_ddim
=
phi
::
make_ddim
(
input_shape
);
const
int
C
=
(
data_layout
==
DataLayout
::
kNCHW
?
input_ddim
[
1
]
...
...
@@ -268,8 +272,7 @@ void GroupNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
dim3
grid
(
group_size
,
groups
,
input_ddim
[
0
]);
dim3
threads
(
block_size
,
1
,
1
);
if
(
data_layout
==
DataLayout
::
kNCHW
)
{
using
AccT
=
typename
phi
::
kps
::
details
::
MPTypeTrait
<
float
>::
Type
;
constexpr
int
vec_size
=
sizeof
(
float4
)
/
sizeof
(
float
);
constexpr
int
vec_size
=
sizeof
(
float4
)
/
sizeof
(
T
);
int
size
=
group_size
*
image_size
;
// group element size
const
int
max_num_threads
=
1024
;
int
max_block_size
=
std
::
min
(
size
/
vec_size
,
max_num_threads
);
...
...
@@ -283,14 +286,22 @@ void GroupNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
dim3
blocks
(
block_size_nchw
);
if
(
size
<
vec_size
*
block_size_nchw
)
{
phi
::
ScalarGetMeanAndVarNCHW
<
T
>
<<<
grids
,
blocks
,
0
,
stream
>>>
(
input
,
temp_
mean
,
temp_variance
,
size
);
phi
::
ScalarGetMeanAndVarNCHW
<
T
,
AccT
>
<<<
grids
,
blocks
,
0
,
stream
>>>
(
input
,
mean
,
temp_variance
,
size
);
}
else
{
phi
::
VectorizedGetMeanAndVarNCHW
<
T
,
AccT
,
vec_size
>
<<<
grids
,
blocks
,
0
,
stream
>>>
(
input
,
temp_
mean
,
temp_variance
,
size
);
<<<
grids
,
blocks
,
0
,
stream
>>>
(
input
,
mean
,
temp_variance
,
size
);
}
}
else
{
phi
::
GroupNormForwardGetMeanAndVar
<
T
>
#ifdef PADDLE_WITH_HIP
hipMemset
(
mean
,
0
,
sizeof
(
AccT
)
*
input_ddim
[
0
]
*
groups
);
hipMemset
(
temp_variance
,
0
,
sizeof
(
AccT
)
*
input_ddim
[
0
]
*
groups
);
#else
cudaMemset
(
mean
,
0
,
sizeof
(
AccT
)
*
input_ddim
[
0
]
*
groups
);
cudaMemset
(
temp_variance
,
0
,
sizeof
(
AccT
)
*
input_ddim
[
0
]
*
groups
);
#endif
phi
::
GroupNormForwardGetMeanAndVar
<
T
,
AccT
>
<<<
grid
,
threads
,
0
,
stream
>>>
(
input
,
input_ddim
[
0
],
C
,
...
...
@@ -298,28 +309,37 @@ void GroupNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
image_size
,
groups
,
group_size
,
temp_
mean
,
mean
,
temp_variance
);
}
GroupNormForward
<
T
,
3
><<<
grid
,
threads
,
0
,
stream
>>>
(
input
,
temp_
mean
,
temp_variance
,
scale
,
bias
,
input_ddim
[
0
],
C
,
W
,
image_size
,
groups
,
group_size
,
eps
,
output
,
variance
,
data_layout
);
// for now, we only support nchw for group norm
GroupNormForward
<
T
,
AccT
,
3
>
<<<
grid
,
threads
,
0
,
stream
>>>
(
input
,
mean
,
temp_variance
,
scale
,
bias
,
input_ddim
[
0
],
C
,
W
,
image_size
,
groups
,
group_size
,
static_cast
<
AccT
>
(
eps
)
,
output
,
variance
,
data_layout
);
}
template
class
GroupNormDirectCUDAFunctor
<
float
>;
template
class
GroupNormDirectCUDAFunctor
<
float
,
float
>;
#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
template
class
GroupNormDirectCUDAFunctor
<
half
,
float
>;
#endif
}
// namespace phi
PD_REGISTER_KERNEL
(
group_norm
,
GPU
,
ALL_LAYOUT
,
phi
::
GroupNormKernel
,
float
,
double
)
{}
PD_REGISTER_KERNEL
(
group_norm
,
GPU
,
ALL_LAYOUT
,
phi
::
GroupNormKernel
,
float
,
double
,
phi
::
dtype
::
float16
)
{}
paddle/phi/kernels/gpu/group_norm_utils.h
浏览文件 @
34fd65cf
...
...
@@ -31,9 +31,10 @@ namespace phi {
enum
GroupNormKernelFlags
{
kHasScale
=
1
,
kHasBias
=
2
};
#define ALIGN_BYTES 16
#define CHECK_CASE(i, flags, kernel_name, ...) \
if (i == flags) { \
kernel_name<T, i><<<grid, threads, 0, dev_ctx.stream()>>>(__VA_ARGS__); \
#define CHECK_CASE(i, flags, kernel_name, ...) \
if (i == flags) { \
kernel_name<T, AccT, i> \
<<<grid, threads, 0, dev_ctx.stream()>>>(__VA_ARGS__); \
}
// 0 for no scale, no bias
...
...
@@ -75,11 +76,14 @@ __device__ __forceinline__ void ThreadReduce(phi::Array<const T*, Num> arrs,
size
+=
offset
;
if
(
tid
>=
offset
)
{
if
(
Num
==
1
)
{
*
out_mean
+=
x
[
tid
];
*
out_var
+=
x
[
tid
]
*
x
[
tid
];
AccT
x_acc
=
static_cast
<
AccT
>
(
x
[
tid
]);
*
out_mean
+=
x_acc
;
*
out_var
+=
x_acc
*
x_acc
;
}
else
if
(
Num
==
2
)
{
*
out_mean
+=
y
[
tid
];
*
out_var
+=
y
[
tid
]
*
x
[
tid
];
AccT
x_acc
=
static_cast
<
AccT
>
(
x
[
tid
]);
AccT
y_acc
=
static_cast
<
AccT
>
(
y
[
tid
]);
*
out_mean
+=
y_acc
;
*
out_var
+=
y_acc
*
x_acc
;
}
}
size
-=
blockDim
.
x
;
...
...
@@ -105,11 +109,14 @@ __device__ __forceinline__ void ThreadReduce(phi::Array<const T*, Num> arrs,
#pragma unroll
for
(
int
i
=
0
;
i
<
VecSize
;
++
i
)
{
if
(
Num
==
1
)
{
*
out_mean
+=
ins_x
[
i
];
*
out_var
+=
ins_x
[
i
]
*
ins_x
[
i
];
AccT
ins_x_acc
=
static_cast
<
AccT
>
(
ins_x
[
i
]);
*
out_mean
+=
ins_x_acc
;
*
out_var
+=
ins_x_acc
*
ins_x_acc
;
}
else
if
(
Num
==
2
)
{
*
out_mean
+=
ins_y
[
i
];
*
out_var
+=
ins_y
[
i
]
*
ins_x
[
i
];
AccT
ins_x_acc
=
static_cast
<
AccT
>
(
ins_x
[
i
]);
AccT
ins_y_acc
=
static_cast
<
AccT
>
(
ins_y
[
i
]);
*
out_mean
+=
ins_y_acc
;
*
out_var
+=
ins_y_acc
*
ins_x_acc
;
}
}
}
...
...
@@ -118,11 +125,14 @@ __device__ __forceinline__ void ThreadReduce(phi::Array<const T*, Num> arrs,
tid
=
size
-
remain
+
threadIdx
.
x
;
for
(;
tid
<
size
;
tid
+=
blockDim
.
x
)
{
if
(
Num
==
1
)
{
*
out_mean
+=
x
[
tid
];
*
out_var
+=
x
[
tid
]
*
x
[
tid
];
AccT
x_acc
=
static_cast
<
AccT
>
(
x
[
tid
]);
*
out_mean
+=
x_acc
;
*
out_var
+=
x_acc
*
x_acc
;
}
else
if
(
Num
==
2
)
{
*
out_mean
+=
y
[
tid
];
*
out_var
+=
y
[
tid
]
*
x
[
tid
];
AccT
x_acc
=
static_cast
<
AccT
>
(
x
[
tid
]);
AccT
y_acc
=
static_cast
<
AccT
>
(
y
[
tid
]);
*
out_mean
+=
y_acc
;
*
out_var
+=
y_acc
*
x_acc
;
}
}
}
...
...
@@ -137,28 +147,32 @@ __device__ __forceinline__ void ReduceMeanAndVar(
x_var
,
kps
::
AddFunctor
<
T
>
());
__syncthreads
();
if
(
threadIdx
.
x
==
0
)
{
mean
[
nc
]
=
static_cast
<
T
>
(
x_mean
/
size
)
;
var
[
nc
]
=
static_cast
<
T
>
(
x_var
/
size
)
;
mean
[
nc
]
=
x_mean
/
size
;
var
[
nc
]
=
x_var
/
size
;
}
}
template
<
typename
T
>
__global__
void
ScalarGetMeanAndVarNCHW
(
const
T
*
x
,
T
*
mean
,
T
*
var
,
int
size
)
{
template
<
typename
T
,
typename
AccT
>
__global__
void
ScalarGetMeanAndVarNCHW
(
const
T
*
x
,
AccT
*
mean
,
AccT
*
var
,
int
size
)
{
int
i
=
blockIdx
.
x
;
T
x_mean
=
0
,
x_var
=
0
;
AccT
x_mean
=
static_cast
<
AccT
>
(
0
);
AccT
x_var
=
static_cast
<
AccT
>
(
0
);
for
(
int
j
=
threadIdx
.
x
;
j
<
size
;
j
+=
blockDim
.
x
)
{
T
val
;
val
=
x
[
i
*
size
+
j
]
;
Acc
T
val
;
val
=
static_cast
<
AccT
>
(
x
[
i
*
size
+
j
])
;
x_mean
+=
val
;
x_var
+=
val
*
val
;
}
ReduceMeanAndVar
<
T
>
(
mean
,
var
,
x_mean
,
x_var
,
size
);
ReduceMeanAndVar
<
Acc
T
>
(
mean
,
var
,
x_mean
,
x_var
,
size
);
}
template
<
typename
T
,
typename
AccT
,
int
VecSize
>
__global__
void
VectorizedGetMeanAndVarNCHW
(
const
T
*
x
,
T
*
mean
,
T
*
var
,
Acc
T
*
mean
,
Acc
T
*
var
,
int
size
)
{
int
i
=
blockIdx
.
x
;
AccT
x_mean
=
static_cast
<
AccT
>
(
0
);
...
...
paddle/phi/kernels/group_norm_kernel.h
浏览文件 @
34fd65cf
...
...
@@ -34,7 +34,7 @@ void GroupNormKernel(const Context& dev_ctx,
DenseTensor
*
variance
);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template
<
typename
T
>
template
<
typename
T
,
typename
AccT
=
T
>
class
GroupNormDirectCUDAFunctor
{
public:
void
operator
()(
gpuStream_t
stream
,
...
...
@@ -42,13 +42,12 @@ class GroupNormDirectCUDAFunctor {
std
::
vector
<
int
>
input_shape
,
const
T
*
bias
,
const
T
*
scale
,
T
*
temp_mean
,
T
*
temp_variance
,
AccT
*
temp_variance
,
int
groups
,
float
eps
,
T
*
output
,
T
*
mean
,
T
*
variance
,
Acc
T
*
mean
,
Acc
T
*
variance
,
const
DataLayout
data_layout
);
};
#endif
...
...
python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py
浏览文件 @
34fd65cf
...
...
@@ -23,8 +23,6 @@ import unittest
class
TrtConvertGroupNormTest
(
TrtLayerAutoScanTest
):
def
is_program_valid
(
self
,
program_config
:
ProgramConfig
)
->
bool
:
inputs
=
program_config
.
inputs
weights
=
program_config
.
weights
attrs
=
[
program_config
.
ops
[
i
].
attrs
for
i
in
range
(
len
(
program_config
.
ops
))
]
...
...
@@ -49,14 +47,15 @@ class TrtConvertGroupNormTest(TrtLayerAutoScanTest):
for
batch
in
[
1
,
2
,
4
]:
for
group
in
[
1
,
4
,
32
,
-
1
]:
for
epsilon
in
[
0.000
1
,
0.0007
,
-
1
,
1
]:
for
epsilon
in
[
0.000
01
,
0.00005
]:
for
data_layout
in
[
'NCHW'
]:
dics
=
[
{
"epsilon"
:
epsilon
,
"groups"
:
group
,
"data_layout"
:
data_layout
,
}
},
{},
]
ops_config
=
[
{
...
...
@@ -122,31 +121,31 @@ class TrtConvertGroupNormTest(TrtLayerAutoScanTest):
# for static_shape
clear_dynamic_shape
()
self
.
trt_param
.
precision
=
paddle_infer
.
PrecisionType
.
Float32
yield
self
.
create_inference_config
(),
generate_trt_nodes_num
(
attrs
,
False
),
1e-5
self
.
trt_param
.
workspace_size
=
2013265920
self
.
trt_param
.
precision
=
paddle_infer
.
PrecisionType
.
Half
yield
self
.
create_inference_config
(),
generate_trt_nodes_num
(
attrs
,
False
),
(
1e-3
,
1e-3
)
),
1e-2
# for dynamic_shape
generate_dynamic_shape
(
attrs
)
self
.
trt_param
.
precision
=
paddle_infer
.
PrecisionType
.
Float32
yield
self
.
create_inference_config
(),
generate_trt_nodes_num
(
attrs
,
Tru
e
attrs
,
Fals
e
),
1e-5
# for dynamic_shape
generate_dynamic_shape
(
attrs
)
self
.
trt_param
.
workspace_size
=
2013265920
self
.
trt_param
.
precision
=
paddle_infer
.
PrecisionType
.
Half
yield
self
.
create_inference_config
(),
generate_trt_nodes_num
(
attrs
,
True
),
(
1e-3
,
1e-3
)
),
1e-2
def
add_skip_trt_case
(
self
):
pass
self
.
trt_param
.
precision
=
paddle_infer
.
PrecisionType
.
Float32
yield
self
.
create_inference_config
(),
generate_trt_nodes_num
(
attrs
,
True
),
1e-5
def
test
(
self
):
self
.
add_skip_trt_case
()
self
.
run_test
()
...
...
python/paddle/fluid/tests/unittests/test_group_norm_op.py
浏览文件 @
34fd65cf
...
...
@@ -317,5 +317,65 @@ class TestGroupNormEager(unittest.TestCase):
)
class
TestGroupNormEager_fp32
(
unittest
.
TestCase
):
def
test_dygraph_api
(
self
):
self
.
dtype
=
np
.
float32
self
.
shape
=
(
8
,
32
,
32
)
input
=
np
.
random
.
random
(
self
.
shape
).
astype
(
self
.
dtype
)
with
fluid
.
dygraph
.
guard
():
tensor_1
=
fluid
.
dygraph
.
to_variable
(
input
)
tensor_1
.
stop_gradient
=
False
groupNorm
=
fluid
.
dygraph
.
nn
.
GroupNorm
(
channels
=
32
,
groups
=
4
,
dtype
=
'float32'
)
ret1
=
groupNorm
(
tensor_1
)
ret1
.
backward
()
with
_test_eager_guard
():
tensor_eager_1
=
fluid
.
dygraph
.
to_variable
(
input
)
tensor_eager_1
.
stop_gradient
=
False
groupNorm_eager
=
fluid
.
dygraph
.
nn
.
GroupNorm
(
channels
=
32
,
groups
=
4
)
ret2
=
groupNorm_eager
(
tensor_eager_1
)
ret2
.
backward
()
self
.
assertEqual
(
(
tensor_1
.
grad
.
numpy
()
==
tensor_eager_1
.
grad
.
numpy
()
).
all
(),
True
,
)
class
TestGroupNormEager_fp16
(
unittest
.
TestCase
):
def
test_dygraph_api
(
self
):
self
.
dtype
=
np
.
float32
self
.
shape
=
(
8
,
32
,
32
)
input
=
np
.
random
.
random
(
self
.
shape
).
astype
(
self
.
dtype
)
with
fluid
.
dygraph
.
guard
():
tensor_1
=
fluid
.
dygraph
.
to_variable
(
input
)
tensor_1
.
stop_gradient
=
False
groupNorm
=
fluid
.
dygraph
.
nn
.
GroupNorm
(
channels
=
32
,
groups
=
4
,
dtype
=
'float16'
)
ret1
=
groupNorm
(
tensor_1
)
ret1
.
backward
()
with
_test_eager_guard
():
tensor_eager_1
=
fluid
.
dygraph
.
to_variable
(
input
)
tensor_eager_1
.
stop_gradient
=
False
groupNorm_eager
=
fluid
.
dygraph
.
nn
.
GroupNorm
(
channels
=
32
,
groups
=
4
)
ret2
=
groupNorm_eager
(
tensor_eager_1
)
ret2
.
backward
()
self
.
assertEqual
(
(
tensor_1
.
grad
.
numpy
()
==
tensor_eager_1
.
grad
.
numpy
()
).
all
(),
True
,
)
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py
浏览文件 @
34fd65cf
...
...
@@ -178,6 +178,58 @@ class TestGroupNormAPIV2_With_General_Dimensions(unittest.TestCase):
self
.
test_numerical_accuracy
()
class
TestGroupNormAPIV2_With_General_Dimensions_fp16
(
unittest
.
TestCase
):
def
test_numerical_accuracy
(
self
):
# fp16 only supported in cuda
if
not
core
.
is_compiled_with_cuda
():
return
paddle
.
disable_static
()
shapes
=
[
(
2
,
6
,
4
),
(
2
,
6
,
4
,
4
),
(
2
,
6
,
6
,
6
,
2
),
(
2
,
6
,
6
,
6
,
2
,
3
),
(
2
,
6
,
6
,
6
,
256
,
3
),
]
np
.
random
.
seed
(
10
)
places
=
[
fluid
.
CPUPlace
()]
if
core
.
is_compiled_with_cuda
()
and
core
.
op_support_gpu
(
"group_norm"
):
places
.
append
(
fluid
.
CUDAPlace
(
0
))
for
place
in
places
:
for
shape
in
shapes
:
scale
=
np
.
array
([
1
]).
astype
(
"float32"
)
bias
=
np
.
array
([
0
]).
astype
(
"float32"
)
data
=
np
.
random
.
random
(
shape
).
astype
(
"float32"
)
expect_res1
=
group_norm_naive_for_general_dimension
(
data
,
scale
,
bias
,
epsilon
=
1e-5
,
groups
=
6
)
expect_res2
=
group_norm_naive_for_general_dimension
(
data
,
scale
,
bias
,
epsilon
=
1e-5
,
groups
=
2
)
gn1
=
paddle
.
nn
.
GroupNorm
(
num_channels
=
6
,
num_groups
=
6
)
gn2
=
paddle
.
nn
.
GroupNorm
(
num_channels
=
6
,
num_groups
=
2
)
paddle
.
assign
(
paddle
.
cast
(
gn1
.
weight
,
'float16'
),
gn1
.
weight
)
paddle
.
assign
(
paddle
.
cast
(
gn1
.
bias
,
'float16'
),
gn1
.
bias
)
paddle
.
assign
(
paddle
.
cast
(
gn2
.
weight
,
'float16'
),
gn2
.
weight
)
paddle
.
assign
(
paddle
.
cast
(
gn2
.
bias
,
'float16'
),
gn2
.
bias
)
data_pd
=
paddle
.
to_tensor
(
data
.
astype
(
'float16'
))
result1
=
gn1
(
data_pd
).
numpy
()
result2
=
gn2
(
data_pd
).
numpy
()
np
.
testing
.
assert_allclose
(
result1
,
expect_res1
,
rtol
=
1e-2
,
atol
=
1e-3
)
np
.
testing
.
assert_allclose
(
result2
,
expect_res2
,
rtol
=
1e-2
,
atol
=
1e-3
)
def
test_eager_api
(
self
):
with
_test_eager_guard
():
self
.
test_numerical_accuracy
()
class
TestGroupNormDimException
(
unittest
.
TestCase
):
def
test_exception
(
self
):
def
test_empty_input_static_API
():
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录