Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
909d1e61
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
909d1e61
编写于
3月 03, 2022
作者:
N
niuliling123
提交者:
GitHub
3月 03, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Modified Reduce for XPU2 (#38918)
1. set xpu2 block_size = 64 2. fix a bug when reduce_num is too large
上级
6bf85eaf
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
81 addition
and
49 deletion
+81
-49
paddle/phi/kernels/gpu/reduce.h
paddle/phi/kernels/gpu/reduce.h
+81
-49
未找到文件。
paddle/phi/kernels/gpu/reduce.h
浏览文件 @
909d1e61
...
...
@@ -178,6 +178,8 @@ struct IndexCalculator {
:
dim
(
dim
)
{
dims
=
details
::
VectorToArray
<
int
,
kMaxRank
>
(
cal_dims
);
strides
=
details
::
VectorToArray
<
int
,
kMaxRank
>
(
full_strides
);
reduce_strides
=
details
::
VectorToArray
<
int
,
kMaxRank
>
(
cal_strides
);
#ifndef PADDLE_WITH_XPU_KP
std
::
vector
<
paddle
::
platform
::
FastDivMod
>
cal_divmoders
;
// fast divmod
for
(
auto
i
:
cal_strides
)
{
...
...
@@ -185,9 +187,22 @@ struct IndexCalculator {
}
divmoders
=
details
::
VectorToArray
<
paddle
::
platform
::
FastDivMod
,
kMaxRank
>
(
cal_divmoders
);
#endif
}
__device__
inline
int
operator
()(
int
offset
)
const
{
#ifdef PADDLE_WITH_XPU_KP
int
index
=
0
;
#pragma unroll
for
(
int
i
=
0
;
i
<
kMaxRank
;
++
i
)
{
if
(
i
==
dim
)
{
break
;
}
index
+=
(
offset
/
reduce_strides
[
i
])
*
strides
[
dims
[
i
]];
offset
=
offset
%
reduce_strides
[
i
];
}
return
index
;
#else
int
index
=
0
;
#pragma unroll
for
(
int
i
=
0
;
i
<
kMaxRank
;
++
i
)
{
...
...
@@ -199,12 +214,16 @@ struct IndexCalculator {
offset
=
divmod
.
val
[
1
];
}
return
index
;
#endif
}
int
dim
;
phi
::
Array
<
int
,
kMaxRank
>
dims
;
phi
::
Array
<
int
,
kMaxRank
>
strides
;
phi
::
Array
<
int
,
kMaxRank
>
reduce_strides
;
#ifndef PADDLE_WITH_XPU2
phi
::
Array
<
paddle
::
platform
::
FastDivMod
,
kMaxRank
>
divmoders
;
#endif
};
template
<
bool
ReduceLastDim
=
false
>
...
...
@@ -247,7 +266,7 @@ struct ReduceIndexMapping {
__device__
__forceinline__
int
BlockDimY
()
{
#ifdef PADDLE_WITH_XPU2
return
dim
.
deal_size_y
;
return
1
;
#else
return
blockDim
.
y
;
#endif
...
...
@@ -454,10 +473,14 @@ struct ReduceConfig {
bool
is_last_dim
=
(
rank
==
2
)
&&
(
reduce_rank
==
1
)
&&
(
reduce_dim
[
0
]
==
1
);
if
(
rank
==
reduce_rank
||
is_last_dim
)
{
#ifdef PADDLE_WITH_XPU_KP
reduce_type
=
static_cast
<
int
>
(
ReduceType
::
kReduceAny
);
#else
reduce_type
=
static_cast
<
int
>
(
ReduceType
::
kReduceLastDim
);
#endif
}
else
if
(
reduce_rank
==
1
)
{
// ReduceFirstDim and reduceSecondDim
#ifdef PADDLE_WITH_XPU
2
#ifdef PADDLE_WITH_XPU
_KP
if
(
reduce_dim
[
0
]
==
0
)
{
reduce_type
=
static_cast
<
int
>
(
ReduceType
::
kReduceHigherDim
);
}
else
{
...
...
@@ -471,6 +494,7 @@ struct ReduceConfig {
}
}
#ifndef PADDLE_WITH_XPU_KP
void
SetBlockDimForReduceAny
(
dim3
*
block_dim
,
dim3
*
grid_dim
)
{
constexpr
int
min_reduce_num_per_thread
=
16
;
constexpr
int
max_reduce_num_per_thread
=
256
;
...
...
@@ -569,6 +593,7 @@ struct ReduceConfig {
grid_dim
->
y
=
details
::
AlignUp
(
reduce_num
,
blocking_size
);
}
}
#endif
void
SetBlockDim
()
{
// init
...
...
@@ -577,14 +602,14 @@ struct ReduceConfig {
dim3
block_dim
(
block_num
,
1
,
1
);
dim3
grid_dim
(
left_num
,
1
,
1
);
blocking_size
=
reduce_num
;
#ifdef PADDLE_WITH_XPU
2
#ifdef PADDLE_WITH_XPU
_KP
if
(
reduce_last_dim
)
{
block_dim
.
x
=
128
;
block_dim
.
x
=
64
;
block_dim
.
y
=
reduce_num
;
grid_dim
.
x
=
8
;
grid_dim
.
y
=
1
;
grid_dim
.
x
=
1
;
grid_dim
.
y
=
8
;
}
else
{
block_dim
.
x
=
128
;
block_dim
.
x
=
64
;
block_dim
.
y
=
left_num
;
grid_dim
.
x
=
8
;
grid_dim
.
y
=
1
;
...
...
@@ -661,7 +686,7 @@ __global__ void ReduceAnyKernel(const Tx* x,
store_offset
=
block
.
BlockIdY
()
*
left_num
+
left_idx
;
loop_left
=
min
(
block
.
GetLoopSize
(),
left_num
-
left_idx
);
stride_left
=
1
;
tid
=
threadIdx
.
x
;
tid
=
THREAD_ID_X
;
}
else
{
auto
block
=
ReduceIndexMapping
<
false
>
(
dim
);
input_idx
=
block
.
BlockIdY
()
*
block
.
BlockDimY
();
...
...
@@ -672,18 +697,20 @@ __global__ void ReduceAnyKernel(const Tx* x,
loop_left
=
min
(
block
.
GetLoopSize
(),
left_num
-
left_idx
);
stride_left
=
block
.
BlockDimX
()
*
block
.
GridDimX
();
store_offset
=
block
.
BlockIdY
()
*
left_num
+
left_idx
;
tid
=
threadIdx
.
y
;
tid
=
THREAD_ID_Y
;
}
// calculate the offset, means the addr where each thread really start.
// 1. reduce for each thread
MPType
input_compute
[
REDUCE_VEC_SIZE
];
Tx
input_reg
[
REDUCE_VEC_SIZE
];
int
input_idx_tmp
=
input_idx
;
for
(
int
i
=
0
;
i
<
loop_left
;
i
+=
stride_left
)
{
int
input_offset
=
left_index_calculator
(
left_idx
+
i
);
const
Tx
*
input
=
x
+
input_offset
;
const
_ptr_
Tx
*
input
=
x
+
input_offset
;
MPType
reduce_var
=
init
;
// load REDUCE_VEC_SIZE data once, and then compute
int
bound
=
reduce_num
-
(
REDUCE_VEC_SIZE
-
1
)
*
stride
;
input_idx
=
input_idx_tmp
;
for
(;
input_idx
+
block_size
<
bound
;
input_idx
+=
REDUCE_VEC_SIZE
*
stride
)
{
kps
::
ReadDataReduce
<
Tx
,
...
...
@@ -775,7 +802,7 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
int
loop_size
=
min
(
reduce_num
-
idy
,
blocking_size
);
int
store_offset
=
block
.
BlockIdY
()
*
left_num
+
idz
*
block
.
GridDimY
();
int
block_offset
=
idy
*
left_num
+
idz
*
reduce_num
;
const
Tx
*
input
=
x
+
block_offset
;
const
_ptr_
Tx
*
input
=
x
+
block_offset
;
Tx
reduce_input
;
for
(;
idx
<
size
;
idx
+=
stride
)
{
MPType
reduce_var
=
init
;
...
...
@@ -838,7 +865,7 @@ static void LaunchReduceKernel(const Tx* x_data,
const
ReduceOp
&
reducer
,
const
TransformOp
&
transform
,
MPType
init
,
gpuStream_t
stream
,
KPStream
stream
,
ReduceConfig
<
Ty
>
config
)
{
if
(
config
.
reduce_type
==
kReduceLastDim
)
{
int
stride_reduce
=
1
;
...
...
@@ -855,23 +882,24 @@ static void LaunchReduceKernel(const Tx* x_data,
0
);
dim
.
SetRem
(
config
.
reduce_num
%
config
.
block
.
x
,
0
,
0
);
#ifdef PADDLE_WITH_XPU
2
#ifdef PADDLE_WITH_XPU
_KP
ReduceAnyKernel
<
Tx
,
Ty
,
MPType
,
ReduceOp
,
TransformOp
,
OneDimIndexCal
><<<
8
,
128
,
stream
>>>
(
x_data
,
config
.
output_data
,
reducer
,
transform
,
init
,
config
.
reduce_num
,
config
.
left_num
,
config
.
reduce_last_dim
,
reduce_index_calculator
,
left_index_calculator
,
dim
);
OneDimIndexCal
><<<
8
,
64
,
0
,
stream
>>>
(
x_data
,
config
.
output_data
,
reducer
,
transform
,
init
,
config
.
reduce_num
,
config
.
left_num
,
config
.
reduce_last_dim
,
reduce_index_calculator
,
left_index_calculator
,
dim
);
#else
ReduceAnyKernel
<
Tx
,
Ty
,
...
...
@@ -910,13 +938,13 @@ static void LaunchReduceKernel(const Tx* x_data,
0
);
dim
.
SetRem
(
config
.
reduce_num
%
config
.
block
.
x
,
0
,
0
);
#ifdef PADDLE_WITH_XPU
2
#ifdef PADDLE_WITH_XPU
_KP
ReduceAnyKernel
<
Tx
,
Ty
,
MPType
,
ReduceOp
,
TransformOp
,
IndexCalculator
><<<
8
,
128
,
stream
>>>
(
IndexCalculator
><<<
8
,
64
,
0
,
stream
>>>
(
x_data
,
config
.
output_data
,
reducer
,
...
...
@@ -965,12 +993,13 @@ static void LaunchReduceKernel(const Tx* x_data,
kps
::
DimConfig
dim
=
kps
::
DimConfig
(
grid
.
x
,
grid
.
y
,
grid
.
z
,
block
.
x
,
config
.
grid
.
y
,
0
);
dim
.
SetRem
(
config
.
left_num
%
block
.
x
,
0
,
0
);
#ifdef PADDLE_WITH_XPU2
ReduceHigherDimKernel
<
Ty
,
Ty
,
MPType
,
ReduceOp
,
kps
::
IdentityFunctor
<
Ty
,
MPType
>><<<
8
,
128
,
stream
>>>
(
#ifdef PADDLE_WITH_XPU_KP
ReduceHigherDimKernel
<
Ty
,
Ty
,
MPType
,
ReduceOp
,
kps
::
IdentityFunctor
<
Ty
,
MPType
>><<<
8
,
64
,
0
,
stream
>>>
(
config
.
output_data
,
y_data
,
reducer
,
...
...
@@ -1011,7 +1040,7 @@ CubTensorReduceImpl(const Tx* x_data,
const
TransformOp
&
transform
,
int
reduce_num
,
const
paddle
::
platform
::
Place
&
place
,
gpuStream_t
stream
)
{
KPStream
stream
)
{
auto
reducer
=
ReduceOp
<
Ty
>
();
cub
::
TransformInputIterator
<
Ty
,
TransformOp
,
const
Tx
*>
trans_x
(
x_data
,
transform
);
...
...
@@ -1054,7 +1083,7 @@ CubTensorReduceImpl(const Tx* x_data,
const
TransformOp
&
transform
,
int
reduce_num
,
const
paddle
::
platform
::
Place
&
place
,
gpuStream_t
stream
)
{
KPStream
stream
)
{
PADDLE_THROW
(
phi
::
errors
::
InvalidArgument
(
"Tx should not be float16 when using cub::DeviceReduce::Reduce()."
));
}
...
...
@@ -1068,7 +1097,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
phi
::
DenseTensor
*
y
,
const
TransformOp
&
transform
,
const
std
::
vector
<
int
>&
origin_reduce_dims
,
gpuStream_t
stream
)
{
KPStream
stream
)
{
y
->
mutable_data
<
Ty
>
(
x
.
place
());
auto
x_dim
=
phi
::
vectorize
<
int
>
(
x
.
dims
());
...
...
@@ -1098,11 +1127,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
config
.
SetOutputData
(
y_data
,
x
.
place
(),
&
tmp
);
constexpr
bool
kIsTxFP16
=
std
::
is_same
<
Tx
,
phi
::
dtype
::
float16
>::
value
;
bool
use_cub_reduce
=
config
.
reduce_num
==
numel
&&
!
kIsTxFP16
;
#ifndef PADDLE_WITH_XPU_KP
if
(
use_cub_reduce
)
{
CubTensorReduceImpl
<
Tx
,
Ty
,
ReduceOp
,
TransformOp
>
(
x_data
,
y_data
,
transform
,
config
.
reduce_num
,
x
.
place
(),
stream
);
return
;
}
#endif
using
MPType
=
typename
kps
::
details
::
MPTypeTrait
<
Ty
>::
Type
;
auto
reducer
=
ReduceOp
<
MPType
>
();
...
...
@@ -1124,20 +1155,21 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
config
.
reduce_num
%
config
.
blocking_size
,
0
);
#ifdef PADDLE_WITH_XPU
2
#ifdef PADDLE_WITH_XPU
_KP
ReduceHigherDimKernel
<
Tx
,
Ty
,
MPType
,
ReduceOp
<
MPType
>
,
TransformOp
><<<
8
,
128
,
stream
>>>
(
x_data
,
config
.
output_data
,
reducer
,
transform
,
reducer
.
initial
(),
config
.
reduce_num
,
config
.
left_num
,
config
.
blocking_size
,
dim
);
TransformOp
><<<
8
,
64
,
0
,
stream
>>>
(
x_data
,
config
.
output_data
,
reducer
,
transform
,
reducer
.
initial
(),
config
.
reduce_num
,
config
.
left_num
,
config
.
blocking_size
,
dim
);
#else
ReduceHigherDimKernel
<
Tx
,
...
...
@@ -1163,13 +1195,13 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
kps
::
DimConfig
(
grid
.
x
,
grid
.
y
,
grid
.
z
,
block
.
x
,
config
.
grid
.
y
,
0
);
dim2
.
SetRem
(
config
.
left_num
%
config
.
block
.
x
,
0
,
0
);
#ifdef PADDLE_WITH_XPU
2
#ifdef PADDLE_WITH_XPU
_KP
ReduceHigherDimKernel
<
Ty
,
Ty
,
MPType
,
ReduceOp
<
MPType
>
,
kps
::
IdentityFunctor
<
Ty
,
MPType
>><<<
8
,
128
,
stream
>>>
(
kps
::
IdentityFunctor
<
Ty
,
MPType
>><<<
8
,
64
,
0
,
stream
>>>
(
config
.
output_data
,
y_data
,
reducer
,
...
...
@@ -1212,7 +1244,7 @@ void TensorReduceImpl(const phi::GPUContext& dev_ctx,
template
<
typename
T
,
template
<
typename
>
class
ReduceOp
,
template
<
typename
,
typename
>
class
TransformOp
>
void
Reduce
(
const
GPUContext
&
dev_ctx
,
void
Reduce
(
const
KPDevice
&
dev_ctx
,
const
DenseTensor
&
x
,
bool
reduce_all
,
const
std
::
vector
<
int64_t
>&
dims
,
...
...
@@ -1227,7 +1259,7 @@ void Reduce(const GPUContext& dev_ctx,
reduce_num
*=
(
x
.
dims
())[
i
];
}
gpuStream_t
stream
=
dev_ctx
.
stream
();
KPStream
stream
=
dev_ctx
.
stream
();
if
(
out_dtype
!=
phi
::
DataType
::
UNDEFINED
&&
out_dtype
!=
x
.
dtype
())
{
auto
tmp_tensor
=
phi
::
Cast
<
T
>
(
dev_ctx
,
x
,
out_dtype
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录