Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
8501fb00
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
8501fb00
编写于
5月 16, 2022
作者:
N
niuliling123
提交者:
GitHub
5月 16, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
delete rank switch in broadcast_function.h for compile (#42645)
上级
8ffebb5a
变更
5
显示空白变更内容
内联
并排
Showing
5 changed file
with
192 addition
and
385 deletion
+192
-385
paddle/fluid/operators/fused/attn_bias_add.cu.h
paddle/fluid/operators/fused/attn_bias_add.cu.h
+5
-6
paddle/phi/kernels/funcs/broadcast_function.h
paddle/phi/kernels/funcs/broadcast_function.h
+135
-219
paddle/phi/kernels/primitive/datamover_primitives.h
paddle/phi/kernels/primitive/datamover_primitives.h
+13
-54
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
+33
-105
paddle/phi/kernels/primitive/kernel_primitives.h
paddle/phi/kernels/primitive/kernel_primitives.h
+6
-1
未找到文件。
paddle/fluid/operators/fused/attn_bias_add.cu.h
浏览文件 @
8501fb00
...
...
@@ -51,8 +51,7 @@ template <typename InT, typename OutT, int ShapeSize, int VecSize,
__global__
void
BroadcastKernelBinary
(
const
InT
*
__restrict__
in0
,
const
InT
*
__restrict__
in1
,
OutT
*
out
,
phi
::
Array
<
bool
,
MAX_INPUT_NUM
>
use_broadcast
,
uint32_t
numel
,
phi
::
Array
<
kps
::
details
::
BroadcastConfig
<
ShapeSize
>
,
MAX_INPUT_NUM
>
configlists
,
phi
::
Array
<
kps
::
details
::
BroadcastConfig
,
MAX_INPUT_NUM
>
configlists
,
int
main_tid
,
int
tail_tid
,
Functor
func
)
{
int
fix
=
blockIdx
.
x
*
blockDim
.
x
*
VecSize
;
int
num
=
tail_tid
;
...
...
@@ -65,14 +64,14 @@ __global__ void BroadcastKernelBinary(
// load in0
if
(
use_broadcast
[
0
])
{
kernel_primitives
::
ReadDataBc
<
InT
,
VecSize
,
DATA_PER_THREAD
,
1
,
ShapeSize
>
(
kernel_primitives
::
ReadDataBc
<
InT
,
VecSize
,
DATA_PER_THREAD
,
1
>
(
arg0
,
in0
,
fix
,
configlists
[
0
],
numel
);
}
else
{
kernel_primitives
::
ReadData
<
InT
,
VecSize
,
1
,
1
>
(
arg0
,
in0
+
fix
,
num
);
}
// load in1
if
(
use_broadcast
[
1
])
{
kernel_primitives
::
ReadDataBc
<
InT
,
VecSize
,
DATA_PER_THREAD
,
1
,
ShapeSize
>
(
kernel_primitives
::
ReadDataBc
<
InT
,
VecSize
,
DATA_PER_THREAD
,
1
>
(
arg1
,
in1
,
fix
,
configlists
[
1
],
numel
);
}
else
{
kernel_primitives
::
ReadData
<
InT
,
VecSize
,
1
,
1
>
(
arg1
,
in1
+
fix
,
num
);
...
...
@@ -104,7 +103,7 @@ void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n,
int
main_tid
=
numel
/
(
data_per_thread
*
vec_size
*
threads
);
int
tail_tid
=
numel
%
(
data_per_thread
*
vec_size
*
threads
);
phi
::
Array
<
kps
::
details
::
BroadcastConfig
<
2
>
,
MAX_INPUT_NUM
>
configlists
;
phi
::
Array
<
kps
::
details
::
BroadcastConfig
,
MAX_INPUT_NUM
>
configlists
;
phi
::
Array
<
bool
,
MAX_INPUT_NUM
>
use_broadcast
;
use_broadcast
[
0
]
=
false
;
...
...
@@ -115,7 +114,7 @@ void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n,
// Here, dims are transposed due to the logic in BroadcastConfig.
std
::
vector
<
int64_t
>
input1_dims
=
{
n
,
1
};
std
::
vector
<
int64_t
>
out_dims
=
{
n
,
m
};
configlists
[
1
]
=
kps
::
details
::
BroadcastConfig
<
2
>
(
out_dims
,
input1_dims
,
2
);
configlists
[
1
]
=
kps
::
details
::
BroadcastConfig
(
out_dims
,
input1_dims
,
2
);
auto
func
=
AddFunctor
<
T
>
();
auto
stream
=
ctx
.
stream
();
...
...
paddle/phi/kernels/funcs/broadcast_function.h
浏览文件 @
8501fb00
...
...
@@ -223,31 +223,42 @@ struct DimensionsTransform {
}
};
template
<
typename
T
,
int
VecSize
,
int
Rank
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
LoadData
(
T
*
dst
,
const
_ptr_
T
*
src
,
uint32_t
block_offset
,
const
kps
::
details
::
BroadcastConfig
<
Rank
>
&
config
,
int
numel
,
int
num
,
int
need_broadcast
)
{
// numel : whole num of output
// num: how many data will be deal with in this time
if
(
need_broadcast
)
{
kps
::
ReadDataBc
<
T
,
VecSize
,
1
,
1
,
Rank
,
IsBoundary
>
(
dst
,
src
,
block_offset
,
config
,
numel
);
template
<
typename
InT
,
typename
OutT
,
int
NumOuts
=
1
>
int
GetVecsize
(
const
std
::
vector
<
const
DenseTensor
*>
&
ins
,
std
::
vector
<
DenseTensor
*>
*
outs
)
{
int
in_vec_size
=
4
;
int
out_vec_size
=
4
;
if
(
NumOuts
>
1
)
{
for
(
int
i
=
0
;
i
<
NumOuts
;
++
i
)
{
PADDLE_ENFORCE_EQ
(
(
*
outs
)[
i
]
->
dims
(),
(
*
outs
)[
0
]
->
dims
(),
phi
::
errors
::
InvalidArgument
(
"The shape of each output tensor shall be identical yet, but "
"%d-th output tensor`s shape is not."
,
i
));
out_vec_size
=
std
::
min
(
phi
::
GetVectorizedSize
<
OutT
>
((
*
outs
)[
i
]
->
data
<
OutT
>
()),
out_vec_size
);
}
}
else
{
kps
::
ReadData
<
T
,
VecSize
,
1
,
1
,
IsBoundary
>
(
dst
,
src
+
block_offset
,
num
);
out_vec_size
=
phi
::
GetVectorizedSize
<
OutT
>
((
*
outs
)[
0
]
->
data
<
OutT
>
());
}
for
(
auto
*
in
:
ins
)
{
auto
temp_size
=
phi
::
GetVectorizedSize
<
InT
>
(
in
->
data
<
InT
>
());
in_vec_size
=
in
->
dims
()
==
(
*
outs
)[
0
]
->
dims
()
?
std
::
min
(
temp_size
,
in_vec_size
)
:
in_vec_size
;
}
return
std
::
min
(
out_vec_size
,
in_vec_size
);
}
template
<
typename
T
,
int
VecSize
,
int
Rank
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
VecSize
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
LoadData
(
T
*
dst
,
const
_ptr_
T
*
src
,
uint32_t
block_offset
,
const
kps
::
details
::
BroadcastConfig
<
Rank
>
&
config
,
const
kps
::
details
::
BroadcastConfig
&
config
,
int
numel
,
int
num
,
int
need_broadcast
,
...
...
@@ -255,7 +266,7 @@ __device__ __forceinline__ void LoadData(
// numel : whole num of output
// num: how many data will be deal with in this time
if
(
need_broadcast
)
{
kps
::
ReadDataBc
<
T
,
VecSize
,
1
,
1
,
Rank
,
IsBoundary
>
(
kps
::
ReadDataBc
<
T
,
VecSize
,
1
,
1
,
IsBoundary
>
(
dst
,
src
,
block_offset
,
config
,
numel
,
read_lens
);
}
else
{
kps
::
ReadData
<
T
,
VecSize
,
1
,
1
,
IsBoundary
>
(
...
...
@@ -269,14 +280,13 @@ template <typename InT,
int
Arity
,
int
NumOuts
,
int
VecSize
,
int
Rank
,
bool
IsBoundary
=
false
>
__device__
void
VectorizedBroadcastKernelImpl
(
const
phi
::
Array
<
const
_ptr_
InT
*
__restrict__
,
Arity
>
&
ins
,
phi
::
Array
<
_ptr_
OutT
*
,
NumOuts
>
outs
,
const
phi
::
Array
<
int
,
Arity
>
&
use_broadcast
,
uint32_t
numel
,
const
phi
::
Array
<
kps
::
details
::
BroadcastConfig
<
Rank
>
,
Arity
>
&
configs
,
const
phi
::
Array
<
kps
::
details
::
BroadcastConfig
,
Arity
>
&
configs
,
int
num
,
int
block_offset
,
int
read_lens
,
...
...
@@ -287,7 +297,7 @@ __device__ void VectorizedBroadcastKernelImpl(
#pragma unroll
for
(
int
i
=
0
;
i
<
Arity
;
i
++
)
{
kps
::
Init
<
InT
,
VecSize
>
(
args
[
i
],
static_cast
<
InT
>
(
1.0
f
),
read_lens
);
LoadData
<
InT
,
VecSize
,
Rank
,
IsBoundary
>
(
args
[
i
],
LoadData
<
InT
,
VecSize
,
IsBoundary
>
(
args
[
i
],
ins
[
i
],
block_offset
,
configs
[
i
],
...
...
@@ -315,14 +325,13 @@ template <typename InT,
typename
Functor
,
int
Arity
,
int
NumOuts
,
int
VecSize
,
int
Rank
>
int
VecSize
>
__global__
void
VectorizedBroadcastKernel
(
phi
::
Array
<
const
_ptr_
InT
*
__restrict__
,
Arity
>
ins
,
phi
::
Array
<
_ptr_
OutT
*
,
NumOuts
>
outs
,
phi
::
Array
<
int
,
Arity
>
use_broadcast
,
uint32_t
numel
,
phi
::
Array
<
kps
::
details
::
BroadcastConfig
<
Rank
>
,
Arity
>
configs
,
phi
::
Array
<
kps
::
details
::
BroadcastConfig
,
Arity
>
configs
,
int
main_offset
,
int
tail_tid
,
int
read_lens
,
...
...
@@ -338,7 +347,6 @@ __global__ void VectorizedBroadcastKernel(
Arity
,
NumOuts
,
VecSize
,
Rank
,
false
>
(
ins
,
outs
,
use_broadcast
,
...
...
@@ -357,7 +365,6 @@ __global__ void VectorizedBroadcastKernel(
Arity
,
NumOuts
,
VecSize
,
Rank
,
true
>
(
ins
,
outs
,
use_broadcast
,
...
...
@@ -376,7 +383,6 @@ __global__ void VectorizedBroadcastKernel(
Arity
,
NumOuts
,
VecSize
,
Rank
,
false
>
(
ins
,
outs
,
use_broadcast
,
...
...
@@ -393,7 +399,6 @@ __global__ void VectorizedBroadcastKernel(
Arity
,
NumOuts
,
VecSize
,
Rank
,
true
>
(
ins
,
outs
,
use_broadcast
,
...
...
@@ -412,15 +417,14 @@ template <typename InT,
typename
Functor
,
int
Arity
,
int
NumOuts
,
int
VecSize
,
int
Rank
>
void
LaunchBroadcastKernel
(
const
KPDevice
&
ctx
,
int
VecSize
>
void
LaunchBroadcastKernel
(
const
KPDevice
&
ctx
,
const
std
::
vector
<
const
DenseTensor
*>
&
ins
,
std
::
vector
<
DenseTensor
*>
*
outs
,
Functor
func
,
DimensionsTransform
merge_dim
s
)
{
const
phi
::
Array
<
kps
::
details
::
BroadcastConfig
,
Arity
>
&
config
s
)
{
int
numel
=
(
*
outs
)[
0
]
->
numel
();
phi
::
Array
<
kps
::
details
::
BroadcastConfig
<
Rank
>
,
Arity
>
configs
;
phi
::
Array
<
int
,
Arity
>
use_broadcast
;
phi
::
Array
<
const
_ptr_
InT
*
__restrict__
,
Arity
>
ins_data
;
phi
::
Array
<
_ptr_
OutT
*
,
NumOuts
>
outs_data
;
...
...
@@ -432,132 +436,41 @@ void LaunchBroadcastKernel(const KPDevice &ctx,
for
(
int
i
=
0
;
i
<
Arity
;
i
++
)
{
use_broadcast
[
i
]
=
(
ins
[
i
]
->
numel
()
!=
numel
);
ins_data
[
i
]
=
(
const
_ptr_
InT
*
)(
ins
[
i
]
->
data
<
InT
>
());
#ifdef PADDLE_WITH_XPU_KP
if
(
i
==
0
)
{
configs
[
i
]
=
kps
::
details
::
BroadcastConfig
<
Rank
>
(
merge_dims
.
out_dims
,
merge_dims
.
in_dims
[
0
],
merge_dims
.
in_dims
[
1
],
merge_dims
.
dim_size
);
}
else
if
(
i
==
1
)
{
configs
[
i
]
=
kps
::
details
::
BroadcastConfig
<
Rank
>
(
merge_dims
.
out_dims
,
merge_dims
.
in_dims
[
1
],
merge_dims
.
in_dims
[
0
],
merge_dims
.
dim_size
);
}
#else
if
(
use_broadcast
[
i
])
{
// get the broadcast config,
// if data shape is[m, n], then you should set data_dim = {n, m}
// eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3}
configs
[
i
]
=
kps
::
details
::
BroadcastConfig
<
Rank
>
(
merge_dims
.
out_dims
,
merge_dims
.
in_dims
[
i
],
merge_dims
.
dim_size
);
}
#endif
}
#ifdef PADDLE_WITH_XPU_KP
const
int
threads
=
64
;
const
int
blocks
=
8
;
int
read_lens
=
configs
[
0
].
buf_len
;
auto
stream
=
ctx
.
x_context
()
->
xpu_stream
;
int
main_offset
=
(
numel
/
(
read_lens
*
threads
))
*
read_lens
*
threads
;
int
tail_tid
=
numel
%
(
read_lens
*
threads
);
auto
stream
=
ctx
.
x_context
()
->
xpu_stream
;
if
(
configs
[
0
].
cmp_type
!=
kps
::
details
::
OptType
::
CanNotOptimize
)
{
main_offset
=
numel
;
VectorizedBroadcastKernel
<
InT
,
OutT
,
Functor
,
Arity
,
NumOuts
,
512
,
Rank
><<<
blocks
,
threads
,
stream
>>>
(
ins_data
,
outs_data
,
use_broadcast
,
numel
,
configs
,
main_offset
,
tail_tid
,
read_lens
,
func
);
}
else
{
VectorizedBroadcastKernel
<
InT
,
OutT
,
Functor
,
Arity
,
NumOuts
,
256
,
Rank
><<<
blocks
,
threads
,
stream
>>>
(
ins_data
,
outs_data
,
use_broadcast
,
numel
,
configs
,
main_offset
,
tail_tid
,
read_lens
,
func
);
}
#else
const
int
threads
=
256
;
int
blocks
=
((
numel
+
VecSize
-
1
)
/
VecSize
+
threads
-
1
)
/
threads
;
int
main_offset
=
(
numel
/
(
VecSize
*
threads
))
*
VecSize
*
threads
;
int
tail_tid
=
numel
%
(
VecSize
*
threads
);
auto
gpu_config
=
phi
::
backends
::
gpu
::
GetGpuLaunchConfig1D
(
ctx
,
numel
,
VecSize
);
int
read_lens
=
VecSize
;
auto
stream
=
ctx
.
stream
();
auto
threads
=
gpu_config
.
thread_per_block
;
auto
blocks
=
gpu_config
.
block_per_grid
;
int
main_offset
=
(
numel
/
(
read_lens
*
gpu_config
.
GetBlockSize
()))
*
read_lens
*
gpu_config
.
GetBlockSize
();
int
tail_tid
=
numel
%
(
read_lens
*
gpu_config
.
GetBlockSize
());
#endif
VectorizedBroadcastKernel
<
InT
,
OutT
,
Functor
,
Arity
,
NumOuts
,
VecSize
,
Rank
><<<
blocks
,
threads
,
0
,
stream
>>>
(
ins_data
,
VecSize
><<<
blocks
,
threads
,
0
,
stream
>>>
(
ins_data
,
outs_data
,
use_broadcast
,
numel
,
configs
,
main_offset
,
tail_tid
,
VecSize
,
read_lens
,
func
);
#endif
}
template
<
typename
InT
,
typename
OutT
,
typename
Functor
,
int
Arity
,
int
NumOuts
,
int
VecSize
>
void
BroadcastKernelForDifferentDimSize
(
const
KPDevice
&
ctx
,
const
std
::
vector
<
const
DenseTensor
*>
&
ins
,
std
::
vector
<
DenseTensor
*>
*
outs
,
int
axis
,
Functor
func
)
{
const
auto
merge_dims
=
DimensionsTransform
(
ins
,
(
*
outs
)[
0
]
->
dims
(),
axis
);
#define CALL_BROADCAST_FOR_DIM_SIZE(rank) \
case rank: { \
LaunchBroadcastKernel<InT, OutT, Functor, Arity, NumOuts, VecSize, rank>( \
ctx, ins, outs, func, merge_dims); \
} break;
switch
(
merge_dims
.
dim_size
)
{
CALL_BROADCAST_FOR_DIM_SIZE
(
1
);
CALL_BROADCAST_FOR_DIM_SIZE
(
2
);
CALL_BROADCAST_FOR_DIM_SIZE
(
3
);
CALL_BROADCAST_FOR_DIM_SIZE
(
4
);
CALL_BROADCAST_FOR_DIM_SIZE
(
5
);
CALL_BROADCAST_FOR_DIM_SIZE
(
6
);
CALL_BROADCAST_FOR_DIM_SIZE
(
7
);
CALL_BROADCAST_FOR_DIM_SIZE
(
8
);
default:
{
PADDLE_THROW
(
phi
::
errors
::
InvalidArgument
(
"The maximum dimension of input tensor is expected to be less than "
"%d, but received %d."
,
merge_dims
.
dim_size
,
phi
::
DDim
::
kMaxRank
));
}
}
#undef CALL_BROADCAST_FOR_DIM_SIZE
}
template
<
ElementwiseType
ET
,
...
...
@@ -574,79 +487,82 @@ void BroadcastKernelForDifferentVecSize(
using
Traits
=
paddle
::
platform
::
FunctionTraits
<
Functor
>
;
const
int
kArity
=
Traits
::
has_pointer_args
?
static_cast
<
int
>
(
ET
)
:
Traits
::
arity
;
PADDLE_ENFORCE_EQ
(
ins
.
size
(),
PADDLE_ENFORCE_EQ
(
ins
.
size
(),
kArity
,
phi
::
errors
::
InvalidArgument
(
"The number of inputs is expected to be equal to the "
"arity of functor. But received: the number of inputs "
phi
::
errors
::
InvalidArgument
(
"The number of inputs is expected to be "
"equal to the "
"arity of functor. But recieved: the "
"number of inputs "
"is %d, the arity of functor is %d."
,
ins
.
size
(),
kArity
));
PADDLE_ENFORCE_LE
(
kArity
,
PADDLE_ENFORCE_LE
(
kArity
,
3
,
phi
::
errors
::
InvalidArgument
(
"Currently only broadcast of ternary is
supported "
phi
::
errors
::
InvalidArgument
(
"Currently only broadcast of ternary is "
"
supported "
"and verified, but received %d."
,
kArity
));
PADDLE_ENFORCE_EQ
(
outs
->
size
(),
PADDLE_ENFORCE_EQ
(
outs
->
size
(),
NumOuts
,
phi
::
errors
::
InvalidArgument
(
"Number of outputs shall equal to number of functions, "
"but number of outputs is %d, of functions is %d."
,
phi
::
errors
::
InvalidArgument
(
"Number of outputs shall equal to number "
"of functions, "
"but number of outputs is %d, of "
"functions is %d."
,
outs
->
size
(),
NumOuts
));
int
in_vec_size
=
4
;
int
out_vec_size
=
4
;
if
(
NumOuts
>
1
)
{
for
(
int
i
=
0
;
i
<
NumOuts
;
++
i
)
{
// mergedim and get vec_size
const
auto
merge_dims
=
DimensionsTransform
(
ins
,
(
*
outs
)[
0
]
->
dims
(),
axis
);
phi
::
Array
<
kps
::
details
::
BroadcastConfig
,
kArity
>
configs
;
// get vec_size
#ifdef PADDLE_WITH_XPU_KP
PADDLE_ENFORCE_EQ
(
(
*
outs
)[
i
]
->
dims
(),
(
*
outs
)[
0
]
->
dims
()
,
ins
.
size
(),
2
,
phi
::
errors
::
InvalidArgument
(
"The shape of each output tensor shall be identical yet, but "
"%d-th output tensor`s shape is not."
,
i
));
out_vec_size
=
std
::
min
(
phi
::
GetVectorizedSize
<
OutT
>
((
*
outs
)[
i
]
->
data
<
OutT
>
()),
out_vec_size
);
}
}
else
{
out_vec_size
=
phi
::
GetVectorizedSize
<
OutT
>
((
*
outs
)[
0
]
->
data
<
OutT
>
());
"XPU only support inputs is 2, but received %d"
,
ins
.
size
()));
configs
[
0
]
=
kps
::
details
::
BroadcastConfig
(
merge_dims
.
out_dims
,
merge_dims
.
in_dims
[
0
],
merge_dims
.
in_dims
[
1
],
merge_dims
.
dim_size
);
configs
[
1
]
=
kps
::
details
::
BroadcastConfig
(
merge_dims
.
out_dims
,
merge_dims
.
in_dims
[
1
],
merge_dims
.
in_dims
[
0
],
merge_dims
.
dim_size
);
auto
type
=
kps
::
details
::
OptType
::
CanNotOptimize
;
bool
is_optimize
=
configs
[
0
].
cmp_type
!=
type
;
int
vec_size
=
is_optimize
?
VecSizeL
:
VecSizeM
;
#else
for
(
int
i
=
0
;
i
<
kArity
;
i
++
)
{
// get the broadcast config,
// if data shape is[m, n], then you should set data_dim = {n, m}
// eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3}
if
(
ins
[
i
]
->
numel
())
{
configs
[
i
]
=
kps
::
details
::
BroadcastConfig
(
merge_dims
.
out_dims
,
merge_dims
.
in_dims
[
i
],
merge_dims
.
dim_size
);
}
for
(
auto
*
in
:
ins
)
{
auto
temp_size
=
phi
::
GetVectorizedSize
<
InT
>
(
in
->
data
<
InT
>
());
in_vec_size
=
in
->
dims
()
==
(
*
outs
)[
0
]
->
dims
()
?
std
::
min
(
temp_size
,
in_vec_size
)
:
in_vec_size
;
}
int
vec_size
=
std
::
min
(
out_vec_size
,
in_vec_size
);
int
vec_size
=
GetVecsize
<
InT
,
OutT
,
NumOuts
>
(
ins
,
outs
);
#endif
switch
(
vec_size
)
{
case
4
:
{
BroadcastKernelForDifferentDimSize
<
InT
,
OutT
,
Functor
,
kArity
,
NumOuts
,
4
>
(
ctx
,
ins
,
outs
,
axis
,
func
);
case
VecSizeL
:
{
LaunchBroadcastKernel
<
InT
,
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeL
>
(
ctx
,
ins
,
outs
,
func
,
configs
);
break
;
}
case
2
:
{
BroadcastKernelForDifferentDimSize
<
InT
,
OutT
,
Functor
,
kArity
,
NumOuts
,
2
>
(
ctx
,
ins
,
outs
,
axis
,
func
);
case
VecSizeM
:
{
LaunchBroadcastKernel
<
InT
,
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeM
>
(
ctx
,
ins
,
outs
,
func
,
configs
);
break
;
}
case
1
:
{
BroadcastKernelForDifferentDimSize
<
InT
,
OutT
,
Functor
,
kArity
,
NumOuts
,
1
>
(
ctx
,
ins
,
outs
,
axis
,
func
);
case
VecSizeS
:
{
LaunchBroadcastKernel
<
InT
,
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeS
>
(
ctx
,
ins
,
outs
,
func
,
configs
);
break
;
}
default:
{
...
...
paddle/phi/kernels/primitive/datamover_primitives.h
浏览文件 @
8501fb00
...
...
@@ -82,10 +82,10 @@ struct FastDivMod {
* index of the output data. if input or output shape is [dim0, dim1] then dims
* must be [dim1, dim0].
*/
template
<
int
kDims
>
struct
BroadcastConfig
{
FastDivMod
divmoders
[
kDims
];
FastDivMod
divmoders
[
phi
::
DDim
::
kMaxRank
];
uint32_t
strides
[
phi
::
DDim
::
kMaxRank
];
int
kDims
;
HOSTDEVICE
BroadcastConfig
()
{}
HOSTDEVICE
BroadcastConfig
(
const
std
::
vector
<
int64_t
>&
out_dims
,
...
...
@@ -109,7 +109,7 @@ struct BroadcastConfig {
std
::
multiplies
<
int64_t
>
())
:
strides_in
[
i
];
}
kDims
=
dim_size
;
memcpy
(
strides
,
strides_in
.
data
(),
kDims
*
sizeof
(
uint32_t
));
memcpy
(
divmoders
,
divmoders_in
.
data
(),
kDims
*
sizeof
(
FastDivMod
));
}
...
...
@@ -436,17 +436,12 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadDataBc
(
T
*
dst
,
const
T
*
__restrict__
src
,
uint32_t
block_offset
,
details
::
BroadcastConfig
<
Rank
>
config
,
const
details
::
BroadcastConfig
&
config
,
int
total_num_output
,
int
stride_nx
,
int
stride_ny
)
{
...
...
@@ -465,7 +460,8 @@ __device__ __forceinline__ void ReadDataBc(
}
}
#pragma unroll
for
(
int
i
=
0
;
i
<
Rank
;
++
i
)
{
for
(
int
i
=
0
;
i
<
phi
::
DDim
::
kMaxRank
;
++
i
)
{
if
(
i
>=
config
.
kDims
)
break
;
auto
fast_divmoder
=
config
.
divmoders
[
i
].
Divmod
(
index_output
);
index_output
=
fast_divmoder
.
val
[
0
];
index_src
+=
fast_divmoder
.
val
[
1
]
*
config
.
strides
[
i
];
...
...
@@ -785,53 +781,14 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output.
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadDataBc
(
T
*
dst
,
const
T
*
__restrict__
src
,
uint32_t
block_offset
,
details
::
BroadcastConfig
<
Rank
>
config
,
int
total_num_output
)
{
uint32_t
thread_offset
=
block_offset
+
threadIdx
.
x
*
NX
;
uint32_t
index_src
=
0
;
#pragma unroll
for
(
uint32_t
nx
=
0
;
nx
<
NX
;
++
nx
)
{
uint32_t
index_output
=
thread_offset
+
nx
;
index_src
=
0
;
if
(
IsBoundary
)
{
if
(
index_output
>=
total_num_output
)
{
break
;
}
}
#pragma unroll
for
(
int
i
=
0
;
i
<
Rank
;
++
i
)
{
auto
fast_divmoder
=
config
.
divmoders
[
i
].
Divmod
(
index_output
);
index_output
=
fast_divmoder
.
val
[
0
];
index_src
+=
fast_divmoder
.
val
[
1
]
*
config
.
strides
[
i
];
}
dst
[
nx
]
=
src
[
index_src
];
}
}
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadDataBc
(
T
*
dst
,
const
T
*
__restrict__
src
,
uint32_t
block_offset
,
details
::
BroadcastConfig
<
Rank
>
config
,
const
details
::
BroadcastConfig
&
config
,
int
total_num_output
,
int
read_lens
)
{
int
read_lens
=
NX
)
{
uint32_t
thread_offset
=
block_offset
+
threadIdx
.
x
*
NX
;
uint32_t
index_src
=
0
;
...
...
@@ -845,7 +802,8 @@ __device__ __forceinline__ void ReadDataBc(
}
}
#pragma unroll
for
(
int
i
=
0
;
i
<
Rank
;
++
i
)
{
for
(
int
i
=
0
;
i
<
phi
::
DDim
::
kMaxRank
;
++
i
)
{
if
(
i
>=
config
.
kDims
)
break
;
auto
fast_divmoder
=
config
.
divmoders
[
i
].
Divmod
(
index_output
);
index_output
=
fast_divmoder
.
val
[
0
];
index_src
+=
fast_divmoder
.
val
[
1
]
*
config
.
strides
[
i
];
...
...
@@ -853,6 +811,7 @@ __device__ __forceinline__ void ReadDataBc(
dst
[
nx
]
=
src
[
index_src
];
}
}
/**
* @brief Initialize register with data index.
*
...
...
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
浏览文件 @
8501fb00
...
...
@@ -65,7 +65,6 @@ struct alignas(sizeof(T) * VecSize) VectorType {
* must be [dim1, dim0].
*/
#pragma pack(4)
template
<
int
kDims
>
struct
BroadcastConfig
{
int
strides_in
[
phi
::
DDim
::
kMaxRank
];
int
strides_out
[
phi
::
DDim
::
kMaxRank
];
...
...
@@ -78,7 +77,7 @@ struct BroadcastConfig {
int
n
=
1
;
int
k
=
1
;
int
buf_len
=
0
;
int
kDims
;
HOSTDEVICE
BroadcastConfig
()
{}
HOSTDEVICE
BroadcastConfig
(
const
std
::
vector
<
int64_t
>&
out_dims
,
...
...
@@ -99,7 +98,7 @@ struct BroadcastConfig {
for
(
int
i
=
0
;
i
<
dim_size
;
i
++
)
{
dim_tmp
[
i
]
=
in_dims
[
i
];
}
kDims
=
dim_size
;
memcpy
(
strides_in
,
strides_in_tmp
.
data
(),
kDims
*
sizeof
(
int
));
memcpy
(
strides_out
,
strides_out_tmp
.
data
(),
kDims
*
sizeof
(
int
));
memcpy
(
in_dim
,
dim_tmp
.
data
(),
kDims
*
sizeof
(
int
));
...
...
@@ -551,7 +550,6 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x core_num(), boundary judgment is required to avoid memory access
...
...
@@ -567,16 +565,11 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadDataBc
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
uint32_t
block_offset
,
details
::
BroadcastConfig
<
Rank
>
config
,
const
details
::
BroadcastConfig
&
config
,
int
total_num_output
,
int
stride_nx
,
int
stride_ny
)
{
...
...
@@ -882,60 +875,6 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) {
}
}
/**
* @brief Read 1D data from global memory to register with broadcast form.
*
* @template paraments
* T: The type of data stored in the global memory.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x core_num(), boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The register pointer of the thread, the size is NX * NY.
* src: The original input data pointer of kernel.
* block_offset: The data offset of this block, core_num() * blockIdx.x * NX;
* config: Calculation configuration of broadcast. It is used to calculate the
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output.
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadDataBc
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
uint32_t
block_offset
,
const
details
::
BroadcastConfig
<
Rank
>&
config
,
int
total_num_output
)
{
int
thread_offset
=
block_offset
+
core_id
()
*
NX
;
int
index_src
=
0
;
__local__
T
in_temp
;
#pragma unroll
for
(
int
nx
=
0
;
nx
<
NX
;
++
nx
)
{
int
index_output
=
thread_offset
+
nx
;
index_src
=
0
;
if
(
IsBoundary
)
{
if
(
index_output
>=
total_num_output
)
{
break
;
}
}
index_src
=
config
(
index_output
);
GM2LM
(
src
+
index_src
,
&
in_temp
,
sizeof
(
T
));
dst
[
nx
]
=
in_temp
;
}
}
/**
* @brief Read data from global memory to local memory with broadcast
* {m, 1, k}-> {m, n, k} form.
...
...
@@ -952,12 +891,12 @@ __device__ __inline__ void ReadDataBc(
* coordinate mapping relationship between output data and input data.
* read_lens: The number of data continuously loaded by each thread.
*/
template
<
typename
T
,
int
Rank
>
template
<
typename
T
>
__device__
__inline__
void
ReadDataBcM1kMnk
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
int
thread_offset
,
const
details
::
BroadcastConfig
<
Rank
>
&
config
,
const
details
::
BroadcastConfig
&
config
,
int
read_lens
)
{
int
index_output
=
thread_offset
;
int
index_base
=
config
(
index_output
);
...
...
@@ -999,12 +938,12 @@ __device__ __inline__ void ReadDataBcM1kMnk(
* coordinate mapping relationship between output data and input data.
* read_lens: The number of data continuously loaded by each thread.
*/
template
<
typename
T
,
int
Rank
>
template
<
typename
T
>
__device__
__inline__
void
ReadDataBcM1Mn
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
int
thread_offset
,
const
details
::
BroadcastConfig
<
Rank
>
&
config
,
const
details
::
BroadcastConfig
&
config
,
int
read_lens
)
{
int
index_output
=
thread_offset
;
int
index_base
=
config
(
index_output
);
...
...
@@ -1027,7 +966,6 @@ __device__ __inline__ void ReadDataBcM1Mn(
*
* @template paraments
* T: Data type of register.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
*
* @param:
* dst: The register pointer of the thread, the size is NX.
...
...
@@ -1037,12 +975,12 @@ __device__ __inline__ void ReadDataBcM1Mn(
* coordinate mapping relationship between output data and input data.
* read_lens: The number of data continuously loaded by each thread.
*/
template
<
typename
T
,
int
Rank
>
template
<
typename
T
>
__device__
__inline__
void
ReadDataBc1NMn
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
int
thread_offset
,
const
details
::
BroadcastConfig
<
Rank
>
&
config
,
const
details
::
BroadcastConfig
&
config
,
int
read_lens
)
{
int
index_output
=
thread_offset
;
int
index_base
=
config
(
index_output
);
...
...
@@ -1075,7 +1013,6 @@ __device__ __inline__ void ReadDataBc1NMn(
*
* @template paraments
* T: Data type of register.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
*
* @param:
* dst: The register pointer of the thread, the size is NX.
...
...
@@ -1085,12 +1022,12 @@ __device__ __inline__ void ReadDataBc1NMn(
* coordinate mapping relationship between output data and input data.
* read_lens: The number of data continuously loaded by each thread.
*/
template
<
typename
T
,
int
Rank
>
template
<
typename
T
>
__device__
__inline__
void
ReadDataBc1N1Mnk
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
int
thread_offset
,
const
details
::
BroadcastConfig
<
Rank
>
&
config
,
const
details
::
BroadcastConfig
&
config
,
int
read_lens
)
{
int
index_output
=
thread_offset
;
int
index_base
=
config
(
index_output
);
...
...
@@ -1130,7 +1067,6 @@ __device__ __inline__ void ReadDataBc1N1Mnk(
*
* @template paraments
* T: Data type of register.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
*
* @param:
* dst: The register pointer of the thread, the size is NX.
...
...
@@ -1140,12 +1076,11 @@ __device__ __inline__ void ReadDataBc1N1Mnk(
* coordinate mapping relationship between output data and input data.
* read_lens: The number of data continuously loaded by each thread.
*/
template
<
typename
T
,
int
Rank
>
__device__
__inline__
void
ReadDataBc1N
(
T
*
dst
,
template
<
typename
T
>
__device__
__inline__
void
ReadDataBc1N
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
int
thread_offset
,
const
details
::
BroadcastConfig
<
Rank
>
&
config
,
const
details
::
BroadcastConfig
&
config
,
int
read_lens
)
{
int
index_output
=
thread_offset
;
int
index_base
=
config
(
index_output
);
...
...
@@ -1174,12 +1109,12 @@ __device__ __inline__ void ReadDataBc1N(
* total_num_output: Total number of original output.
* read_lens: The number of data continuously loaded by each thread.
*/
template
<
typename
T
,
int
Rank
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadDataBcCanNotCmp
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
int
thread_offset
,
const
details
::
BroadcastConfig
<
Rank
>
&
config
,
const
details
::
BroadcastConfig
&
config
,
int
total_num_output
,
int
read_lens
)
{
int
index_output
=
thread_offset
;
...
...
@@ -1215,7 +1150,6 @@ __device__ __inline__ void ReadDataBcCanNotCmp(
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x core_num(), boundary judgment is required to avoid memory access
...
...
@@ -1230,33 +1164,27 @@ __device__ __inline__ void ReadDataBcCanNotCmp(
* read_lens: The number of data continuously loaded by each thread.
* total_num_output: Total number of original output.
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadDataBc
(
T
*
dst
,
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadDataBc
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
uint32_t
block_offset
,
const
details
::
BroadcastConfig
<
Rank
>
&
config
,
const
details
::
BroadcastConfig
&
config
,
int
total_num_output
,
int
read_lens
)
{
int
thread_offset
=
block_offset
+
core_id
()
*
read_lens
;
if
(
config
.
cmp_type
==
details
::
OptType
::
MNK_M1K
)
{
ReadDataBcM1kMnk
<
T
,
Rank
>
(
dst
,
src
,
thread_offset
,
config
,
read_lens
);
ReadDataBcM1kMnk
<
T
>
(
dst
,
src
,
thread_offset
,
config
,
read_lens
);
}
else
if
(
config
.
cmp_type
==
details
::
OptType
::
N_1
)
{
ReadDataBc1N
<
T
,
Rank
>
(
dst
,
src
,
thread_offset
,
config
,
read_lens
);
ReadDataBc1N
<
T
>
(
dst
,
src
,
thread_offset
,
config
,
read_lens
);
}
else
if
(
config
.
cmp_type
==
details
::
OptType
::
MN_M
)
{
ReadDataBcM1Mn
<
T
,
Rank
>
(
dst
,
src
,
thread_offset
,
config
,
read_lens
);
ReadDataBcM1Mn
<
T
>
(
dst
,
src
,
thread_offset
,
config
,
read_lens
);
}
else
if
(
config
.
cmp_type
==
details
::
OptType
::
MN_N
)
{
ReadDataBc1NMn
<
T
,
Rank
>
(
dst
,
src
,
thread_offset
,
config
,
read_lens
);
ReadDataBc1NMn
<
T
>
(
dst
,
src
,
thread_offset
,
config
,
read_lens
);
}
else
if
(
config
.
cmp_type
==
details
::
OptType
::
MNK_1N1
)
{
ReadDataBc1N1Mnk
<
T
,
Rank
>
(
dst
,
src
,
thread_offset
,
config
,
read_lens
);
ReadDataBc1N1Mnk
<
T
>
(
dst
,
src
,
thread_offset
,
config
,
read_lens
);
}
else
{
ReadDataBcCanNotCmp
<
T
,
Rank
,
IsBoundary
>
(
ReadDataBcCanNotCmp
<
T
,
IsBoundary
>
(
dst
,
src
,
thread_offset
,
config
,
total_num_output
,
read_lens
);
}
}
...
...
paddle/phi/kernels/primitive/kernel_primitives.h
浏览文件 @
8501fb00
...
...
@@ -40,7 +40,9 @@
#define GRID_NUM_X cluster_num()
#define GRID_NUM_Y 0
#define GRID_NUM_Z 0
#define VecSizeL 512
#define VecSizeM 256
#define VecSizeS 128
#else
#define KPStream gpuStream_t
...
...
@@ -64,6 +66,9 @@
#define GRID_NUM_Y gridDim.y
#define GRID_NUM_Z gridDim.z
#define VecSizeL 4
#define VecSizeM 2
#define VecSizeS 1
#endif
// include file
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录