Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
aa0c885a
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
aa0c885a
编写于
7月 07, 2022
作者:
S
shixingbo
提交者:
GitHub
7月 07, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Optimized the performance of broadcast for kp XPU2 (#44091)
上级
1e6137b5
变更
2
隐藏空白更改
内联
并排
Showing
2 changed files
with
35 additions
and
19 deletions
+35
-19
paddle/phi/kernels/funcs/elementwise_base.h
paddle/phi/kernels/funcs/elementwise_base.h
+8
-16
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
+27
-3
未找到文件。
paddle/phi/kernels/funcs/elementwise_base.h
100644 → 100755
浏览文件 @
aa0c885a
...
...
@@ -558,6 +558,9 @@ struct VecSizeGetter {
template
<
typename
OutT
,
typename
Functor
>
int
GetVectorizedSizeForTensors
(
const
std
::
vector
<
const
DenseTensor
*>
&
ins
,
const
std
::
vector
<
DenseTensor
*>
&
outs
)
{
#ifdef PADDLE_WITH_XPU_KP
int
vec_size
=
256
;
#else
using
Traits
=
paddle
::
platform
::
FunctionTraits
<
Functor
>
;
using
ArgsT
=
typename
Traits
::
ArgsTuple
;
const
int
Arity
=
Traits
::
arity
;
...
...
@@ -569,6 +572,7 @@ int GetVectorizedSizeForTensors(const std::vector<const DenseTensor *> &ins,
vec_size
=
std
::
min
<
int
>
(
vec_size
,
phi
::
GetVectorizedSize
((
*
iter
)
->
data
<
OutT
>
()));
}
#endif
return
vec_size
;
}
...
...
@@ -784,7 +788,6 @@ template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
void
LaunchElementwiseCudaKernel
(
const
KPDevice
&
ctx
,
const
std
::
vector
<
const
DenseTensor
*>
&
ins
,
std
::
vector
<
DenseTensor
*>
*
outs
,
int
read_lens
,
Functor
func
)
{
// There is at least 1 output, but there may be 0 inputs (ins.size() == 0).
// For large tensors (numel * sizeof(T) > 2^31), we must use int64_t as the index type.
...
...
@@ -800,6 +803,7 @@ void LaunchElementwiseCudaKernel(const KPDevice &ctx,
#ifdef PADDLE_WITH_XPU_KP
int
block_size
=
64
;
int
grid_size
=
8
;
int
read_lens
=
kps
::
details
::
GetXpuReadLens
(
numel
,
block_size
,
grid_size
);
auto
stream
=
ctx
.
x_context
()
->
xpu_stream
;
int64_t
main_offset
=
(
numel
/
(
read_lens
*
block_size
))
*
read_lens
*
block_size
;
...
...
@@ -853,32 +857,20 @@ void ElementwiseKernel(const KPDevice &ctx,
}
}
#ifdef PADDLE_WITH_XPU_KP
const
int
buf_size
=
256
;
int
numel
=
(
*
outs
)[
0
]
->
numel
();
int
block_size
=
64
;
int
grid_size
=
8
;
int
nthreads
=
block_size
*
grid_size
;
int
read_lens
=
std
::
min
(
buf_size
,
kps
::
details
::
RoundUpDiv
(
numel
,
32
*
nthreads
)
*
32
);
int
vec_size
=
buf_size
;
#else
// calculate the max vec_size for all ins and outs
int
vec_size
=
GetVectorizedSizeForTensors
<
OutT
,
Functor
>
(
ins
,
*
outs
);
int
read_lens
=
vec_size
;
#endif
switch
(
vec_size
)
{
case
VecSizeL
:
LaunchElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeL
>
(
ctx
,
ins
,
outs
,
read_lens
,
func
);
ctx
,
ins
,
outs
,
func
);
break
;
case
VecSizeM
:
LaunchElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeM
>
(
ctx
,
ins
,
outs
,
read_lens
,
func
);
ctx
,
ins
,
outs
,
func
);
break
;
case
VecSizeS
:
LaunchElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeS
>
(
ctx
,
ins
,
outs
,
read_lens
,
func
);
ctx
,
ins
,
outs
,
func
);
break
;
default:
{
PADDLE_THROW
(
phi
::
errors
::
Unimplemented
(
...
...
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
100644 → 100755
浏览文件 @
aa0c885a
...
...
@@ -21,7 +21,17 @@ namespace phi {
namespace
kps
{
namespace
details
{
// Integer division of n by k, rounded up: evaluates (n + k - 1) / k.
//
// n: dividend (an element count at the call sites visible in this file).
// k: divisor; must be non-zero, and is expected to be positive.
//
// The sum is formed in `long long` so that a value of n close to INT_MAX
// cannot overflow `int` (signed overflow is undefined behaviour). For every
// input where the plain-int expression did not overflow, the result is the
// same as before.
int RoundUpDiv(int n, int k) {
  const long long biased = static_cast<long long>(n) + k - 1;
  return static_cast<int>(biased / k);
}
// Integer division of n by k, rounded up: evaluates (n + k - 1) / k.
//
// n: dividend (an element count at the call sites visible in this file).
// k: divisor; must be non-zero, and is expected to be positive.
//
// The sum is formed in `long long` so that a value of n close to INT_MAX
// cannot overflow `int` (signed overflow is undefined behaviour). For every
// input where the plain-int expression did not overflow, the result is the
// same as before.
static inline int RoundUpDiv(int n, int k) {
  const long long biased = static_cast<long long>(n) + k - 1;
  return static_cast<int>(biased / k);
}
// Chooses the read length used for an XPU kernel launch.
//
// numel:     number of elements to process.
// block_num: multiplied with grid_num to obtain the total thread count.
// grid_num:  see block_num.
//
// Returns 4 when numel / (block_num * grid_num) is exactly 1. Otherwise
// returns numel divided by (32 * thread count), rounded up, multiplied by 32
// and capped at 256.
static inline int GetXpuReadLens(int numel, int block_num, int grid_num) {
  const int total_threads = block_num * grid_num;
  const int per_thread = numel / total_threads;
  if (per_thread == 1) {
    // The quotient is 1 here, so this evaluates to 4.
    return per_thread * 4;
  }
  const int max_read_lens = 256;
  const int rounded = RoundUpDiv(numel, 32 * total_threads) * 32;
  return std::min(max_read_lens, rounded);
}
enum
class
OptType
{
// Optimization type of the calculation after the input shape is compressed
CanNotOptimize
=
-
1
,
// can not optimize, broadcast first
...
...
@@ -98,8 +108,10 @@ struct BroadcastConfig {
strides_out_tmp
[
i
]
=
strides_out_tmp
[
i
-
1
]
*
out_dims
[
i
-
1
];
}
int
numel_out
=
1
;
for
(
int
i
=
0
;
i
<
dim_size
;
i
++
)
{
dim_tmp
[
i
]
=
in_dims
[
i
];
numel_out
=
out_dims
[
i
]
*
numel_out
;
}
kDims
=
dim_size
;
memcpy
(
strides_in
,
strides_in_tmp
.
data
(),
kDims
*
sizeof
(
int
));
...
...
@@ -108,13 +120,25 @@ struct BroadcastConfig {
cmp_res
=
get_mnk_for_broadcast_ops
(
in_dims
,
y_in_dims
);
get_opt_type
();
buf_len
=
get_buf_len
();
buf_len
=
get_buf_len
(
numel_out
);
int
numel_x
=
1
;
int
numel_y
=
1
;
for
(
int
i
=
0
;
i
<
dim_size
;
i
++
)
{
numel_x
=
in_dims
[
i
]
*
numel_x
;
numel_y
=
y_in_dims
[
i
]
*
numel_y
;
}
if
(
numel_out
==
numel_x
&&
numel_out
==
numel_y
)
{
buf_len
=
GetXpuReadLens
(
numel_out
,
8
,
64
);
}
}
int
get_buf_len
()
{
int
get_buf_len
(
int
numel
)
{
if
(
cmp_type
==
OptType
::
CanNotOptimize
)
{
return
256
;
}
if
(
cmp_type
==
OptType
::
N_1
)
{
return
kps
::
details
::
GetXpuReadLens
(
numel
,
8
,
64
);
}
int
max_buf_len
=
512
;
int
buf_len
=
m
/
16
*
16
;
if
(
buf_len
==
0
)
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录