Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
aa0c885a
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
aa0c885a
编写于
7月 07, 2022
作者:
S
shixingbo
提交者:
GitHub
7月 07, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Optimized the performance of broadcast for kp XPU2 (#44091)
上级
1e6137b5
变更
2
隐藏空白更改
内联
并排
Showing
2 changed files
with
35 additions
and
19 deletions
+35
-19
paddle/phi/kernels/funcs/elementwise_base.h
paddle/phi/kernels/funcs/elementwise_base.h
+8
-16
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
+27
-3
未找到文件。
paddle/phi/kernels/funcs/elementwise_base.h
100644 → 100755
浏览文件 @
aa0c885a
...
...
@@ -558,6 +558,9 @@ struct VecSizeGetter {
template
<
typename
OutT
,
typename
Functor
>
int
GetVectorizedSizeForTensors
(
const
std
::
vector
<
const
DenseTensor
*>
&
ins
,
const
std
::
vector
<
DenseTensor
*>
&
outs
)
{
#ifdef PADDLE_WITH_XPU_KP
int
vec_size
=
256
;
#else
using
Traits
=
paddle
::
platform
::
FunctionTraits
<
Functor
>
;
using
ArgsT
=
typename
Traits
::
ArgsTuple
;
const
int
Arity
=
Traits
::
arity
;
...
...
@@ -569,6 +572,7 @@ int GetVectorizedSizeForTensors(const std::vector<const DenseTensor *> &ins,
vec_size
=
std
::
min
<
int
>
(
vec_size
,
phi
::
GetVectorizedSize
((
*
iter
)
->
data
<
OutT
>
()));
}
#endif
return
vec_size
;
}
...
...
@@ -784,7 +788,6 @@ template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
void
LaunchElementwiseCudaKernel
(
const
KPDevice
&
ctx
,
const
std
::
vector
<
const
DenseTensor
*>
&
ins
,
std
::
vector
<
DenseTensor
*>
*
outs
,
int
read_lens
,
Functor
func
)
{
// There is at least 1 output, but there may be 0 inputs (ins.size() == 0).
// For large tensors (numel * sizeof(T) > 2^31), we must use int64_t as the index type.
...
...
@@ -800,6 +803,7 @@ void LaunchElementwiseCudaKernel(const KPDevice &ctx,
#ifdef PADDLE_WITH_XPU_KP
int
block_size
=
64
;
int
grid_size
=
8
;
int
read_lens
=
kps
::
details
::
GetXpuReadLens
(
numel
,
block_size
,
grid_size
);
auto
stream
=
ctx
.
x_context
()
->
xpu_stream
;
int64_t
main_offset
=
(
numel
/
(
read_lens
*
block_size
))
*
read_lens
*
block_size
;
...
...
@@ -853,32 +857,20 @@ void ElementwiseKernel(const KPDevice &ctx,
}
}
#ifdef PADDLE_WITH_XPU_KP
const
int
buf_size
=
256
;
int
numel
=
(
*
outs
)[
0
]
->
numel
();
int
block_size
=
64
;
int
grid_size
=
8
;
int
nthreads
=
block_size
*
grid_size
;
int
read_lens
=
std
::
min
(
buf_size
,
kps
::
details
::
RoundUpDiv
(
numel
,
32
*
nthreads
)
*
32
);
int
vec_size
=
buf_size
;
#else
// calculate the max vec_size for all ins and outs
int
vec_size
=
GetVectorizedSizeForTensors
<
OutT
,
Functor
>
(
ins
,
*
outs
);
int
read_lens
=
vec_size
;
#endif
switch
(
vec_size
)
{
case
VecSizeL
:
LaunchElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeL
>
(
ctx
,
ins
,
outs
,
read_lens
,
func
);
ctx
,
ins
,
outs
,
func
);
break
;
case
VecSizeM
:
LaunchElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeM
>
(
ctx
,
ins
,
outs
,
read_lens
,
func
);
ctx
,
ins
,
outs
,
func
);
break
;
case
VecSizeS
:
LaunchElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeS
>
(
ctx
,
ins
,
outs
,
read_lens
,
func
);
ctx
,
ins
,
outs
,
func
);
break
;
default:
{
PADDLE_THROW
(
phi
::
errors
::
Unimplemented
(
...
...
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
100644 → 100755
浏览文件 @
aa0c885a
...
...
@@ -21,7 +21,17 @@ namespace phi {
namespace
kps
{
namespace
details
{
/**
 * @brief Integer division of n by k, rounded up.
 *
 * Evaluates (n + k - 1) / k in plain int arithmetic.
 *
 * Declared `static inline` because this definition lives in a header: a
 * non-inline definition with external linkage would be emitted by every
 * translation unit that includes the header and clash at link time.
 *
 * @param n Dividend. NOTE(review): the formula only yields a ceiling for
 *          n >= 0, and n + k - 1 must not overflow int.
 * @param k Divisor. Must be non-zero (it is used directly as the divisor);
 *          NOTE(review): expected to be positive.
 * @return (n + k - 1) / k.
 */
static inline int RoundUpDiv(int n, int k) { return (n + k - 1) / k; }
/**
 * @brief Chooses a read length from the element count and the launch shape.
 *
 * With T = block_num * grid_num:
 *   - if the integer quotient numel / T is exactly 1, the result is that
 *     quotient times 4 (i.e. 4);
 *   - otherwise the result is RoundUpDiv(numel, 32 * T) * 32, capped at 256.
 *
 * @param numel     Number of elements to process.
 * @param block_num First launch dimension; only its product with grid_num
 *                  is used.
 * @param grid_num  Second launch dimension; only its product with block_num
 *                  is used.
 * @return The read length described above.
 *
 * NOTE(review): block_num * grid_num is used as a divisor, so it must be
 * non-zero. The cap of 256 is presumably a local buffer capacity — confirm.
 */
static inline int GetXpuReadLens(int numel, int block_num, int grid_num) {
  constexpr int kMaxReadLens = 256;
  const int total_threads = block_num * grid_num;

  // Special case: exactly one whole element per thread.
  const int elems_per_thread = numel / total_threads;
  if (elems_per_thread == 1) {
    return elems_per_thread * 4;
  }

  // Round the per-thread share up to a multiple of 32, then apply the cap.
  const int rounded_lens = RoundUpDiv(numel, 32 * total_threads) * 32;
  return rounded_lens < kMaxReadLens ? rounded_lens : kMaxReadLens;
}
enum
class
OptType
{
// Optimize type of calc after input shape compressed
CanNotOptimize
=
-
1
,
// can not optimize, broadcast first
...
...
@@ -98,8 +108,10 @@ struct BroadcastConfig {
strides_out_tmp
[
i
]
=
strides_out_tmp
[
i
-
1
]
*
out_dims
[
i
-
1
];
}
int
numel_out
=
1
;
for
(
int
i
=
0
;
i
<
dim_size
;
i
++
)
{
dim_tmp
[
i
]
=
in_dims
[
i
];
numel_out
=
out_dims
[
i
]
*
numel_out
;
}
kDims
=
dim_size
;
memcpy
(
strides_in
,
strides_in_tmp
.
data
(),
kDims
*
sizeof
(
int
));
...
...
@@ -108,13 +120,25 @@ struct BroadcastConfig {
cmp_res
=
get_mnk_for_broadcast_ops
(
in_dims
,
y_in_dims
);
get_opt_type
();
buf_len
=
get_buf_len
();
buf_len
=
get_buf_len
(
numel_out
);
int
numel_x
=
1
;
int
numel_y
=
1
;
for
(
int
i
=
0
;
i
<
dim_size
;
i
++
)
{
numel_x
=
in_dims
[
i
]
*
numel_x
;
numel_y
=
y_in_dims
[
i
]
*
numel_y
;
}
if
(
numel_out
==
numel_x
&&
numel_out
==
numel_y
)
{
buf_len
=
GetXpuReadLens
(
numel_out
,
8
,
64
);
}
}
int
get_buf_len
()
{
int
get_buf_len
(
int
numel
)
{
if
(
cmp_type
==
OptType
::
CanNotOptimize
)
{
return
256
;
}
if
(
cmp_type
==
OptType
::
N_1
)
{
return
kps
::
details
::
GetXpuReadLens
(
numel
,
8
,
64
);
}
int
max_buf_len
=
512
;
int
buf_len
=
m
/
16
*
16
;
if
(
buf_len
==
0
)
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录