Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
d5afc1ba
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
d5afc1ba
编写于
6月 07, 2022
作者:
S
shixingbo
提交者:
GitHub
6月 07, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Optimized the performance of activation op in XPU2 (#43187)
上级
9551e466
变更
4
显示空白变更内容
内联
并排
Showing
4 changed files
with
73 additions
and
36 deletions
+73
-36
paddle/fluid/operators/optimizers/cast_with_ptr.h
paddle/fluid/operators/optimizers/cast_with_ptr.h
+1
-1
paddle/phi/kernels/funcs/elementwise_base.h
paddle/phi/kernels/funcs/elementwise_base.h
+53
-24
paddle/phi/kernels/primitive/datamover_primitives.h
paddle/phi/kernels/primitive/datamover_primitives.h
+3
-2
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
+16
-9
未找到文件。
paddle/fluid/operators/optimizers/cast_with_ptr.h
浏览文件 @
d5afc1ba
...
...
@@ -44,7 +44,7 @@ static void VecCastKernel(const platform::CUDADeviceContext &ctx, const InT *x,
phi
::
Array
<
_ptr_
OutT
*
,
1
>
out_arr
;
out_arr
[
0
]
=
y
;
phi
::
funcs
::
VectorizedElementwiseKernel
<
OutT
,
FunctorT
,
1
,
1
,
VecSize
>
<<<
block
,
thread
,
0
,
stream
>>>
(
in_arr
,
out_arr
,
n
,
main_offset
,
<<<
block
,
thread
,
0
,
stream
>>>
(
in_arr
,
out_arr
,
n
,
main_offset
,
VecSize
,
FunctorT
());
}
...
...
paddle/phi/kernels/funcs/elementwise_base.h
浏览文件 @
d5afc1ba
...
...
@@ -513,19 +513,23 @@ struct Loader {
ArgsT
*
args
,
int
num
,
int
data_offset
,
int
read_lens
,
bool
is_boundary
)
{
using
Type
=
std
::
tuple_element_t
<
Index
,
ArgsT
>
;
kps
::
Init
<
Type
,
ArgsT
,
Index
,
VecSize
>
(
args
,
static_cast
<
Type
>
(
1.0
f
));
kps
::
Init
<
Type
,
ArgsT
,
Index
,
VecSize
>
(
args
,
static_cast
<
Type
>
(
1.0
f
),
read_lens
);
if
(
is_boundary
)
{
kps
::
ReadData
<
Type
,
VecSize
,
1
,
1
,
ArgsT
,
Index
,
true
>
(
args
,
reinterpret_cast
<
const
_ptr_
Type
*>
(
in
[
Index
])
+
data_offset
,
num
);
num
,
read_lens
);
}
else
{
kps
::
ReadData
<
Type
,
VecSize
,
1
,
1
,
ArgsT
,
Index
,
false
>
(
args
,
reinterpret_cast
<
const
_ptr_
Type
*>
(
in
[
Index
])
+
data_offset
,
num
);
num
,
read_lens
);
}
}
};
...
...
@@ -660,11 +664,20 @@ template <typename OutT,
typename
ArgsT
,
int
Arity
>
struct
SameDimsElementwisePrimitiveCaller
{
__device__
inline
void
operator
()(
Functor
func
,
ArgsT
*
args
,
OutT
*
result
)
{
__device__
inline
void
operator
()(
Functor
func
,
ArgsT
*
args
,
OutT
*
result
,
int
read_lens
)
{
#ifdef PADDLE_WITH_XPU_KP
for
(
int
idx
=
0
;
idx
<
read_lens
;
++
idx
)
{
result
[
idx
]
=
static_cast
<
OutT
>
(
Apply
(
func
,
args
[
idx
]));
}
#else
#pragma unroll
for
(
int
idx
=
0
;
idx
<
VecSize
;
++
idx
)
{
result
[
idx
]
=
static_cast
<
OutT
>
(
Apply
(
func
,
args
[
idx
]));
}
#endif
}
};
...
...
@@ -750,6 +763,7 @@ __device__ void VectorizedElementwiseKernelImpl(
phi
::
Array
<
_ptr_
OutT
*
,
NumOuts
>
outs
,
int
num
,
int
data_offset
,
int
read_lens
,
Functor
func
)
{
using
Traits
=
paddle
::
platform
::
FunctionTraits
<
Functor
>
;
using
ArgsT
=
typename
Traits
::
ArgsTuple
;
...
...
@@ -757,16 +771,16 @@ __device__ void VectorizedElementwiseKernelImpl(
ConditionalT
<
OutT
,
NumOuts
>
result
[
VecSize
];
Unroller
<
Loader
,
VecSize
,
Arity
>::
step
(
in
,
args
,
num
,
data_offset
,
IsBoundary
);
in
,
args
,
num
,
data_offset
,
read_lens
,
IsBoundary
);
SameDimsElementwisePrimitiveCaller
<
ConditionalT
<
OutT
,
NumOuts
>
,
VecSize
,
Functor
,
ArgsT
,
Arity
>
()(
func
,
args
,
result
);
Arity
>
()(
func
,
args
,
result
,
read_lens
);
ElementwiseWriteDataCaller
<
OutT
,
VecSize
,
IsBoundary
,
NumOuts
>
()(
outs
,
result
,
data_offset
,
num
);
ElementwiseWriteDataCaller
Bc
<
OutT
,
VecSize
,
IsBoundary
,
NumOuts
>
()(
outs
,
result
,
data_offset
,
num
,
read_lens
);
}
template
<
typename
OutT
,
typename
Functor
,
int
Arity
,
int
NumOuts
,
int
VecSize
>
...
...
@@ -775,9 +789,10 @@ __global__ void VectorizedElementwiseKernel(
phi
::
Array
<
_ptr_
OutT
*
,
NumOuts
>
outs
,
int
size
,
int
main_offset
,
int
read_lens
,
Functor
func
)
{
int
data_offset
=
BLOCK_ID_X
*
BLOCK_NUM_X
*
VecSize
;
int
stride
=
BLOCK_NUM_X
*
GRID_NUM_X
*
VecSize
;
int
data_offset
=
BLOCK_ID_X
*
BLOCK_NUM_X
*
read_lens
;
int
stride
=
BLOCK_NUM_X
*
GRID_NUM_X
*
read_lens
;
for
(;
data_offset
<
main_offset
;
data_offset
+=
stride
)
{
VectorizedElementwiseKernelImpl
<
OutT
,
Functor
,
...
...
@@ -785,7 +800,7 @@ __global__ void VectorizedElementwiseKernel(
NumOuts
,
VecSize
,
false
>
(
ins
,
outs
,
VecSize
*
BLOCK_NUM_X
,
data_offset
,
func
);
ins
,
outs
,
read_lens
*
BLOCK_NUM_X
,
data_offset
,
read_lens
,
func
);
}
int
num
=
size
-
data_offset
;
...
...
@@ -795,7 +810,8 @@ __global__ void VectorizedElementwiseKernel(
Arity
,
NumOuts
,
VecSize
,
true
>
(
ins
,
outs
,
num
,
data_offset
,
func
);
true
>
(
ins
,
outs
,
num
,
data_offset
,
read_lens
,
func
);
}
}
...
...
@@ -803,6 +819,7 @@ template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
void
ElementwiseCudaKernel
(
const
KPDevice
&
ctx
,
const
std
::
vector
<
const
DenseTensor
*>
&
ins
,
std
::
vector
<
DenseTensor
*>
*
outs
,
int
read_lens
,
Functor
func
)
{
auto
numel
=
(
*
outs
)[
0
]
->
numel
();
// To avoid running errors when ins.size()== 0
...
...
@@ -817,10 +834,10 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
int
block_size
=
64
;
int
grid_size
=
8
;
auto
stream
=
ctx
.
x_context
()
->
xpu_stream
;
int
main_offset
=
(
numel
/
(
VecSize
*
block_size
))
*
VecSize
*
block_size
;
int
main_offset
=
(
numel
/
(
read_lens
*
block_size
))
*
read_lens
*
block_size
;
VectorizedElementwiseKernel
<
OutT
,
Functor
,
Arity
,
NumOuts
,
VecSize
>
<<<
grid_size
,
block_size
,
0
,
stream
>>>
(
ins_data
,
outs_data
,
numel
,
main_offset
,
func
);
ins_data
,
outs_data
,
numel
,
main_offset
,
read_lens
,
func
);
#else
auto
gpu_config
=
phi
::
backends
::
gpu
::
GetGpuLaunchConfig1D
(
ctx
,
numel
,
VecSize
);
...
...
@@ -829,7 +846,7 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
auto
stream
=
ctx
.
stream
();
VectorizedElementwiseKernel
<
OutT
,
Functor
,
Arity
,
NumOuts
,
VecSize
>
<<<
gpu_config
.
block_per_grid
,
gpu_config
.
thread_per_block
,
0
,
stream
>>>
(
ins_data
,
outs_data
,
numel
,
main_offset
,
func
);
ins_data
,
outs_data
,
numel
,
main_offset
,
VecSize
,
func
);
#endif
}
...
...
@@ -868,20 +885,32 @@ void ElementwiseKernel(const KPDevice &ctx,
}
}
#ifdef PADDLE_WITH_XPU_KP
const
int
buf_size
=
256
;
int
numel
=
(
*
outs
)[
0
]
->
numel
();
int
block_size
=
64
;
int
grid_size
=
8
;
int
nthreads
=
block_size
*
grid_size
;
int
read_lens
=
std
::
min
(
buf_size
,
kps
::
details
::
RoundUpDiv
(
numel
,
32
*
nthreads
)
*
32
);
int
vec_size
=
buf_size
;
#else
// calculate the max vec_size for all ins and outs
int
vec_size
=
GetVectorizedSizeForTensors
<
OutT
,
Functor
>
(
ins
,
*
outs
);
int
read_lens
=
vec_size
;
#endif
switch
(
vec_size
)
{
case
4
:
ElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
4
>
(
ctx
,
ins
,
outs
,
func
);
case
VecSizeL
:
ElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeL
>
(
ctx
,
ins
,
outs
,
read_lens
,
func
);
break
;
case
2
:
ElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
2
>
(
ctx
,
ins
,
outs
,
func
);
case
VecSizeM
:
ElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeM
>
(
ctx
,
ins
,
outs
,
read_lens
,
func
);
break
;
case
1
:
ElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
1
>
(
ctx
,
ins
,
outs
,
func
);
case
VecSizeS
:
ElementwiseCudaKernel
<
OutT
,
Functor
,
kArity
,
NumOuts
,
VecSizeS
>
(
ctx
,
ins
,
outs
,
read_lens
,
func
);
break
;
default:
{
PADDLE_THROW
(
phi
::
errors
::
Unimplemented
(
...
...
paddle/phi/kernels/primitive/datamover_primitives.h
浏览文件 @
d5afc1ba
...
...
@@ -259,7 +259,7 @@ __device__ __forceinline__ void Init(T* dst, T init_data, int read_lens) {
* it supports different data types of inputs.
*/
template
<
typename
T
,
typename
ArgsT
,
int
Index
,
int
NX
>
__device__
__forceinline__
void
Init
(
ArgsT
*
dst
,
T
init_data
)
{
__device__
__forceinline__
void
Init
(
ArgsT
*
dst
,
T
init_data
,
int
read_lens
)
{
#pragma unroll
for
(
int
i
=
0
;
i
<
NX
;
i
++
)
{
std
::
get
<
Index
>
(
dst
[
i
])
=
init_data
;
...
...
@@ -382,7 +382,8 @@ template <typename T,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadData
(
ArgsT
*
dst
,
const
T
*
__restrict__
src
,
int
num
)
{
int
num
,
int
read_lens
)
{
if
(
IsBoundary
)
{
// blockDim.x * NX > num
int
thread_offset
=
threadIdx
.
x
*
NX
;
#pragma unroll
...
...
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
浏览文件 @
d5afc1ba
...
...
@@ -21,6 +21,8 @@ namespace phi {
namespace
kps
{
namespace
details
{
/**
 * @brief Integer division of n by k, rounded up instead of truncated.
 *
 * Computes (n + k - 1) / k using plain int arithmetic. For n >= 0 and k > 0
 * this is the ceiling of n / k (e.g. RoundUpDiv(10, 3) == 4).
 *
 * Declared inline because this definition lives in a header: without it,
 * every translation unit that includes the header emits its own external
 * definition and the link fails with duplicate symbols.
 *
 * @param n dividend.
 * @param k divisor; must be non-zero (no check is performed here).
 * @return (n + k - 1) / k. Note that the intermediate sum n + k - 1 is not
 *         guarded against int overflow.
 */
inline int RoundUpDiv(int n, int k) { return (n + k - 1) / k; }
enum
class
OptType
{
// Optimize type of calc after input shape compressed
CanNotOptimize
=
-
1
,
// can not optimize, broadcast first
N_1
,
// just like {1} op {100} or {100} op {1}
...
...
@@ -425,9 +427,10 @@ __device__ __inline__ void Init(T* dst, T init_data, int read_lens) {
* it supports different data types of inputs.
*/
template
<
typename
T
,
typename
ArgsT
,
int
Index
,
int
NX
>
__device__
__forceinline__
void
Init
(
ArgsT
*
dst
,
T
init_data
)
{
__device__
__forceinline__
void
Init
(
ArgsT
*
dst
,
T
init_data
,
int
read_lens
)
{
mfence
();
#pragma unroll
for
(
int
i
=
0
;
i
<
NX
;
i
++
)
{
for
(
int
i
=
0
;
i
<
read_lens
;
i
++
)
{
std
::
get
<
Index
>
(
dst
[
i
])
=
init_data
;
}
}
...
...
@@ -523,22 +526,24 @@ template <typename T,
bool
IsBoundary
>
__device__
__forceinline__
void
ReadData
(
ArgsT
*
dst
,
const
T
_global_ptr_
*
src
,
int
num
)
{
int
thread_offset
=
core_id
()
*
NX
;
int
num
,
int
read_lens
)
{
int
thread_offset
=
core_id
()
*
read_lens
;
__local__
T
in_temp
[
1
];
__local__
T
in_vec
[
NX
];
if
(
IsBoundary
)
{
// core_num() *
NX
> num
if
(
IsBoundary
)
{
// core_num() *
read_lens
> num
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
;
++
idx
)
{
for
(
int
idx
=
0
;
idx
<
read_lens
;
++
idx
)
{
if
(
idx
+
thread_offset
<
num
)
{
GM2LM
(
src
+
thread_offset
+
idx
,
in_temp
,
sizeof
(
T
));
std
::
get
<
Index
>
(
dst
[
idx
])
=
in_temp
[
0
];
mfence
();
}
}
}
else
{
// core_num() *
NX
< num
GM2LM
(
src
+
thread_offset
,
in_vec
,
NX
*
sizeof
(
T
));
}
else
{
// core_num() *
read_lens
< num
GM2LM
(
src
+
thread_offset
,
in_vec
,
read_lens
*
sizeof
(
T
));
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
;
++
idx
)
{
for
(
int
idx
=
0
;
idx
<
read_lens
;
++
idx
)
{
std
::
get
<
Index
>
(
dst
[
idx
])
=
in_vec
[
idx
];
}
}
...
...
@@ -727,10 +732,12 @@ __device__ void WriteData(T _global_ptr_* dst,
for
(
int
idx
=
0
;
idx
<
read_lens
;
++
idx
)
{
if
(
idx
+
thread_offset
<
num
)
{
in_temp
[
0
]
=
src
[
idx
];
mfence
();
LM2GM
(
in_temp
,
dst
+
idx
+
thread_offset
,
sizeof
(
T
));
}
}
}
else
{
// core_num() * read_lens < num
mfence
();
LM2GM
(
src
,
dst
+
thread_offset
,
read_lens
*
sizeof
(
T
));
}
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录