Unverified commit c142e37d

机器未来 / Paddle (forked from PaddlePaddle / Paddle)

Authored Mar 17, 2022 by niuliling123; committed by GitHub on Mar 17, 2022.

Replace PADDLE_WITH_XPU2 with PADDLE_WITH_KP (#40560)
* Replace PADDLE_WITH_XPU2 with PADDLE_WITH_KP
Parent: 81848fff

Showing 3 changed files with 75 additions and 135 deletions (+75 -135):
  paddle/phi/kernels/funcs/reduce_function.h  (+57 -135)
  paddle/phi/kernels/primitive/datamover_primitives.h  (+8 -0)
  paddle/phi/kernels/primitive/datamover_primitives_xpu2.h  (+10 -0)
paddle/phi/kernels/funcs/reduce_function.h
@@ -14,8 +14,8 @@
 #pragma once

-// CUDA and HIP use same api
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+// CUDA, XPU and HIP use same api
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(__xpu__)

 #include <algorithm>
 #include <cmath>
@@ -220,7 +220,7 @@ struct IndexCalculator {
   phi::Array<int, kMaxRank> dims;
   phi::Array<int, kMaxRank> strides;
   phi::Array<int, kMaxRank> reduce_strides;
-#ifndef PADDLE_WITH_XPU2
+#ifndef PADDLE_WITH_XPU_KP
   phi::Array<paddle::platform::FastDivMod, kMaxRank> divmoders;
 #endif
 };
@@ -231,81 +231,65 @@ struct ReduceIndexMapping {
   HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims)
       : dim(dims) {}

+#ifdef PADDLE_WITH_XPU_KP
   __device__ __forceinline__ int BlockIdX() {
-#ifdef PADDLE_WITH_XPU2
     if (ReduceLastDim) {
       return (cluster_id() / dim.split_num_x % dim.split_num_y);
     } else {
       return cluster_id() % dim.split_num_x;
     }
-#else
-    return blockIdx.x;
-#endif
   }

   __device__ __forceinline__ int BlockIdY() {
-#ifdef PADDLE_WITH_XPU2
     if (ReduceLastDim) {
       return (cluster_id() % dim.split_num_x);
     } else {
       return (cluster_id() / dim.split_num_x % dim.split_num_y);
     }
-#else
-    return blockIdx.y;
-#endif
   }

-  __device__ __forceinline__ int BlockDimX() {
-#ifdef PADDLE_WITH_XPU2
-    return dim.deal_size_x;
-#else
-    return blockDim.x;
-#endif
-  }
+  __device__ __forceinline__ int BlockDimX() { return dim.deal_size_x; }

-  __device__ __forceinline__ int BlockDimY() {
-#ifdef PADDLE_WITH_XPU2
-    return 1;
-#else
-    return blockDim.y;
-#endif
-  }
+  __device__ __forceinline__ int BlockDimY() { return 1; }

   __device__ __forceinline__ int GridDimX() {
-#ifdef PADDLE_WITH_XPU2
     if (ReduceLastDim) {
       return dim.split_num_y;
     } else {
       return dim.split_num_x;
     }
-#else
-    return gridDim.x;
-#endif
   }

   __device__ __forceinline__ int GridDimY() {
-#ifdef PADDLE_WITH_XPU2
     if (ReduceLastDim) {
       return dim.split_num_x;
     } else {
       return dim.split_num_y;
     }
-#else
-    return gridDim.y;
-#endif
   }

   __device__ __forceinline__ int GetLoopSize() {
-#ifdef PADDLE_WITH_XPU2
     if (ReduceLastDim) {
       return dim.deal_size_y;
     } else {
       return dim.deal_size_x;
     }
-#else
-    return 1;
-#endif
   }
+#else
+  __device__ __forceinline__ int BlockIdX() { return blockIdx.x; }
+
+  __device__ __forceinline__ int BlockIdY() { return blockIdx.y; }
+
+  __device__ __forceinline__ int BlockDimX() { return blockDim.x; }
+
+  __device__ __forceinline__ int BlockDimY() { return blockDim.y; }
+
+  __device__ __forceinline__ int GridDimX() { return gridDim.x; }
+
+  __device__ __forceinline__ int GridDimY() { return gridDim.y; }
+
+  __device__ int GetLoopSize() { return 1; }
+#endif
 };

 // when reduce_type == kReduceLastDim this struct will be used
@@ -341,7 +325,7 @@ struct ReduceConfig {
   // when should_reduce_again is true, we need malloc temp space for temp data
   void SetOutputData(Ty* y_data,
-                     const phi::GPUContext& dev_ctx,
+                     const KPDevice& dev_ctx,
                      phi::DenseTensor* tmp) {
     if (should_reduce_again) {
       tmp->Resize(phi::make_ddim(
@@ -640,9 +624,7 @@ struct ReduceConfig {
   int blocking_size;
   bool should_reduce_again;
   bool reduce_last_dim;
-
   Ty* output_data;
-
   dim3 block;
   dim3 grid;
 };
@@ -770,9 +752,10 @@ __global__ void ReduceAnyKernel(const Tx* x,
     kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
         &reduce_var, &reduce_var, reducer, reduce_last_dim);
-    if (need_store) {
-      y[store_offset + i] = static_cast<Ty>(reduce_var);
-    }
+
+    Ty result = static_cast<Ty>(reduce_var);
+    kps::details::WriteData<Ty>(
+        y + store_offset + i, &result, static_cast<int>(need_store));
   }
 }
@@ -882,30 +865,18 @@ static void LaunchReduceKernel(const Tx* x_data,
     dim.SetRem(config.reduce_num % config.block.x, 0, 0);

 #ifdef PADDLE_WITH_XPU_KP
-    ReduceAnyKernel<Tx,
-                    Ty,
-                    MPType,
-                    ReduceOp,
-                    TransformOp,
-                    OneDimIndexCal><<<8, 64, 0, stream>>>(
-        x_data,
-        config.output_data,
-        reducer,
-        transform,
-        init,
-        config.reduce_num,
-        config.left_num,
-        config.reduce_last_dim,
-        reduce_index_calculator,
-        left_index_calculator,
-        dim);
+    auto grid_num = 8;
+    auto block_num = 64;
 #else
+    auto grid_num = config.grid;
+    auto block_num = config.block;
+#endif
     ReduceAnyKernel<Tx,
                     Ty,
                     MPType,
                     ReduceOp,
                     TransformOp,
-                    OneDimIndexCal><<<config.grid, config.block, 0, stream>>>(
+                    OneDimIndexCal><<<grid_num, block_num, 0, stream>>>(
         x_data,
         config.output_data,
         reducer,
@@ -917,7 +888,6 @@ static void LaunchReduceKernel(const Tx* x_data,
         reduce_index_calculator,
         left_index_calculator,
         dim);
-#endif
   } else {
     int reduce_rank = config.reduce_strides.size();
@@ -938,30 +908,18 @@ static void LaunchReduceKernel(const Tx* x_data,
     dim.SetRem(config.reduce_num % config.block.x, 0, 0);

 #ifdef PADDLE_WITH_XPU_KP
-    ReduceAnyKernel<Tx,
-                    Ty,
-                    MPType,
-                    ReduceOp,
-                    TransformOp,
-                    IndexCalculator><<<8, 64, 0, stream>>>(
-        x_data,
-        config.output_data,
-        reducer,
-        transform,
-        init,
-        config.reduce_num,
-        config.left_num,
-        config.reduce_last_dim,
-        reduce_index_calculator,
-        left_index_calculator,
-        dim);
+    auto grid_num = 8;
+    auto block_num = 64;
 #else
+    auto grid_num = config.grid;
+    auto block_num = config.block;
+#endif
     ReduceAnyKernel<Tx,
                     Ty,
                     MPType,
                     ReduceOp,
                     TransformOp,
-                    IndexCalculator><<<config.grid, config.block, 0, stream>>>(
+                    IndexCalculator><<<grid_num, block_num, 0, stream>>>(
         x_data,
         config.output_data,
         reducer,
@@ -973,7 +931,6 @@ static void LaunchReduceKernel(const Tx* x_data,
         reduce_index_calculator,
         left_index_calculator,
         dim);
-#endif
   }

   if (config.should_reduce_again) {
@@ -993,22 +950,9 @@ static void LaunchReduceKernel(const Tx* x_data,
         kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
     dim.SetRem(config.left_num % block.x, 0, 0);
 #ifdef PADDLE_WITH_XPU_KP
-    ReduceHigherDimKernel<Ty,
-                          Ty,
-                          MPType,
-                          ReduceOp,
-                          kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
-        config.output_data,
-        y_data,
-        reducer,
-        kps::IdentityFunctor<Ty, MPType>(),
-        init,
-        config.grid.y,
-        config.left_num,
-        config.grid.y,
-        dim);
-#else
+    grid = 8;
+    block = 64;
+#endif
     ReduceHigherDimKernel<Ty,
                           Ty,
@@ -1024,7 +968,6 @@ static void LaunchReduceKernel(const Tx* x_data,
         config.left_num,
         config.grid.y,
         dim);
-#endif
   }
 }
@@ -1038,7 +981,7 @@ CubTensorReduceImpl(const Tx* x_data,
                     Ty* y_data,
                     const TransformOp& transform,
                     int reduce_num,
-                    const phi::GPUContext& dev_ctx,
+                    const KPDevice& dev_ctx,
                     KPStream stream) {
   auto reducer = ReduceOp<Ty>();
   cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data,
@@ -1077,7 +1020,7 @@ CubTensorReduceImpl(const Tx* x_data,
                     Ty* y_data,
                     const TransformOp& transform,
                     int reduce_num,
-                    const phi::GPUContext& dev_ctx,
+                    const KPDevice& dev_ctx,
                     KPStream stream) {
   PADDLE_THROW(phi::errors::InvalidArgument(
       "Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
@@ -1087,12 +1030,16 @@ template <typename Tx,
           typename Ty,
           template <typename> class ReduceOp,
           typename TransformOp>
-void ReduceKernel(const phi::GPUContext& dev_ctx,
+void ReduceKernel(const KPDevice& dev_ctx,
                   const phi::DenseTensor& x,
                   phi::DenseTensor* y,
                   const TransformOp& transform,
                   const std::vector<int>& origin_reduce_dims) {
+#ifdef PADDLE_WITH_XPU_KP
+  auto stream = dev_ctx.x_context()->xpu_stream;
+#else
+  auto stream = dev_ctx.stream();
+#endif
   dev_ctx.Alloc<Ty>(y);

   auto x_dim = phi::vectorize<int>(x.dims());
@@ -1149,11 +1096,17 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
         0);

 #ifdef PADDLE_WITH_XPU_KP
+    auto grid_num = 8;
+    auto block_num = 64;
+#else
+    auto grid_num = config.grid;
+    auto block_num = config.block;
+#endif
     ReduceHigherDimKernel<Tx,
                           Ty,
                           MPType,
                           ReduceOp<MPType>,
-                          TransformOp><<<8, 64, 0, stream>>>(
+                          TransformOp><<<grid_num, block_num, 0, stream>>>(
         x_data,
         config.output_data,
         reducer,
@@ -1163,23 +1116,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
         config.left_num,
         config.blocking_size,
         dim);
-#else
-    ReduceHigherDimKernel<Tx,
-                          Ty,
-                          MPType,
-                          ReduceOp<MPType>,
-                          TransformOp><<<config.grid, config.block, 0, stream>>>(
-        x_data,
-        config.output_data,
-        reducer,
-        transform,
-        reducer.initial(),
-        config.reduce_num,
-        config.left_num,
-        config.blocking_size,
-        dim);
-#endif

     if (config.should_reduce_again) {
       dim3 block = dim3(config.block.x, 1, 1);
@@ -1189,22 +1125,9 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
       dim2.SetRem(config.left_num % config.block.x, 0, 0);
 #ifdef PADDLE_WITH_XPU_KP
-      ReduceHigherDimKernel<Ty,
-                            Ty,
-                            MPType,
-                            ReduceOp<MPType>,
-                            kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
-          config.output_data,
-          y_data,
-          reducer,
-          kps::IdentityFunctor<Ty, MPType>(config.grid.y),
-          reducer.initial(),
-          config.grid.y,
-          config.left_num,
-          config.grid.y,
-          dim2);
-#else
+      grid = 8;
+      block = 64;
+#endif
       ReduceHigherDimKernel<Ty,
                             Ty,
@@ -1220,7 +1143,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
           config.left_num,
           config.grid.y,
           dim2);
-#endif
     }
     return;
   }
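For orientation, a hedged caller-side sketch of the updated entry point: ReduceKernel now takes a KPDevice context and picks the stream internally (dev_ctx.x_context()->xpu_stream under PADDLE_WITH_XPU_KP, dev_ctx.stream() otherwise), so one call site serves both builds. The phi::funcs namespace, the kps::AddFunctor reducer, and the tensor names below are assumptions for illustration, not part of this diff:

// Hypothetical usage; dev_ctx, x and out are placeholders.
std::vector<int> reduce_dims = {0};  // e.g. sum over the first axis
phi::funcs::ReduceKernel<float, float, kps::AddFunctor,
                         kps::IdentityFunctor<float>>(
    dev_ctx, x, &out, kps::IdentityFunctor<float>(), reduce_dims);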
paddle/phi/kernels/primitive/datamover_primitives.h

@@ -115,6 +115,14 @@ struct BroadcastConfig {
   }
 };

+template <typename T>
+__device__ __forceinline__ void WriteData(T* dst,
+                                          T* __restrict__ src,
+                                          int num) {
+  for (int i = 0; i < num; i++) {
+    dst[i] = src[i];
+  }
+}
 #undef INT_BITS

 }  // namespace details
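This new primitive is what lets the need_store branch in ReduceAnyKernel (first file above) collapse into a single call: the flag is passed as the element count, so the call writes one element or nothing. The snippet below is lifted from that hunk; the surrounding kernel variables (reduce_var, y, store_offset, i, need_store) are assumed from its context:

// result holds the finished per-thread reduction; need_store is 0 or 1.
// On CUDA/HIP, WriteData loops num times; under PADDLE_WITH_XPU_KP
// (next file) it becomes an LM2GM copy guarded by num > 0.
Ty result = static_cast<Ty>(reduce_var);
kps::details::WriteData<Ty>(
    y + store_offset + i, &result, static_cast<int>(need_store));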
paddle/phi/kernels/primitive/datamover_primitives_xpu2.h

@@ -76,6 +76,16 @@ struct BroadcastConfig {
 };
 #pragma pack()

+template <typename T>
+__device__ __forceinline__ void WriteData(T* _global_ptr_ dst,
+                                          T* src,
+                                          int num) {
+  if (num > 0) {
+    LM2GM(src, dst, num * sizeof(T));
+  }
+}
+
 #undef INT_BITS

 }  // namespace details
 /**