Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
f53db251
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
f53db251
编写于
3月 21, 2022
作者:
N
niuliling123
提交者:
GitHub
3月 21, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Support MaskedSelectGrad op with Kernel Primitive API (#40617)
* Support MaskedSelectGrad op with Kernel Primitive API
上级
34a256c9
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
107 addition
and
104 deletion
+107
-104
paddle/phi/kernels/funcs/select_impl.cu.h
paddle/phi/kernels/funcs/select_impl.cu.h
+68
-39
paddle/phi/kernels/gpu/masked_select_grad_kernel.cu
paddle/phi/kernels/gpu/masked_select_grad_kernel.cu
+23
-60
paddle/phi/kernels/gpu/masked_select_kernel.cu
paddle/phi/kernels/gpu/masked_select_kernel.cu
+3
-2
paddle/phi/kernels/gpu/where_index_kernel.cu
paddle/phi/kernels/gpu/where_index_kernel.cu
+4
-3
paddle/phi/kernels/primitive/datamover_primitives.h
paddle/phi/kernels/primitive/datamover_primitives.h
+9
-0
未找到文件。
paddle/phi/kernels/funcs/select_impl.cu.h
浏览文件 @
f53db251
...
...
@@ -168,57 +168,82 @@ __global__ void CumsumOneBlock(
}
}
// where_index
template
<
typename
OutT
,
typename
MT
,
typename
InT
,
typename
IdT
,
typename
Functor
,
int
VecSize
,
int
IsBoundary
,
int
Is
MaskData
>
int
MaskData
>
struct
SelectCaller
{
__device__
void
inline
operator
()(
OutT
*
store_data
,
__device__
void
inline
operator
()(
OutT
*
out
,
const
MT
*
mask_data
,
const
InT
*
in
,
Functor
func
,
int
num
,
int
data_offset
)
{
// where_index op
IdT
index_reg
[
VecSize
];
// Set data index of global
kps
::
InitWithDataIndex
<
IdT
,
VecSize
,
1
,
1
>
(
&
index_reg
[
0
],
data_offset
);
int
data_offset
,
int
store_num
,
int
thread_fix
,
int
num
)
{
int64_t
in_data
[
VecSize
];
OutT
store_data
[
VecSize
*
phi
::
DDim
::
kMaxRank
];
// set index
kps
::
InitWithDataIndex
<
int64_t
,
VecSize
,
1
,
1
>
(
&
in_data
[
0
],
data_offset
);
// Get store data according to mask_idt
kps
::
OperatorTernary
<
MT
,
IdT
,
OutT
,
Functor
>
(
store_data
,
mask_data
,
&
index_reg
[
0
],
func
,
VecSize
);
kps
::
OperatorTernary
<
MT
,
int64_t
,
OutT
,
Functor
>
(
store_data
,
mask_data
,
&
in_data
[
0
],
func
,
VecSize
);
kps
::
details
::
WriteData
<
OutT
>
(
out
+
thread_fix
,
&
store_data
[
0
],
store_num
);
}
};
// masked_select
template
<
typename
OutT
,
typename
MT
,
typename
InT
,
typename
IdT
,
typename
Functor
,
int
VecSize
,
int
IsBoundary
>
struct
SelectCaller
<
OutT
,
MT
,
InT
,
IdT
,
Functor
,
VecSize
,
IsBoundary
,
1
>
{
// masked_select
__device__
void
inline
operator
()(
OutT
*
store_data
,
struct
SelectCaller
<
OutT
,
MT
,
InT
,
Functor
,
VecSize
,
IsBoundary
,
1
>
{
__device__
void
inline
operator
()(
OutT
*
out
,
const
MT
*
mask_data
,
const
InT
*
in
,
Functor
func
,
int
num
,
int
data_offset
)
{
int
data_offset
,
int
store_num
,
int
thread_fix
,
int
num
)
{
InT
in_data
[
VecSize
];
OutT
store_data
[
VecSize
*
phi
::
DDim
::
kMaxRank
];
kps
::
ReadData
<
InT
,
VecSize
,
1
,
1
,
IsBoundary
>
(
&
in_data
[
0
],
in
,
num
);
// Get store data according to mask_idt
kps
::
OperatorTernary
<
MT
,
InT
,
OutT
,
Functor
>
(
store_data
,
mask_data
,
&
in_data
[
0
],
func
,
VecSize
);
kps
::
details
::
WriteData
<
OutT
>
(
out
+
thread_fix
,
&
store_data
[
0
],
store_num
);
}
};
// masked_select_grad
template
<
typename
OutT
,
typename
MT
,
typename
InT
,
typename
Functor
,
int
VecSize
,
int
IsBoundary
>
struct
SelectCaller
<
OutT
,
MT
,
InT
,
Functor
,
VecSize
,
IsBoundary
,
2
>
{
__device__
void
inline
operator
()(
OutT
*
out
,
const
MT
*
mask_data
,
const
InT
*
in
,
Functor
func
,
int
data_offset
,
int
store_num
,
int
thread_fix
,
int
num
)
{
InT
in_data
[
VecSize
];
OutT
store_data
[
VecSize
*
phi
::
DDim
::
kMaxRank
];
kps
::
details
::
ReadData
<
InT
>
(
&
in_data
[
0
],
in
+
thread_fix
,
store_num
);
kps
::
OperatorTernary
<
MT
,
InT
,
OutT
,
Functor
>
(
store_data
,
mask_data
,
&
in_data
[
0
],
func
,
VecSize
);
kps
::
WriteData
<
OutT
,
VecSize
,
1
,
1
,
IsBoundary
>
(
out
,
&
store_data
[
0
],
num
);
}
};
...
...
@@ -253,7 +278,6 @@ SelectKernelImpl(OutT *out,
IdT
num_thread
[
kCVecSize
];
IdT
cumsum_thread
[
kCVecSize
];
OutT
store_data
[
VecSize
*
phi
::
DDim
::
kMaxRank
];
MT
mask_data
[
VecSize
];
IdT
mask_idt
[
VecSize
];
// init data_pr
...
...
@@ -271,17 +295,15 @@ SelectKernelImpl(OutT *out,
// Get cumsum_thread cumsum from 0 to num_thread cumsum_thread[0] is the
// thread_fix
kps
::
Cumsum
<
IdT
,
IdT
,
1
,
Add
>
(
&
cumsum_thread
[
0
],
&
num_thread
[
0
],
Add
());
// Get store data(index) according to mask_idt
SelectCaller
<
OutT
,
MT
,
InT
,
IdT
,
Functor
,
VecSize
,
IsBoundary
,
MaskData
>
compute
;
compute
(
&
store_data
[
0
],
&
mask_data
[
0
],
in
,
func
,
num
,
data_offset
);
// get thread_fix
int
thread_fix
=
(
static_cast
<
int
>
(
cumsum_thread
[
0
]
-
num_thread
[
0
])
*
store_rank
);
// get how many data need to store
int
store_num
=
static_cast
<
int
>
(
num_thread
[
0
])
*
store_rank
;
// thread store num data, each thread may has different num
kps
::
details
::
WriteData
<
OutT
>
(
out
+
thread_fix
,
&
store_data
[
0
],
store_num
);
// Get store data(index) according to mask_idt
SelectCaller
<
OutT
,
MT
,
InT
,
Functor
,
VecSize
,
IsBoundary
,
MaskData
>
select
;
select
(
out
,
mask_data
,
in
,
func
,
data_offset
,
store_num
,
thread_fix
,
num
);
}
template
<
typename
MT
,
...
...
@@ -303,15 +325,17 @@ __global__ void SelectKernel(OutT *out,
int
stride
=
BLOCK_NUM_X
*
GRID_NUM_X
*
VecSize
;
int
repeat
=
0
;
int
size
=
VecSize
*
BLOCK_ID_X
;
CT
block_store_offset
=
0
;
for
(;
data_offset
<
main_offset
;
data_offset
+=
stride
)
{
// Cumsum index
int
idx_cumsum
=
repeat
*
GRID_NUM_X
+
BLOCK_ID_X
;
// niuliling todo: us ReadData API
int
block_store_offset
=
cumsum
[
idx_cumsum
];
kps
::
details
::
ReadData
<
CT
>
(
&
block_store_offset
,
cumsum
+
idx_cumsum
,
1
);
int
out_fix
=
MaskData
<
2
?
block_store_offset
*
store_rank
:
data_offset
;
int
in_fix
=
MaskData
<
2
?
data_offset
:
block_store_offset
*
store_rank
;
SelectKernelImpl
<
InT
,
MT
,
OutT
,
Functor
,
VecSize
,
MaskData
,
false
>
(
out
+
block_store_offset
*
store_rank
,
out
+
out_fix
,
mask
+
data_offset
,
in
+
data_offset
,
in
+
in_fix
,
func
,
size
,
data_offset
,
...
...
@@ -323,12 +347,13 @@ __global__ void SelectKernel(OutT *out,
if
(
num
>
0
)
{
// Cumsum index
int
idx_cumsum
=
repeat
*
GRID_NUM_X
+
BLOCK_ID_X
;
// niuliling todo: us ReadData API
int
block_store_offset
=
static_cast
<
int
>
(
cumsum
[
idx_cumsum
]);
kps
::
details
::
ReadData
<
CT
>
(
&
block_store_offset
,
cumsum
+
idx_cumsum
,
1
);
int
out_fix
=
MaskData
<
2
?
block_store_offset
*
store_rank
:
data_offset
;
int
in_fix
=
MaskData
<
2
?
data_offset
:
block_store_offset
*
store_rank
;
SelectKernelImpl
<
InT
,
MT
,
OutT
,
Functor
,
VecSize
,
MaskData
,
true
>
(
out
+
block_store_offset
*
store_rank
,
out
+
out_fix
,
mask
+
data_offset
,
in
+
data_offset
,
in
+
in_fix
,
func
,
num
,
data_offset
,
...
...
@@ -402,6 +427,7 @@ void SelectKernel(const KPDevice &dev_ctx,
const
int
kCumVesize
=
2
;
const
int
block_c
=
256
;
const
int
main_offset_c
=
Floor
(
size_count_block
,
(
kCumVesize
*
block_c
));
using
Add
=
kps
::
AddFunctor
<
CT
>
;
CumsumOneBlock
<
CT
,
CT
,
Add
,
kCumVesize
><<<
1
,
block_c
,
0
,
stream
>>>
(
count_data
,
cumsum_data
,
size_count_block
,
main_offset_c
,
Add
());
...
...
@@ -418,10 +444,13 @@ void SelectKernel(const KPDevice &dev_ctx,
dev_ctx
.
Wait
();
// 3.1.2 allock for out with total_true_num
std
::
vector
<
int64_t
>
out_dim
=
{
static_cast
<
int64_t
>
(
total_true_num
)};
if
(
SelectData
==
0
)
{
// where_index
if
(
SelectData
==
1
)
{
out
->
Resize
(
phi
::
make_ddim
(
out_dim
));
}
else
if
(
SelectData
==
0
)
{
// == 0 where_index
out_dim
.
push_back
(
rank
);
out
->
Resize
(
phi
::
make_ddim
(
out_dim
));
}
out
->
Resize
(
phi
::
make_ddim
(
out_dim
));
auto
out_data
=
out
->
mutable_data
<
OutT
>
(
cuda_place
);
// 3.2 get true data's index according to cond_data and cumsum_data
if
(
total_true_num
<=
0
)
return
;
...
...
paddle/phi/kernels/gpu/masked_select_grad_kernel.cu
浏览文件 @
f53db251
...
...
@@ -17,38 +17,31 @@
#include <thrust/reverse.h>
#include <thrust/scan.h>
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/select_impl.cu.h"
#include "paddle/phi/kernels/masked_select_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace
phi
{
__global__
void
SetMaskArrayT
(
const
bool
*
mask
,
int32_t
*
mask_array
,
int
size
)
{
int
idx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
for
(;
idx
<
size
;
idx
+=
blockDim
.
x
*
gridDim
.
x
)
{
if
(
mask
[
idx
])
mask_array
[
idx
]
=
1
;
else
mask_array
[
idx
]
=
0
;
}
}
template
<
typename
MT
,
typename
InT
,
typename
OutT
>
struct
MaskedSelectGradFunctor
{
HOSTDEVICE
MaskedSelectGradFunctor
()
{}
template
<
typename
T
>
__global__
void
SelectGradWithPrefixMask
(
const
int32_t
*
mask_prefix_sum
,
const
bool
*
mask
,
const
T
*
input
,
T
*
out
,
int
size
)
{
int
idx
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
for
(;
idx
<
size
;
idx
+=
blockDim
.
x
*
gridDim
.
x
)
{
if
(
mask
[
idx
])
{
int
index
=
mask_prefix_sum
[
idx
];
out
[
idx
]
=
input
[
index
];
}
else
{
out
[
idx
]
=
0
;
HOSTDEVICE
inline
void
operator
()(
OutT
*
out
,
const
MT
*
mask
,
const
InT
*
value
,
int
num
)
{
int
read_fix
=
0
;
for
(
int
idx
=
0
;
idx
<
num
;
idx
++
)
{
if
(
mask
[
idx
])
{
out
[
idx
]
=
value
[
read_fix
++
];
}
else
{
out
[
idx
]
=
0
;
}
}
}
}
}
;
template
<
typename
T
,
typename
Context
>
void
MaskedSelectGradKernel
(
const
Context
&
dev_ctx
,
...
...
@@ -56,42 +49,12 @@ void MaskedSelectGradKernel(const Context& dev_ctx,
const
DenseTensor
&
x
,
const
DenseTensor
&
mask
,
DenseTensor
*
x_grad
)
{
auto
*
mask_data
=
mask
.
data
<
bool
>
();
auto
*
input_data
=
out_grad
.
data
<
T
>
();
auto
*
out_data
=
x_grad
->
mutable_data
<
T
>
(
dev_ctx
.
GetPlace
());
auto
input_size
=
out_grad
.
numel
();
auto
mask_size
=
mask
.
numel
();
auto
mask_dim
=
mask
.
dims
();
auto
out_size
=
mask_size
;
DenseTensor
mask_array
;
DenseTensor
mask_prefix_sum
;
mask_array
.
Resize
(
mask_dim
);
mask_prefix_sum
.
Resize
(
mask_dim
);
int32_t
*
mask_array_data
=
mask_array
.
mutable_data
<
int32_t
>
(
dev_ctx
.
GetPlace
());
int32_t
*
mask_prefix_sum_data
=
mask_prefix_sum
.
mutable_data
<
int32_t
>
(
dev_ctx
.
GetPlace
());
int
threads
=
512
;
int
grid
=
(
mask_size
+
threads
-
1
)
/
threads
;
auto
stream
=
dev_ctx
.
stream
();
SetMaskArrayT
<<<
grid
,
threads
,
0
,
stream
>>>
(
mask_data
,
mask_array_data
,
mask_size
);
thrust
::
device_ptr
<
int32_t
>
mask_array_dev_ptr
=
thrust
::
device_pointer_cast
(
mask_array_data
);
thrust
::
device_vector
<
int32_t
>
mask_array_vec
(
mask_array_dev_ptr
,
mask_array_dev_ptr
+
mask_size
);
thrust
::
exclusive_scan
(
thrust
::
device
,
mask_array_vec
.
begin
(),
mask_array_vec
.
end
(),
mask_prefix_sum_data
);
SelectGradWithPrefixMask
<
T
><<<
grid
,
threads
,
0
,
stream
>>>
(
mask_prefix_sum_data
,
mask_data
,
input_data
,
out_data
,
mask_size
);
auto
*
out_data
=
x_grad
->
mutable_data
<
T
>
(
dev_ctx
.
GetPlace
());
if
(
mask_size
<=
0
)
return
;
using
Functor
=
MaskedSelectGradFunctor
<
bool
,
T
,
T
>
;
phi
::
funcs
::
SelectKernel
<
bool
,
T
,
T
,
2
,
Functor
>
(
dev_ctx
,
mask
,
out_grad
,
x_grad
,
Functor
());
}
}
// namespace phi
...
...
paddle/phi/kernels/gpu/masked_select_kernel.cu
浏览文件 @
f53db251
...
...
@@ -17,11 +17,12 @@
#include <thrust/reverse.h>
#include <thrust/scan.h>
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/select_impl.cu.h"
#include "paddle/phi/kernels/masked_select_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace
phi
{
template
<
typename
MT
,
typename
InT
,
typename
OutT
>
...
...
paddle/phi/kernels/gpu/where_index_kernel.cu
浏览文件 @
f53db251
...
...
@@ -20,13 +20,14 @@
namespace
cub
=
hipcub
;
#endif
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/select_impl.cu.h"
#include "paddle/phi/kernels/where_index_kernel.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
namespace
phi
{
template
<
typename
T1
,
typename
T2
,
typename
OutT
>
struct
IndexFunctor
{
...
...
paddle/phi/kernels/primitive/datamover_primitives.h
浏览文件 @
f53db251
...
...
@@ -123,6 +123,15 @@ __device__ __forceinline__ void WriteData(T* dst,
dst
[
i
]
=
src
[
i
];
}
}
template
<
typename
T
>
__device__
__forceinline__
void
ReadData
(
T
*
dst
,
const
T
*
__restrict__
src
,
int
num
)
{
for
(
int
i
=
0
;
i
<
num
;
i
++
)
{
dst
[
i
]
=
src
[
i
];
}
}
#undef INT_BITS
}
// namespace details
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录