Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
e5c59fc9
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
e5c59fc9
编写于
3月 14, 2022
作者:
Z
zmxdream
提交者:
GitHub
3月 14, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[GPUPS]fix instag lod information (#40483)
上级
e553f758
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
2 addition
and
60 deletion
+2
-60
paddle/fluid/operators/filter_by_instag_op.cu
paddle/fluid/operators/filter_by_instag_op.cu
+2
-60
未找到文件。
paddle/fluid/operators/filter_by_instag_op.cu
浏览文件 @
e5c59fc9
...
...
@@ -96,30 +96,6 @@ __global__ void filter_copy_fuse_kernel(
if
(
N
<
ins_end
)
ins_end
=
N
;
/*
if (!x1_lods_filled) {
for (int p = ins_start; p < ins_end; p++) {
x1_lods_data[p] = p;
}
if (idx == 0) {
x1_lods_data[N] = N;
}
}
if (!x2_lods_filled) {
for (int p = ins_start; p < ins_end; p++) {
x2_lods_data[p] = p;
}
if (idx == 0) {
x2_lods_data[N] = N;
}
}
if (!x1_lods_filled || !x2_lods_filled) {
b.sync();
}
*/
int
flag_data
[
5
];
int
prefix_sum_data
[
5
];
int
prefix_sum_data2
[
5
];
...
...
@@ -173,8 +149,6 @@ __global__ void filter_copy_fuse_kernel(
local_addr
=
prefix_sum_data
[
ins_end
-
1
-
ins_start
];
sum_addr
=
local_addr
;
// flag
// local_flag = 0;
for
(
int
p
=
ins_start
;
p
<
ins_end
;
p
++
)
{
local_flag
+=
flag_data
[
p
-
ins_start
];
}
...
...
@@ -188,7 +162,6 @@ __global__ void filter_copy_fuse_kernel(
sum_out_lods
=
local_out_lods
;
}
// 32 threads
for
(
int
i
=
1
;
i
<
warp_thread_num
;
i
*=
2
)
{
int
temp_addr
=
g
.
shfl_up
(
sum_addr
,
i
);
int
temp_flag
=
g
.
shfl_up
(
sum_flag
,
i
);
...
...
@@ -266,27 +239,16 @@ __global__ void filter_copy_fuse_kernel(
if
(
ins_start
<
ins_end
)
{
int
out_lods_idx
=
p_flag
+
1
;
// ins_start = 1
// BUG fix
for
(
int
p
=
ins_start
;
p
<
ins_end
;
p
++
)
{
if
(
flag_data
[
p
-
ins_start
]
==
1
)
{
// batch_len = 2
// batch_len = 4
size_t
batch_len
=
x1_lods_data
[
p
+
1
]
-
x1_lods_data
[
p
];
// t = 0
// t = 1
int
t
=
out_lods_idx
-
1
;
// out_lods_data[0] = 0;
int
previous
;
if
(
out_lods_idx
==
p_flag
+
1
)
{
// out_lods_data[t] = p_out_lods;
previous
=
p_out_lods
;
}
else
{
previous
=
out_lods_data
[
t
];
}
map_data
[
t
*
3
]
=
(
int64_t
)
previous
;
map_data
[
t
*
3
+
1
]
=
x1_lods_data
[
p
];
map_lods_data
[
t
]
=
t
;
...
...
@@ -300,7 +262,6 @@ __global__ void filter_copy_fuse_kernel(
if
(
sum_out_lods4
>
1
)
{
int
out_data_num
=
sum_out_lods4
-
1
;
int
out_start
=
ins_start
;
if
(
out_start
<
out_data_num
)
{
int
out_end
=
ins_end
>=
out_data_num
?
out_data_num
:
ins_end
;
for
(
int
p
=
out_start
;
p
<
out_end
;
p
++
)
{
...
...
@@ -314,11 +275,8 @@ __global__ void filter_copy_fuse_kernel(
if
(
flag_data
[
p
-
ins_start
]
==
1
)
{
auto
output_start_idx
=
prefix_sum_data2
[
p
-
ins_start
];
T
*
dst
=
out_data
+
output_start_idx
*
x1_embed_size
;
const
T
*
src_start
=
x1_data
+
x1_lods_data
[
p
]
*
x1_embed_size
;
const
T
*
src_end
=
x1_data
+
x1_lods_data
[
p
+
1
]
*
x1_embed_size
;
// optimized
for
(
const
T
*
j
=
src_start
;
j
!=
src_end
;
dst
++
,
j
++
)
{
*
dst
=
*
j
;
}
...
...
@@ -338,12 +296,10 @@ __global__ void copy_grad_kernel(const size_t N, const int ins_per_thread,
int
idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
ins_start
=
idx
*
ins_per_thread
;
int
ins_end
=
(
idx
+
1
)
*
ins_per_thread
;
if
(
ins_start
>=
N
)
{
return
;
}
if
(
ins_end
>
N
)
ins_end
=
N
;
for
(
int
p
=
ins_start
;
p
<
ins_end
;
p
++
)
{
T
*
dst
=
x1_grad_data
+
map_data
[
p
*
3
+
1
]
*
x1_embed_size
;
const
T
*
src_start
=
out_grad_data
+
map_data
[
p
*
3
]
*
x1_embed_size
;
...
...
@@ -394,21 +350,17 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> {
const
Tensor
*
x3
=
context
.
Input
<
Tensor
>
(
"Filter_tag"
);
const
int64_t
*
x3_data
=
x3
->
data
<
int64_t
>
();
// int x2_lods_filled = 1;
Vector
<
size_t
>
x2_lods
;
// Vector, in GPU
if
(
x2
->
lod
().
size
()
!=
0
)
{
// lod_level = 1
x2_lods
=
x2
->
lod
()[
0
];
// x2_lods_filled = 1;
}
else
{
// lod_level = 0
const
size_t
x2_lods_size
=
x2
->
dims
()[
0
];
const
size_t
instag_per_num
=
x2
->
dims
()[
1
];
// x2_lods.resize(x2->dims()[0] + 1);
// move to cuda
x2_lods
.
push_back
(
0
);
for
(
size_t
i
=
0
;
i
<
x2_lods_size
;
i
++
)
{
x2_lods
.
push_back
(
i
+
1
);
x2_lods
.
push_back
(
x2_lods
.
back
()
+
instag_per_num
);
}
}
...
...
@@ -417,13 +369,8 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> {
size_t
*
x2_lods_data
=
mixv_x2_lods
.
CUDAMutableData
(
gpu_place
);
// Vector, in GPU
// int x1_lods_filled = 1;
Vector
<
size_t
>
x1_lods
;
if
(
!
is_x1_lod
)
{
// move to cuda
// x1_lods.resize(x1->dims()[0] + 1);
x1_lods
.
push_back
(
0
);
for
(
int
i
=
0
;
i
<
x1
->
dims
()[
0
];
i
++
)
{
x1_lods
.
push_back
(
i
+
1
);
...
...
@@ -432,7 +379,6 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> {
// x1_lods = context.Input<LoDTensor>("Ins")->lod()[0];
// new: lod_level=0 => lod() return {}
if
(
x1
->
lod
().
size
()
!=
0
)
{
// lod_level = 1
// x1_lods_filled = 1;
x1_lods
=
x1
->
lod
()[
0
];
}
else
{
// lod_level = 0
// x1_lods.resize(x1->dims()[0] + 1);
...
...
@@ -458,10 +404,6 @@ class FilterByInstagGPUKernel : public framework::OpKernel<T> {
LoDTensor
*
loss_weight
=
context
.
Output
<
LoDTensor
>
(
"LossWeight"
);
int
out_first
=
x1_lods
.
back
();
// int out_first = x1->dims()[0];
// if (x1_lods_filled) {
// out_first = x1_lods.back();
// }
out
->
Resize
(
phi
::
make_ddim
({(
int64_t
)
out_first
,
(
int64_t
)
x1_embed_size
}));
map
->
Resize
(
phi
::
make_ddim
({(
int64_t
)
x2_lods_size
,
3
}));
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录