Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
ea9684f1
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
ea9684f1
编写于
3月 26, 2022
作者:
Y
Yiqun Liu
提交者:
GitHub
3月 26, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Optimize the CPU -> GPU memcpy and avoid explicit sync in some operators. (#40933)
上级
3a6201af
变更
6
显示空白变更内容
内联
并排
Showing
6 changed files
with
22 additions
and
47 deletions
+22
-47
paddle/phi/kernels/funcs/gather.cu.h
paddle/phi/kernels/funcs/gather.cu.h
+3
-12
paddle/phi/kernels/funcs/scatter.cu.h
paddle/phi/kernels/funcs/scatter.cu.h
+3
-14
paddle/phi/kernels/gpu/index_select_grad_kernel.cu
paddle/phi/kernels/gpu/index_select_grad_kernel.cu
+0
-2
paddle/phi/kernels/gpu/index_select_kernel.cu
paddle/phi/kernels/gpu/index_select_kernel.cu
+0
-2
paddle/phi/kernels/gpu/scatter_nd_add_kernel.cu
paddle/phi/kernels/gpu/scatter_nd_add_kernel.cu
+1
-1
paddle/phi/kernels/gpu/where_index_kernel.cu
paddle/phi/kernels/gpu/where_index_kernel.cu
+15
-16
未找到文件。
paddle/phi/kernels/funcs/gather.cu.h
浏览文件 @
ea9684f1
...
...
@@ -21,7 +21,6 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/utils/dim.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace
phi
{
...
...
@@ -44,7 +43,7 @@ __global__ void GatherCUDAKernel(const T* params,
template
<
typename
T
,
typename
IndexT
=
int
>
__global__
void
GatherNdCUDAKernel
(
const
T
*
input
,
const
int64_t
*
input_dims
,
const
Dim
<
DDim
::
kMaxRank
>
input_dims
,
const
IndexT
*
indices
,
T
*
output
,
size_t
remain_size
,
...
...
@@ -149,19 +148,11 @@ void GPUGatherNd(const phi::GPUContext& ctx,
slice_size
*=
input_dims
[
i
];
}
// source dim
std
::
vector
<
int64_t
>
v_input_dims
(
input_dims_size
)
;
Dim
<
DDim
::
kMaxRank
>
g_input_dims
;
for
(
int
i
=
0
;
i
<
input_dims_size
;
++
i
)
{
v
_input_dims
[
i
]
=
input_dims
[
i
];
g
_input_dims
[
i
]
=
input_dims
[
i
];
}
phi
::
DenseTensor
input_dims_tensor
;
input_dims_tensor
.
Resize
({
input_dims_size
});
auto
*
g_input_dims
=
ctx
.
Alloc
<
int64_t
>
(
&
input_dims_tensor
);
int64_t
bytes
=
input_dims_size
*
sizeof
(
int64_t
);
paddle
::
memory
::
Copy
(
gplace
,
g_input_dims
,
cplace
,
v_input_dims
.
data
(),
bytes
,
ctx
.
stream
());
int
block
=
512
;
int64_t
n
=
slice_size
*
remain_numel
;
int64_t
grid
=
(
n
+
block
-
1
)
/
block
;
...
...
paddle/phi/kernels/funcs/scatter.cu.h
浏览文件 @
ea9684f1
...
...
@@ -77,7 +77,7 @@ template <typename T, typename IndexT = int>
__global__
void
ScatterNdCUDAKernel
(
const
T
*
update
,
const
IndexT
*
indices
,
T
*
output
,
const
int64_t
*
output_dims
,
const
Dim
<
DDim
::
kMaxRank
>
output_dims
,
size_t
remain_size
,
size_t
slice_size
,
size_t
end_size
)
{
...
...
@@ -222,23 +222,12 @@ void GPUScatterNdAdd(const phi::GPUContext& ctx,
slice_size
*=
output_dims
[
i
];
}
const
size_t
slice_bytes
=
slice_size
*
sizeof
(
T
);
// put output_dims int CUDA
// gplace and cplace
const
auto
gplace
=
ctx
.
GetPlace
();
auto
cplace
=
phi
::
CPUPlace
();
std
::
vector
<
int64_t
>
v_output_dims
(
output_dims_size
)
;
Dim
<
DDim
::
kMaxRank
>
g_output_dims
;
for
(
int
i
=
0
;
i
<
output_dims_size
;
++
i
)
{
v
_output_dims
[
i
]
=
output_dims
[
i
];
g
_output_dims
[
i
]
=
output_dims
[
i
];
}
phi
::
DenseTensor
out_dims_tensor
;
out_dims_tensor
.
Resize
({
output_dims_size
});
auto
*
g_output_dims
=
ctx
.
Alloc
<
int64_t
>
(
&
out_dims_tensor
);
int64_t
bytes
=
output_dims_size
*
sizeof
(
int64_t
);
paddle
::
memory
::
Copy
(
gplace
,
g_output_dims
,
cplace
,
v_output_dims
.
data
(),
bytes
,
ctx
.
stream
());
int
block
=
512
;
int64_t
n
=
slice_size
*
remain_numel
;
int64_t
grid
=
(
n
+
block
-
1
)
/
block
;
...
...
paddle/phi/kernels/gpu/index_select_grad_kernel.cu
浏览文件 @
ea9684f1
...
...
@@ -109,7 +109,6 @@ void IndexSelectGradKernel(const Context& ctx,
stride
,
size
,
delta
);
phi
::
backends
::
gpu
::
GpuStreamSync
(
stream
);
}
else
{
const
int
*
index_data
=
index
.
data
<
int
>
();
index_select_grad_cuda_kernel
<
T
,
int
><<<
...
...
@@ -124,7 +123,6 @@ void IndexSelectGradKernel(const Context& ctx,
stride
,
size
,
delta
);
phi
::
backends
::
gpu
::
GpuStreamSync
(
stream
);
}
}
...
...
paddle/phi/kernels/gpu/index_select_kernel.cu
浏览文件 @
ea9684f1
...
...
@@ -82,7 +82,6 @@ void IndexSelectKernel(const Context& ctx,
PADDLE_CUDA_NUM_THREADS
,
0
,
stream
>>>
(
in_data
,
out_data
,
index_data
,
numel
,
stride
,
size
,
delta
);
phi
::
backends
::
gpu
::
GpuStreamSync
(
stream
);
}
else
{
const
int
*
index_data
=
index
.
data
<
int
>
();
index_select_cuda_kernel
<
...
...
@@ -92,7 +91,6 @@ void IndexSelectKernel(const Context& ctx,
0
,
stream
>>>
(
in_data
,
out_data
,
index_data
,
numel
,
stride
,
size
,
delta
);
phi
::
backends
::
gpu
::
GpuStreamSync
(
stream
);
}
}
...
...
paddle/phi/kernels/gpu/scatter_nd_add_kernel.cu
浏览文件 @
ea9684f1
...
...
@@ -26,7 +26,7 @@ void ScatterNdAddKernel(const Context &ctx,
const
DenseTensor
&
index
,
const
DenseTensor
&
updates
,
DenseTensor
*
out
)
{
Copy
(
ctx
,
x
,
ctx
.
GetPlace
(),
tru
e
,
out
);
Copy
(
ctx
,
x
,
ctx
.
GetPlace
(),
fals
e
,
out
);
const
auto
&
index_type
=
index
.
dtype
();
bool
index_type_match
=
index_type
==
phi
::
DataType
::
INT32
||
index_type
==
phi
::
DataType
::
INT64
;
...
...
paddle/phi/kernels/gpu/where_index_kernel.cu
浏览文件 @
ea9684f1
...
...
@@ -29,33 +29,32 @@ namespace cub = hipcub;
#include "paddle/phi/core/kernel_registry.h"
namespace
phi
{
template
<
typename
T1
,
typename
T2
,
typename
OutT
>
template
<
typename
MaskT
,
typename
IndexT
,
typename
OutT
>
struct
IndexFunctor
{
T2
stride
[
phi
::
DDim
::
kMaxRank
];
int
dims
;
IndexT
strides
[
phi
::
DDim
::
kMaxRank
];
int
rank
;
explicit
IndexFunctor
(
const
phi
::
DDim
&
in_dims
)
{
dims
=
in_dims
.
size
();
std
::
vector
<
T2
>
strides_in_tmp
;
strides_in_tmp
.
resize
(
dims
,
1
);
// get strides according to in_dims
for
(
T2
i
=
1
;
i
<
dims
;
i
++
)
{
strides_in_tmp
[
i
]
=
strides_in_tmp
[
i
-
1
]
*
in_dims
[
dims
-
i
];
rank
=
in_dims
.
size
();
// Get strides according to in_dims
strides
[
0
]
=
1
;
for
(
IndexT
i
=
1
;
i
<
rank
;
i
++
)
{
strides
[
i
]
=
strides
[
i
-
1
]
*
in_dims
[
rank
-
i
];
}
memcpy
(
stride
,
strides_in_tmp
.
data
(),
dims
*
sizeof
(
T2
));
}
HOSTDEVICE
inline
void
operator
()(
OutT
*
out
,
const
T1
*
mask
,
const
T2
*
index
,
const
MaskT
*
mask
,
const
IndexT
*
index
,
const
int
num
)
{
int
store_fix
=
0
;
for
(
int
idx
=
0
;
idx
<
num
;
idx
++
)
{
if
(
mask
[
idx
])
{
T2
data_index
=
index
[
idx
];
IndexT
data_index
=
index
[
idx
];
// get index
for
(
int
rank_id
=
dims
-
1
;
rank_id
>=
0
;
--
rank_id
)
{
out
[
store_fix
]
=
static_cast
<
OutT
>
(
data_index
/
stride
[
rank_id
]);
data_index
=
data_index
%
stride
[
rank_id
];
for
(
int
rank_id
=
rank
-
1
;
rank_id
>=
0
;
--
rank_id
)
{
out
[
store_fix
]
=
static_cast
<
OutT
>
(
data_index
/
stride
s
[
rank_id
]);
data_index
=
data_index
%
stride
s
[
rank_id
];
store_fix
++
;
}
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录