BaiXuePrincess / Paddle (fork of PaddlePaddle / Paddle)
Commit ae867a84 (unverified)
Authored June 22, 2022 by Haohongxiang; committed via GitHub on June 22, 2022.
[Dygraph] Fix bugs of supporting ProcessGroupNCCL on DCU (#43682)
* fix bugs
* update
* update
* update
* code style
* code style check
Parent commit: 292b7254
Showing 3 changed files with 523 additions and 248 deletions.
paddle/fluid/pybind/CMakeLists.txt       +1    −1
paddle/fluid/pybind/distributed_py.cc    +121  −55
paddle/fluid/pybind/eager_method.cc      +401  −192
paddle/fluid/pybind/CMakeLists.txt

@@ -129,7 +129,7 @@ endif()
if(NOT ON_INFER)
  set(PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer)
-  if(WITH_NCCL)
+  if(WITH_NCCL OR WITH_RCCL)
    set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl)
    if(WITH_PSCORE)
      set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_heter)
paddle/fluid/pybind/distributed_py.cc

@@ -31,7 +31,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/phi/api/all.h"

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#endif
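The functional change in this hunk is the widened guard: presumably a ROCm/DCU build defines PADDLE_WITH_RCCL rather than PADDLE_WITH_NCCL, so an NCCL-only guard would compile the ProcessGroupNCCL include (and the bindings below) out of the module. A minimal sketch of that guard pattern, not Paddle source; the helper macro and program are hypothetical:

// Sketch: why an NCCL-only guard breaks DCU builds.
// Compile with -DPADDLE_WITH_RCCL (or -DPADDLE_WITH_NCCL) to flip the flag.
#include <cstdio>

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#define HAS_NCCL_STYLE_PROCESS_GROUP 1  // hypothetical flag, for illustration only
#else
#define HAS_NCCL_STYLE_PROCESS_GROUP 0
#endif

int main() {
  std::printf("ProcessGroupNCCL bindings enabled: %d\n",
              HAS_NCCL_STYLE_PROCESS_GROUP);
  return 0;
}

With neither macro defined the sketch prints 0, mirroring how the old guard silently dropped the NCCL-style process group on DCU.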
@@ -61,11 +61,15 @@ std::shared_ptr<distributed::EagerReducer> CreateEagerReducer(
    const std::vector<std::vector<size_t>> &group_indices,
    const std::vector<bool> &is_sparse_gradient,
    std::shared_ptr<distributed::ProcessGroup> process_group,
    const std::vector<size_t> &group_size_limits,
    bool find_unused_parameters) {
  auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0);
  return std::make_shared<distributed::EagerReducer>(
      params, group_indices, is_sparse_gradient, process_group,
      group_size_limits, find_unused_parameters);
}

#if defined(PADDLE_WITH_GLOO)
@@ -111,7 +115,8 @@ void BindDistributed(py::module *m) {
      .def("name", &distributed::ProcessGroup::GetBackendName)
      .def("allreduce",
           [](distributed::ProcessGroup &self, py::handle py_tensor,
              distributed::ReduceOp op) {
             auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
             distributed::AllreduceOptions opts;
@@ -121,12 +126,14 @@ void BindDistributed(py::module *m) {
             std::vector<phi::DenseTensor> tensors = {*dense};
             return self.AllReduce(tensors, tensors, opts);
           },
           py::arg("tensor"), py::arg("op") = distributed::ReduceOp::SUM,
           py::call_guard<py::gil_scoped_release>())

      .def("broadcast",
           [](distributed::ProcessGroup &self, py::handle py_tensor,
              int source_rank) {
             auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
             distributed::BroadcastOptions opts;
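The collective bindings in this file all follow one pybind11 pattern: a lambda receives the Python tensor as a py::handle, casts it, fills an options struct, and forwards to the ProcessGroup method; the binding is registered with named arguments, defaults (e.g. op = ReduceOp::SUM), and py::call_guard<py::gil_scoped_release>() so the GIL is released around the collective call. A minimal self-contained sketch of that registration pattern; DemoGroup and demo_pg are illustrative stand-ins, not Paddle APIs:

// Sketch of the pybind11 registration pattern used by the bindings above.
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include <vector>

namespace py = pybind11;

struct DemoGroup {
  // Stand-in for a collective call such as AllReduce.
  double allreduce_sum(const std::vector<double> &vals) const {
    double sum = 0.0;
    for (double v : vals) sum += v;  // long-running work runs without the GIL
    return sum;
  }
};

PYBIND11_MODULE(demo_pg, m) {
  py::class_<DemoGroup>(m, "DemoGroup")
      .def(py::init<>())
      .def("allreduce_sum",
           &DemoGroup::allreduce_sum,
           py::arg("vals") = std::vector<double>{},    // keyword arg with default
           py::call_guard<py::gil_scoped_release>());  // drop the GIL during the call
}

From Python the method would then be callable as DemoGroup().allreduce_sum(vals=[1.0, 2.0]), with the GIL released while the C++ body runs.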
@@ -136,7 +143,8 @@ void BindDistributed(py::module *m) {
             std::vector<phi::DenseTensor> tensors = {*dense};
             return self.Broadcast(tensors, tensors, opts);
           },
           py::arg("tensor"), py::arg("source_rank"),
           py::call_guard<py::gil_scoped_release>())

      .def(
@@ -151,7 +159,8 @@ void BindDistributed(py::module *m) {
      .def("send",
           [](distributed::ProcessGroup &self, py::handle py_tensor,
              int dst) {
             auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
             auto dense =
@@ -159,12 +168,14 @@ void BindDistributed(py::module *m) {
             std::vector<phi::DenseTensor> tensors = {*dense};
             return self.Send(tensors, dst);
           },
           py::arg("tensor"), py::arg("dst"),
           py::call_guard<py::gil_scoped_release>())

      .def("recv",
           [](distributed::ProcessGroup &self, py::handle py_tensor,
              int src) {
             auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
             auto dense =
@@ -172,12 +183,14 @@ void BindDistributed(py::module *m) {
             std::vector<phi::DenseTensor> tensors = {*dense};
             return self.Recv(tensors, src);
           },
           py::arg("tensor"), py::arg("src"),
           py::call_guard<py::gil_scoped_release>())

      .def("all_gather",
           [](distributed::ProcessGroup &self, py::handle py_in_tensor,
              py::handle py_out_tensor) {
             auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
             auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
@@ -189,12 +202,14 @@ void BindDistributed(py::module *m) {
             std::vector<phi::DenseTensor> out_tensors = {*out_dense};
             return self.AllGather(in_tensors, out_tensors);
           },
           py::arg("in"), py::arg("out"),
           py::call_guard<py::gil_scoped_release>())

      .def("alltoall",
           [](distributed::ProcessGroup &self, py::handle py_in_tensor,
              py::handle py_out_tensor) {
             auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
             auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
@@ -206,13 +221,16 @@ void BindDistributed(py::module *m) {
             std::vector<phi::DenseTensor> out_tensors = {*out_dense};
             return self.AllToAll(in_tensors, out_tensors);
           },
           py::arg("in"), py::arg("out"),
           py::call_guard<py::gil_scoped_release>())

      .def("reduce",
           [](distributed::ProcessGroup &self, py::handle py_in_tensor,
              int dst, distributed::ReduceOp op) {
             auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
             distributed::ReduceOptions opts;
             opts.reduce_op = op;
@@ -222,14 +240,17 @@ void BindDistributed(py::module *m) {
             std::vector<phi::DenseTensor> tensors = {*dense};
             return self.Reduce(tensors, tensors, opts);
           },
           py::arg("tensor"), py::arg("dst"),
           py::arg("op") = distributed::ReduceOp::SUM,
           py::call_guard<py::gil_scoped_release>())

      .def("scatter",
           [](distributed::ProcessGroup &self, py::handle py_in_tensor,
              py::handle py_out_tensor, int src) {
             auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
             auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
             distributed::ScatterOptions opts;
@@ -242,17 +263,25 @@ void BindDistributed(py::module *m) {
             std::vector<phi::DenseTensor> out_tensors = {*out_dense};
             return self.Scatter(in_tensors, out_tensors, opts);
           },
           py::arg("in"), py::arg("out"), py::arg("src"),
           py::call_guard<py::gil_scoped_release>());

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
  py::class_<distributed::ProcessGroupNCCL,
             std::shared_ptr<distributed::ProcessGroupNCCL>>(
      *m, "ProcessGroupNCCL", ProcessGroup)
      .def(py::init<const std::shared_ptr<distributed::Store> &, int, int,
                    const platform::CUDAPlace &, int>(),
           py::arg("store"), py::arg("rank"), py::arg("world_size"),
           py::arg("place"), py::arg("group_id") = 0,
           py::call_guard<py::gil_scoped_release>());
#endif
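The ProcessGroupNCCL registration above uses py::init<...> with a shared_ptr holder, keyword names for every constructor argument, and a defaulted trailing group_id. A minimal sketch of that constructor-binding shape under the same conventions; DemoStore, DemoProcessGroup, and demo_bindings are illustrative, not Paddle types:

// Sketch of the py::init constructor-binding pattern used above.
#include <pybind11/pybind11.h>

#include <memory>
#include <string>
#include <utility>

namespace py = pybind11;

struct DemoStore {
  explicit DemoStore(std::string host) : host(std::move(host)) {}
  std::string host;
};

struct DemoProcessGroup {
  DemoProcessGroup(const std::shared_ptr<DemoStore> &store, int rank,
                   int world_size, int group_id)
      : store(store), rank(rank), world_size(world_size), group_id(group_id) {}

  std::shared_ptr<DemoStore> store;
  int rank, world_size, group_id;
};

PYBIND11_MODULE(demo_bindings, m) {
  py::class_<DemoStore, std::shared_ptr<DemoStore>>(m, "DemoStore")
      .def(py::init<std::string>(), py::arg("host"));

  // Same shape as the ProcessGroupNCCL binding: list the constructor signature
  // in py::init<...>, name each argument, and default the trailing group id.
  py::class_<DemoProcessGroup, std::shared_ptr<DemoProcessGroup>>(
      m, "DemoProcessGroup")
      .def(py::init<const std::shared_ptr<DemoStore> &, int, int, int>(),
           py::arg("store"), py::arg("rank"), py::arg("world_size"),
           py::arg("group_id") = 0,
           py::call_guard<py::gil_scoped_release>());
}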
@@ -261,29 +290,53 @@ void BindDistributed(py::module *m) {
  py::class_<distributed::ProcessGroupHeter,
             std::shared_ptr<distributed::ProcessGroupHeter>>(
      *m, "ProcessGroupHeter", ProcessGroup)
      .def(py::init<const std::shared_ptr<distributed::Store> &, int, int,
#if defined(PADDLE_WITH_ASCEND_CL)
                    const platform::NPUPlace &,
#else
                    const platform::CUDAPlace &,
#endif
                    int, int, int, int, int, bool, std::string, int, int>(),
           py::arg("store"), py::arg("rank"), py::arg("world_size"),
           py::arg("place"), py::arg("gid") = 0, py::arg("local_rank") = 0,
           py::arg("local_size") = 1, py::arg("gloo_rank") = 0,
           py::arg("gloo_size") = 1, py::arg("with_switch") = false,
           py::arg("switch_endpoint") = "", py::arg("src_rank") = "",
           py::arg("dst_rank") = "",
           py::call_guard<py::gil_scoped_release>());
#endif

#if defined(PADDLE_WITH_ASCEND_CL)
  py::class_<distributed::ProcessGroupHCCL,
             std::shared_ptr<distributed::ProcessGroupHCCL>>(
      *m, "ProcessGroupHCCL", ProcessGroup)
      .def(py::init<const std::shared_ptr<distributed::Store> &, int, int,
                    const platform::NPUPlace &, int>(),
           py::arg("store"), py::arg("rank"), py::arg("world_size"),
           py::arg("place"), py::arg("group_id") = 0,
           py::call_guard<py::gil_scoped_release>());
#endif
@@ -291,22 +344,29 @@ void BindDistributed(py::module *m) {
  py::class_<distributed::ProcessGroup::Task,
             std::shared_ptr<distributed::ProcessGroup::Task>>(*m, "task")
      .def("is_completed", &distributed::ProcessGroup::Task::IsCompleted)
      .def("wait", &distributed::ProcessGroup::Task::Wait,
           py::arg("timeout") = kWaitTimeout,
           py::call_guard<py::gil_scoped_release>())
      .def("synchronize", &distributed::ProcessGroup::Task::Synchronize,
           py::call_guard<py::gil_scoped_release>());

#if defined(PADDLE_WITH_GLOO)
  py::class_<ProcessGroupGloo, std::shared_ptr<ProcessGroupGloo>>(
      *m, "ProcessGroupGloo", ProcessGroup)
      .def(py::init<const std::shared_ptr<paddle::distributed::Store> &, int,
                    int, const platform::CPUPlace &, int,
                    std::shared_ptr<GlooOptions> &>(),
           py::call_guard<py::gil_scoped_release>())
      .def(py::init([](const std::shared_ptr<paddle::distributed::Store> &store,
                       int rank, int world_size,
                       const platform::CPUPlace &place, int gid) {
             auto opts = GlooOptions::create();
             char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str());
             if (ifname && strlen(ifname) > 1) {
@@ -315,11 +375,14 @@ void BindDistributed(py::module *m) {
             } else {
               opts->device = ProcessGroupGloo::createDefaultDevice();
             }
             return std::make_shared<ProcessGroupGloo>(
                 store, rank, world_size, place, gid, opts);
           }),
           py::arg("store"), py::arg("rank"), py::arg("world_size"),
           py::arg("place"), py::arg("group_id") = 0,
           py::call_guard<py::gil_scoped_release>())
      .def_static("create_default_device",
                  &ProcessGroupGloo::createDefaultDevice);
@@ -327,21 +390,23 @@ void BindDistributed(py::module *m) {
  m->def("eager_assign_group_by_size",
         [](py::handle py_tensors, std::vector<bool> is_sparse_gradient,
            std::vector<size_t> group_size_limits,
            std::vector<int64_t> tensor_indices) {
           auto tensors = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0);
           return distributed::Eager_AssignGroupBySize(
               tensors, is_sparse_gradient, group_size_limits, tensor_indices);
         },
         py::arg("tensors"), py::arg("is_sparse_gradient"),
         py::arg("group_size_limits") = std::vector<size_t>{25 * 1024 * 1024},
         py::arg("tensor_indices") = std::vector<int64_t>{},
         py::call_guard<py::gil_scoped_release>());

  py::class_<distributed::EagerReducer,
             std::shared_ptr<distributed::EagerReducer>>(
      *m, "EagerReducer", R"DOC()DOC")
      .def(py::init(&CreateEagerReducer))
      .def("prepare_for_backward",
@@ -349,7 +414,8 @@ void BindDistributed(py::module *m) {
             auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0);
             self.PrepareForBackward(params);
           },
           py::arg("tensors"),
           py::call_guard<py::gil_scoped_release>());
}

}  // end namespace pybind
paddle/fluid/pybind/eager_method.cc

This diff is collapsed (+401 −192; see the summary above).