Support both use_calc_stream and sync_op in send recv APIs (#46023)

Commit ae00f428 (unverified), authored by Wen Sun on Sep 16, 2022, committed via GitHub on Sep 16, 2022. Parent commit: 92e1f64b.

Showing 14 changed files with 922 additions and 40 deletions (+922 −40):
paddle/fluid/distributed/collective/ProcessGroup.h (+41 −9)
paddle/fluid/distributed/collective/ProcessGroupNCCL.cc (+216 −10)
paddle/fluid/distributed/collective/ProcessGroupNCCL.h (+47 −2)
paddle/fluid/distributed/collective/ProcessGroupStream.cc (+84 −0)
paddle/fluid/distributed/collective/ProcessGroupStream.h (+52 −0)
paddle/fluid/pybind/distributed_py.cc (+170 −0)
python/paddle/distributed/communication/stream/__init__.py (+3 −1)
python/paddle/distributed/communication/stream/all_reduce.py (+4 −4)
python/paddle/distributed/communication/stream/recv.py (+82 −0)
python/paddle/distributed/communication/stream/send.py (+82 −0)
python/paddle/fluid/tests/unittests/collective/CMakeLists.txt (+20 −12)
python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py (+68 −0)
python/paddle/fluid/tests/unittests/collective/test_communication_stream_sendrecv_api.py (+50 −0)
python/paddle/fluid/tests/unittests/collective/testslist.csv (+3 −2)
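Taken together, the user-facing result is that paddle.distributed.stream.send and paddle.distributed.stream.recv now accept both sync_op and use_calc_stream. A minimal two-rank sketch, adapted from the docstrings added in this commit (the launch command is an assumption; any two-GPU launcher works):

```python
# demo_sendrecv.py -- run with two ranks, e.g.:
#   python -m paddle.distributed.launch --devices 0,1 demo_sendrecv.py
import paddle
import paddle.distributed as dist

dist.init_parallel_env()
if dist.get_rank() == 0:
    data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]])
    # sync_op=False returns immediately with a waitable task
    task = dist.stream.send(data, dst=1, sync_op=False)
else:
    data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]])
    task = dist.stream.recv(data, src=0, sync_op=False)
task.wait()  # block until the point-to-point transfer finishes
print(data.numpy())  # rank 1 now holds [[4, 5, 6], [4, 5, 6]]
```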
paddle/fluid/distributed/collective/ProcessGroup.h

@@ -134,24 +134,56 @@ class ProcessGroup {

Default-throwing virtual overloads of Send/Recv and Send_Partial/Recv_Partial that accept the new sync_op flag are added, and the fallback error messages are made consistent ("receive" becomes "recv"; the partial variants now name "send_partial"/"recv_partial"). The resulting declarations:

```cpp
  virtual std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>&, int) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support send", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>&, int, bool) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support send with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>&, int) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support recv", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>&, int, bool) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support recv with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
      phi::DenseTensor&,  // NOLINT
      int,
      int,
      int) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support send_partial", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
      phi::DenseTensor&, int, int, int, bool) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support send_partial with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
      phi::DenseTensor&,  // NOLINT
      int,
      int,
      int) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support recv_partial", GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
      phi::DenseTensor&, int, int, int, bool) {  // NOLINT
    PADDLE_THROW(platform::errors::InvalidArgument(
        "ProcessGroup%s does not support recv_partial with sync_op flag",
        GetBackendName()));
  }

  virtual std::shared_ptr<ProcessGroup::Task> AllGather(
```
paddle/fluid/distributed/collective/ProcessGroupNCCL.cc

@@ -51,6 +51,17 @@ — a CreateTask overload that forwards the new flags into NCCLTask:

```cpp
std::shared_ptr<ProcessGroupNCCL::NCCLTask> ProcessGroupNCCL::CreateTask(
    const std::vector<Place>& places,
    int rank,
    CommType comm_type,
    const std::vector<phi::DenseTensor>& inputs,
    bool is_sync,
    bool use_calc_stream) {
  return std::make_shared<ProcessGroupNCCL::NCCLTask>(
      places, rank, comm_type, inputs, is_sync, use_calc_stream);
}
```

@@ -264,10 +275,12 @@ — in Collective, synchronization with the default stream is skipped when the op runs on the calculation stream, and the task is now built through CreateTask:

```diff
   auto& nccl_comms = places_to_ncclcomm_[key];

-  SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
+  if (!use_calc_stream) {
+    SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
+  }

-  auto task = std::make_shared<ProcessGroupNCCL::NCCLTask>(
-      places, rank_, comm_type, inputs, sync_op, use_calc_stream);
+  auto task =
+      CreateTask(places, rank_, comm_type, inputs, sync_op, use_calc_stream);

   platform::CUDADeviceGuard cuda_guard;
```

@@ -406,6 +419,78 @@ — a PointToPoint overload that threads sync_op/use_calc_stream through stream selection, stream-safe-allocator bookkeeping, and event recording:

```cpp
template <typename Fn>
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
    std::vector<phi::DenseTensor>& tensors,
    Fn fn,
    int dst_rank,
    CommType op_type,
    bool sync_op,
    bool use_calc_stream) {
  const auto& places = GetPlaceList(tensors);
  const auto& key = GetKeyFromPlaces(places);

  {
    std::lock_guard<std::mutex> lock(mutex_);
    if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) {
      CreateNCCLManagerCache(key, places);
    }
  }

  auto& nccl_comms = places_to_ncclcomm_[key];

  if (!use_calc_stream) {
    SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
  }

  auto task =
      CreateTask(places, rank_, op_type, tensors, sync_op, use_calc_stream);

  platform::CUDADeviceGuard cuda_guard;

  if (FLAGS_use_stream_safe_cuda_allocator) {
    for (size_t i = 0; i < tensors.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
      gpuStream_t nccl_stream;
      if (use_calc_stream) {
        nccl_stream =
            static_cast<phi::GPUContext*>(
                platform::DeviceContextPool::Instance().Get(places[i]))
                ->stream();
      } else {
        nccl_stream = places_to_ctx_[key][i]->stream();
      }
      memory::RecordStream(tensors[i].Holder(), nccl_stream);
    }
  }

  {
    platform::NCCLGroupGuard nccl_guard;
    for (size_t i = 0; i < tensors.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
      gpuStream_t nccl_stream;
      if (use_calc_stream) {
        nccl_stream =
            static_cast<phi::GPUContext*>(
                platform::DeviceContextPool::Instance().Get(places[i]))
                ->stream();
      } else {
        nccl_stream = places_to_ctx_[key][i]->stream();
      }
      fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
    }
  }

  if (!use_calc_stream) {
    for (size_t i = 0; i < tensors.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
      task->control_events_[i].Record(*places_to_ctx_[key][i]);
    }
  }

  return task;
}
```

@@ -617,6 +702,34 @@ — a Send overload with the new flags:

```cpp
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send(
    std::vector<phi::DenseTensor>& tensors,
    int dst_rank,
    bool sync_op,
    bool use_calc_stream) {
  CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));

  auto task = PointToPoint(
      tensors,
      [&](phi::DenseTensor& input,
          ncclComm_t comm,
          const gpuStream_t& stream,
          int dst_rank) {
        return platform::dynload::ncclSend(
            input.data(),
            input.numel(),
            platform::ToNCCLDataType(input.dtype()),
            dst_rank,
            comm,
            stream);
      },
      dst_rank,
      CommType::SEND,
      sync_op,
      use_calc_stream);

  return task;
}
```

@@ -640,6 +753,34 @@ — the matching Recv overload, mirroring Send with ncclRecv:

```cpp
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
    std::vector<phi::DenseTensor>& tensors,
    int src_rank,
    bool sync_op,
    bool use_calc_stream) {
  CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));

  auto task = PointToPoint(
      tensors,
      [&](phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream,
          int src_rank) {
        return platform::dynload::ncclRecv(
            output.data(),
            output.numel(),
            platform::ToNCCLDataType(output.dtype()),
            src_rank,
            comm,
            stream);
      },
      src_rank,
      CommType::RECV,
      sync_op,
      use_calc_stream);

  return task;
}
```

@@ -647,10 +788,8 @@ — Send_Partial now builds the sliced tensor vector in place:

```diff
   phi::DenseTensor flatten_tensor;
   flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});
-  phi::DenseTensor shared_input = flatten_tensor.Slice(offset, offset + length);

-  std::vector<phi::DenseTensor> shared_tensors;
-  shared_tensors.push_back(shared_input);
+  std::vector<phi::DenseTensor> shared_tensors{
+      flatten_tensor.Slice(offset, offset + length)};

   auto task = PointToPoint(
       shared_tensors,
```

@@ -671,16 +810,49 @@ — a Send_Partial overload with the new flags, applying the same ncclSend lambda over the slice:

```cpp
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send_Partial(
    phi::DenseTensor& tensors,
    int dst_rank,
    int offset,
    int length,
    bool sync_op,
    bool use_calc_stream) {
  phi::DenseTensor flatten_tensor;
  flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});

  std::vector<phi::DenseTensor> shared_tensors{
      flatten_tensor.Slice(offset, offset + length)};

  auto task = PointToPoint(
      shared_tensors,
      [&](phi::DenseTensor& input,
          ncclComm_t comm,
          const gpuStream_t& stream,
          int dst_rank) {
        return platform::dynload::ncclSend(
            input.data(),
            input.numel(),
            platform::ToNCCLDataType(input.dtype()),
            dst_rank,
            comm,
            stream);
      },
      dst_rank,
      CommType::SEND,
      sync_op,
      use_calc_stream);

  return task;
}
```

@@ -701,6 +873,40 @@ — Recv_Partial gets the same in-place vector construction and a matching overload with the new flags:

```cpp
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv_Partial(
    phi::DenseTensor& tensors,
    int src_rank,
    int offset,
    int length,
    bool sync_op,
    bool use_calc_stream) {
  phi::DenseTensor flatten_tensor;
  flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});

  std::vector<phi::DenseTensor> shared_tensors{
      flatten_tensor.Slice(offset, offset + length)};

  auto task = PointToPoint(
      shared_tensors,
      [&](phi::DenseTensor& output,
          ncclComm_t comm,
          const gpuStream_t& stream,
          int src_rank) {
        return platform::dynload::ncclRecv(
            output.data(),
            output.numel(),
            platform::ToNCCLDataType(output.dtype()),
            src_rank,
            comm,
            stream);
      },
      src_rank,
      CommType::RECV,
      sync_op,
      use_calc_stream);

  return task;
}
```
paddle/fluid/distributed/collective/ProcessGroupNCCL.h

@@ -60,7 +60,7 @@ class ProcessGroupNCCL : public ProcessGroupStream { — the NCCLTask constructor parameter is_sync is renamed to sync_op:

```diff
              int rank,
              CommType comm_type,
              const std::vector<phi::DenseTensor>& inputs,
-             bool is_sync,
+             bool sync_op,
              bool use_calc_stream);
```

@@ -122,19 +122,47 @@ — overload declarations of Send/Recv and Send_Partial/Recv_Partial taking the new flags:

```cpp
  std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>& tensors, int dst_rank) override;

  std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>& tensors,
      int dst_rank,
      bool sync_op,
      bool use_calc_stream) override;

  std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>& tensors, int src_rank) override;

  std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>& tensors,
      int src_rank,
      bool sync_op,
      bool use_calc_stream) override;

  std::shared_ptr<ProcessGroup::Task> Send_Partial(phi::DenseTensor& tensors,
                                                   int dst_rank,
                                                   int offset,
                                                   int length) override;

  std::shared_ptr<ProcessGroup::Task> Send_Partial(
      phi::DenseTensor& tensors,
      int dst_rank,
      int offset,
      int length,
      bool sync_op,
      bool use_calc_stream) override;

  std::shared_ptr<ProcessGroup::Task> Recv_Partial(phi::DenseTensor& tensors,
                                                   int src_rank,
                                                   int offset,
                                                   int length) override;

  std::shared_ptr<ProcessGroup::Task> Recv_Partial(
      phi::DenseTensor& tensors,
      int src_rank,
      int offset,
      int length,
      bool sync_op,
      bool use_calc_stream) override;

  std::shared_ptr<ProcessGroup::Task> AllGather(
      std::vector<phi::DenseTensor>& in_tensors,
      std::vector<phi::DenseTensor>& out_tensors) override;
```

@@ -180,9 +208,17 @@ — the CreateTask parameter opType is renamed to op_type, and the flag-taking overload is declared:

```cpp
  virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
      std::vector<Place> places,
      int rank,
      CommType op_type,
      const std::vector<phi::DenseTensor>& inputs);

  virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
      const std::vector<Place>& places,
      int rank,
      CommType op_type,
      const std::vector<phi::DenseTensor>& inputs,
      bool sync_op,
      bool use_calc_stream);
```

@@ -233,6 +269,15 @@ — the new PointToPoint template is declared:

```cpp
  template <typename Fn>
  std::shared_ptr<ProcessGroup::Task> PointToPoint(
      std::vector<phi::DenseTensor>& tensors,  // NOLINT
      Fn fn,
      int dst_rank,
      CommType op_type,
      bool sync_op,
      bool use_calc_stream);

  void CreateNCCLManagerCache(const std::string& places_key,
                              const std::vector<Place>& places);
```
paddle/fluid/distributed/collective/ProcessGroupStream.cc

@@ -45,5 +45,89 @@ — two-level entry points for send/recv and their partial variants: the sync_op overload forwards with use_calc_stream=false, and the full overload is the default-throwing hook that concrete backends override:

```cpp
std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send(
    std::vector<phi::DenseTensor>& tensors, int dst_rank, bool sync_op) {
  return Send(tensors, dst_rank, sync_op, /*use_calc_stream*/ false);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send(
    std::vector<phi::DenseTensor>& tensors,
    int dst_rank,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support do send", GetBackendName()));
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send_Partial(
    phi::DenseTensor& tensors,
    int dst_rank,
    int offset,
    int length,
    bool sync_op) {
  return Send_Partial(
      tensors, dst_rank, offset, length, sync_op, /*use_calc_stream*/ false);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send_Partial(
    phi::DenseTensor& tensors,
    int dst_rank,
    int offset,
    int length,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support do send_partial", GetBackendName()));
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv(
    std::vector<phi::DenseTensor>& tensors, int src_rank, bool sync_op) {
  return Recv(tensors, src_rank, sync_op, /*use_calc_stream*/ false);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv(
    std::vector<phi::DenseTensor>& tensors,
    int src_rank,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support do recv", GetBackendName()));
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv_Partial(
    phi::DenseTensor& tensors,
    int src_rank,
    int offset,
    int length,
    bool sync_op) {
  return Recv_Partial(
      tensors, src_rank, offset, length, sync_op, /*use_calc_stream*/ false);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv_Partial(
    phi::DenseTensor& tensors,
    int src_rank,
    int offset,
    int length,
    bool sync_op,
    bool use_calc_stream) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support do recv_partial", GetBackendName()));
}

}  // namespace distributed
}  // namespace paddle
```
paddle/fluid/distributed/collective/ProcessGroupStream.h

@@ -66,6 +66,58 @@ class ProcessGroupStream : public ProcessGroup { — the matching declarations: the sync_op overloads override ProcessGroup, and the flag-taking overloads are virtual extension points:

```cpp
  std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>& tensors,  // NOLINT
      int dst_rank,
      bool sync_op) override;

  virtual std::shared_ptr<ProcessGroup::Task> Send(
      std::vector<phi::DenseTensor>& tensors,  // NOLINT
      int dst_rank,
      bool sync_op,
      bool use_calc_stream);

  std::shared_ptr<ProcessGroup::Task> Send_Partial(
      phi::DenseTensor& tensors,  // NOLINT
      int dst_rank,
      int offset,
      int length,
      bool sync_op) override;

  virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
      phi::DenseTensor& tensors,  // NOLINT
      int dst_rank,
      int offset,
      int length,
      bool sync_op,
      bool use_calc_stream);

  std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>& tensors,  // NOLINT
      int src_rank,
      bool sync_op) override;

  virtual std::shared_ptr<ProcessGroup::Task> Recv(
      std::vector<phi::DenseTensor>& tensors,  // NOLINT
      int src_rank,
      bool sync_op,
      bool use_calc_stream);

  std::shared_ptr<ProcessGroup::Task> Recv_Partial(
      phi::DenseTensor& tensors,  // NOLINT
      int src_rank,
      int offset,
      int length,
      bool sync_op) override;

  virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
      phi::DenseTensor& tensors,  // NOLINT
      int src_rank,
      int offset,
      int length,
      bool sync_op,
      bool use_calc_stream);
};

}  // namespace distributed
```
paddle/fluid/pybind/distributed_py.cc

@@ -196,6 +196,23 @@ void BindDistributed(py::module *m) { — a "send" binding that exposes sync_op:

```cpp
          .def(
              "send",
              [](distributed::ProcessGroup &self,
                 py::handle py_tensor,
                 int dst,
                 bool sync_op) {
                auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
                auto dense =
                    std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
                std::vector<phi::DenseTensor> tensors = {*dense};
                return self.Send(tensors, dst, sync_op);
              },
              py::arg("tensor"),
              py::arg("dst"),
              py::arg("sync_op"),
              py::call_guard<py::gil_scoped_release>())
```

@@ -217,6 +234,30 @@ — a "send_partial" binding with sync_op; it flattens the tensor and sends the rank's segment:

```cpp
          .def(
              "send_partial",
              [](distributed::ProcessGroup &self,
                 py::handle py_tensor,
                 int dst_rank,
                 int nranks,
                 int rank_id,
                 bool sync_op) {
                auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
                auto dense =
                    std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
                int numel = (*dense).numel();
                int send_numel = numel / nranks;
                int offset = send_numel * rank_id;
                return self.Send_Partial(
                    *dense, dst_rank, offset, send_numel, sync_op);
              },
              py::arg("tensor"),
              py::arg("dst"),
              py::arg("num"),
              py::arg("id"),
              py::arg("sync_op"),
              py::call_guard<py::gil_scoped_release>())
```

@@ -232,6 +273,23 @@ and @@ -253,6 +311,30 @@ — "recv" and "recv_partial" bindings with sync_op mirror the two blocks above, with py::arg("src")/src_rank, recv_numel, and self.Recv/self.Recv_Partial in place of the send counterparts.

@@ -427,6 +509,94 @@ — on the ProcessGroupStream binding, dedicated *_on_calc_stream methods that hard-code sync_op=true and use_calc_stream=true:

```cpp
          .def(
              "send_on_calc_stream",
              [](distributed::ProcessGroupStream &self,
                 py::handle py_tensor,
                 int dst) {
                auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
                auto dense =
                    std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
                std::vector<phi::DenseTensor> tensors = {*dense};
                return self.Send(tensors,
                                 dst,
                                 /*sync_op*/ true,
                                 /*use_calc_stream*/ true);
              },
              py::arg("tensor"),
              py::arg("dst"),
              py::call_guard<py::gil_scoped_release>())
          .def(
              "send_partial_on_calc_stream",
              [](distributed::ProcessGroupStream &self,
                 py::handle py_tensor,
                 int dst_rank,
                 int nranks,
                 int rank_id) {
                auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
                auto dense =
                    std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
                int numel = (*dense).numel();
                int send_numel = numel / nranks;
                int offset = send_numel * rank_id;
                return self.Send_Partial(*dense,
                                         dst_rank,
                                         offset,
                                         send_numel,
                                         /*sync_op*/ true,
                                         /*use_calc_stream*/ true);
              },
              py::arg("tensor"),
              py::arg("dst"),
              py::arg("num"),
              py::arg("id"),
              py::call_guard<py::gil_scoped_release>())
```

"recv_on_calc_stream" and "recv_partial_on_calc_stream" follow the same pattern with py::arg("src")/src_rank, recv_numel, and self.Recv/self.Recv_Partial.

#if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
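The partial bindings above slice a flattened tensor by rank: each rank's segment has numel / nranks elements and starts at that length times rank_id. A small Python sketch of the same arithmetic (the helper name is hypothetical):

```python
def partial_segment(numel: int, nranks: int, rank_id: int):
    """Mirror the offset/length math in the send_partial/recv_partial
    bindings: integer division, so any remainder elements are not covered."""
    seg_numel = numel // nranks   # numel / nranks in the C++ binding
    offset = seg_numel * rank_id
    return offset, seg_numel

# A 12-element tensor split across 3 ranks:
for r in range(3):
    print(r, partial_segment(12, 3, r))  # (0, 4), (4, 4), (8, 4)
```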
python/paddle/distributed/communication/stream/__init__.py

@@ -13,5 +13,7 @@ — the new stream APIs are exported:

```python
# limitations under the License.

from .all_reduce import all_reduce
from .send import send
from .recv import recv

__all__ = ["all_reduce", "send", "recv"]
```
python/paddle/distributed/communication/stream/all_reduce.py

@@ -12,13 +12,13 @@ and @@ -30,7 +30,7 @@ — the module now imports paddle.distributed.collective as a whole instead of individual private names, so the helpers are reached through the module object:

```python
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.distributed.collective as collective
import paddle.fluid.framework as framework


def _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream):
    op_type = collective._get_reduce_op(op, "all_reduce")
    group = collective._get_default_group() if group is None else group

    if use_calc_stream:
        return group.process_group.allreduce_on_calc_stream(tensor, op_type)
```

```python
def all_reduce(tensor,
               op=collective.ReduceOp.SUM,
               group=None,
               sync_op=True,
               use_calc_stream=False):
```
python/paddle/distributed/communication/stream/recv.py (new file, mode 100644)

```python
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.distributed.collective as collective
import paddle.fluid.framework as framework


def _recv_in_dygraph(tensor, src, group, sync_op, use_calc_stream):
    group = collective._get_default_group() if group is None else group
    if use_calc_stream:
        return group.process_group.recv_on_calc_stream(tensor, src)

    task = group.process_group.recv(tensor, src, sync_op)
    if sync_op:
        task.wait()

    return task


def recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False):
    """

    Receive a tensor from the source device.

    Args:
        tensor (Tensor): The tensor to receive. Support float16, float32, float64, int32, int64, int8, uint8 or bool as its data type.
        src (int, optional): Rank of the source device. If none is given, use 0 as default.
        group (Group, optional): The group to communicate in. If none is given, use the global group as default.
        sync_op (bool, optional): Whether the communication is synchronous. If none is given, use true as default.
        use_calc_stream (bool, optional): Whether the communication runs on the calculation stream. If none is given, use false as default. This
            option is designed for high-performance scenarios; do not turn it on unless you clearly understand its meaning.

    Returns:
        Return a task object.

    Warning:
        This API only supports the dygraph mode now.

    Examples:
        .. code-block:: python

            # required: distributed
            import paddle
            import paddle.distributed as dist

            dist.init_parallel_env()
            local_rank = dist.get_rank()
            if local_rank == 0:
                data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]])
                task = dist.stream.send(data, dst=1, sync_op=False)
            else:
                data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]])
                task = dist.stream.recv(data, src=0, sync_op=False)
            task.wait()
            out = data.numpy()
            # [[4, 5, 6], [4, 5, 6]]
    """
    if group is not None and not group.is_member():
        raise RuntimeError(
            "The group should not be None and all ranks which invoke this operation should be the member of this group."
        )
    if not sync_op and use_calc_stream:
        raise RuntimeError(
            "use_calc_stream can only be True in sync op behavior.")

    if framework.in_dygraph_mode():
        return _recv_in_dygraph(tensor, src, group, sync_op, use_calc_stream)

    raise RuntimeError(
        "paddle.distributed.stream.recv is only supported in dygraph mode now."
    )
```
python/paddle/distributed/communication/stream/send.py (new file, mode 100644)

```python
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.distributed.collective as collective
import paddle.fluid.framework as framework


def _send_in_dygraph(tensor, dst, group, sync_op, use_calc_stream):
    group = collective._get_default_group() if group is None else group
    if use_calc_stream:
        return group.process_group.send_on_calc_stream(tensor, dst)

    task = group.process_group.send(tensor, dst, sync_op)
    if sync_op:
        task.wait()

    return task


def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False):
    """

    Send a tensor to the destination device.

    Args:
        tensor (Tensor): The tensor to send. Support float16, float32, float64, int32, int64, int8, uint8 or bool as its data type.
        dst (int, optional): Rank of the destination device. If none is given, use 0 as default.
        group (Group, optional): The group to communicate in. If none is given, use the global group as default.
        sync_op (bool, optional): Whether the communication is synchronous. If none is given, use true as default.
        use_calc_stream (bool, optional): Whether the communication runs on the calculation stream. If none is given, use false as default. This
            option is designed for high-performance scenarios; do not turn it on unless you clearly understand its meaning.

    Returns:
        Return a task object.

    Warning:
        This API only supports the dygraph mode now.

    Examples:
        .. code-block:: python

            # required: distributed
            import paddle
            import paddle.distributed as dist

            dist.init_parallel_env()
            local_rank = dist.get_rank()
            if local_rank == 0:
                data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]])
                task = dist.stream.send(data, dst=1, sync_op=False)
            else:
                data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]])
                task = dist.stream.recv(data, src=0, sync_op=False)
            task.wait()
            out = data.numpy()
            # [[4, 5, 6], [4, 5, 6]]
    """
    if group is not None and not group.is_member():
        raise RuntimeError(
            "The group should not be None and all ranks which invoke this operation should be the member of this group."
        )
    if not sync_op and use_calc_stream:
        raise RuntimeError(
            "use_calc_stream can only be True in sync op behavior.")

    if framework.in_dygraph_mode():
        return _send_in_dygraph(tensor, dst, group, sync_op, use_calc_stream)

    raise RuntimeError(
        "paddle.distributed.stream.send is only supported in dygraph mode now."
    )
```
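Both wrappers enforce the same flag constraint: use_calc_stream=True is only valid together with sync_op=True, since an op placed on the calculation stream has no separate task to wait on. A short sketch of the combinations, based on the checks in the two files above (assumes a running two-rank dygraph environment):

```python
# Valid flag combinations for dist.stream.send / dist.stream.recv:
#   sync_op=True,  use_calc_stream=False -> blocking op on the comm stream
#   sync_op=False, use_calc_stream=False -> async op; caller waits on the task
#   sync_op=True,  use_calc_stream=True  -> synchronous op on the calc stream
#   sync_op=False, use_calc_stream=True  -> rejected
import paddle
import paddle.distributed as dist

dist.init_parallel_env()
tensor = paddle.to_tensor([1, 2, 3])
try:
    # invalid: an async op cannot run on the calculation stream
    dist.stream.send(tensor, dst=1, sync_op=False, use_calc_stream=True)
except RuntimeError as e:
    print(e)  # "use_calc_stream can only be True in sync op behavior."
```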
python/paddle/fluid/tests/unittests/collective/CMakeLists.txt

@@ -268,17 +268,26 @@ — the sendrecv test target is added and the stream-API test blocks are reordered; the resulting blocks:

```cmake
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
  py_test_modules(
    test_communication_stream_allreduce_api MODULES
    test_communication_stream_allreduce_api ENVS
    "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=")
  set_tests_properties(test_communication_stream_allreduce_api
                       PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST")
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
  py_test_modules(
    test_communication_stream_sendrecv_api MODULES
    test_communication_stream_sendrecv_api ENVS
    "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=")
  set_tests_properties(test_communication_stream_sendrecv_api
                       PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST")
endif()
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
  py_test_modules(
    test_eager_dist_api MODULES test_eager_dist_api ENVS
    "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
  set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT "120" LABELS
                                                      "RUN_TYPE=DIST")
endif()
```

@@ -298,11 +307,10 @@ — the test_new_group_api block takes the slot where the allreduce block used to sit:

```cmake
if((WITH_GPU OR WITH_ROCM) AND (LINUX))
  py_test_modules(
    test_new_group_api MODULES test_new_group_api ENVS
    "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
  set_tests_properties(test_new_group_api PROPERTIES TIMEOUT "120" LABELS
                                                     "RUN_TYPE=DIST")
endif()
if((WITH_ROCM OR WITH_GPU) AND (LINUX))
  bash_test_modules(
```
python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py (new file, mode 100644)

```python
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import numpy as np
import paddle
import paddle.distributed as dist
import paddle.fluid as fluid
import test_collective_api_base as test_collective_base
import test_communication_api_base as test_base


class StreamSendRecvTestCase():

    def __init__(self):
        self._sync_op = eval(os.getenv("sync_op"))
        self._use_calc_stream = eval(os.getenv("use_calc_stream"))
        self._backend = os.getenv("backend")
        self._shape = eval(os.getenv("shape"))
        self._dtype = os.getenv("dtype")
        self._seeds = eval(os.getenv("seeds"))
        if self._backend not in ["nccl", "gloo"]:
            raise NotImplementedError(
                "Only support nccl and gloo as the backend for now.")
        os.environ["PADDLE_DISTRI_BACKEND"] = self._backend

    def run_test_case(self):
        dist.init_parallel_env()

        test_data_list = []
        for seed in self._seeds:
            test_data_list.append(
                test_collective_base.create_test_data(shape=self._shape,
                                                      dtype=self._dtype,
                                                      seed=seed))

        rank = dist.get_rank()
        tensor = paddle.to_tensor(test_data_list[rank])
        if rank == 0:
            task = dist.stream.send(tensor,
                                    dst=1,
                                    sync_op=self._sync_op,
                                    use_calc_stream=self._use_calc_stream)
        else:
            task = dist.stream.recv(tensor,
                                    src=0,
                                    sync_op=self._sync_op,
                                    use_calc_stream=self._use_calc_stream)
        if not self._sync_op:
            task.wait()

        result = test_data_list[0]
        assert np.allclose(tensor, result, rtol=1e-05, atol=1e-05)


if __name__ == "__main__":
    StreamSendRecvTestCase().run_test_case()
```
python/paddle/fluid/tests/unittests/collective/test_communication_stream_sendrecv_api.py (new file, mode 100644)

```python
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import paddle
import test_communication_api_base as test_base


class TestCommunicationStreamSendRecvAPI(test_base.CommunicationTestDistBase):

    def setUp(self):
        super(TestCommunicationStreamSendRecvAPI, self).setUp(num_of_devices=2,
                                                              timeout=120)
        self._default_envs = {
            "backend": "nccl",
            "shape": "(100, 200)",
            "dtype": "float32",
            "seeds": str(self._seeds)
        }
        self._changeable_envs = {
            "sync_op": ["True", "False"],
            "use_calc_stream": ["True", "False"]
        }

    def test_sendrecv_stream(self):
        envs_list = test_base.gen_product_envs_list(self._default_envs,
                                                    self._changeable_envs)
        for envs in envs_list:
            # use_calc_stream=True is only valid together with sync_op=True
            if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]):
                continue
            self.run_test_case("communication_stream_sendrecv_api_dygraph.py",
                               user_defined_envs=envs)

    def tearDown(self):
        super(TestCommunicationStreamSendRecvAPI, self).tearDown()


if __name__ == '__main__':
    unittest.main()
```
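The driver passes its parameters to the dygraph script purely through environment variables. A hypothetical manual run of a single combination, bypassing the unittest driver (the seeds value and launcher invocation are assumptions):

```python
# Launch the dygraph script for one flag combination by hand.
import os
import subprocess

env = dict(os.environ,
           backend="nccl",
           shape="(100, 200)",
           dtype="float32",
           seeds="[2022, 2023]",        # placeholder; the driver uses its own seeds
           sync_op="True",
           use_calc_stream="False")
subprocess.check_call(
    ["python", "-m", "paddle.distributed.launch", "--devices", "0,1",
     "communication_stream_sendrecv_api_dygraph.py"],
    env=env)
```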
python/paddle/fluid/tests/unittests/collective/testslist.csv

@@ -32,8 +32,9 @@ — the sendrecv row is added and the stream/eager/new_group rows are re-sorted alphabetically; the resulting rows:

```csv
test_collective_split_embedding_none_divisible,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=..,
test_collective_split_row_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=..,
test_collective_wait,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=..,
test_communication_stream_allreduce_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=,
test_communication_stream_sendrecv_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=,
test_eager_dist_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=..,
test_new_group_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=..,
test_gen_nccl_id_op,,gpu;rocm;ASCEND;ASCEND_CL,,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=..,
test_world_size_and_rank,linux,rocm;gpu,120,DIST,test_world_size_and_rank.sh,2,,http_proxy=;https_proxy=,
```