BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)

Commit edda13cd (unverified), authored on Nov 18, 2022 by Wen Sun; committed via GitHub on Nov 18, 2022.
Refactor collective communication reduce, scatter, reduce_scatter C++ API (#48115)
Parent: d7f7963f

Showing 11 changed files with 432 additions and 446 deletions (+432 −446):
- paddle/fluid/distributed/collective/NCCLTools.h (+2 −1)
- paddle/fluid/distributed/collective/ProcessGroup.h (+30 −30)
- paddle/fluid/distributed/collective/ProcessGroupGloo.cc (+36 −26)
- paddle/fluid/distributed/collective/ProcessGroupGloo.h (+10 −12)
- paddle/fluid/distributed/collective/ProcessGroupNCCL.cc (+125 −144)
- paddle/fluid/distributed/collective/ProcessGroupNCCL.h (+19 −21)
- paddle/fluid/distributed/collective/ProcessGroupStream.cc (+66 −66)
- paddle/fluid/distributed/collective/ProcessGroupStream.h (+37 −39)
- paddle/fluid/pybind/distributed_py.cc (+99 −99)
- python/paddle/distributed/communication/stream/reduce_scatter.py (+4 −4)
- python/paddle/distributed/communication/stream/scatter.py (+4 −4)
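Taken together, the diffs below implement one signature migration: the `Reduce`, `Scatter`, and `ReduceScatter` entry points move from pairs of `std::vector<phi::DenseTensor>` arguments to a single output-tensor pointer plus a const input-tensor reference, with an explicit `sync_op` flag (and, on the stream path, `use_calc_stream`). The legacy vector overloads are kept only as thin forwards or deleted outright. A minimal sketch of the shape of the change; `DenseTensor`, `ReduceOptions`, and `Task` here are simplified stand-ins, not the real Paddle classes:

```cpp
#include <memory>
#include <vector>

struct DenseTensor {};    // stand-in for phi::DenseTensor
struct ReduceOptions {};  // stand-in for distributed::ReduceOptions
struct Task {};           // stand-in for ProcessGroup::Task

class ProcessGroup {
 public:
  virtual ~ProcessGroup() = default;

  // Old style, kept only for compatibility: whole tensor vectors.
  virtual std::shared_ptr<Task> Reduce(std::vector<DenseTensor>& inputs,
                                       std::vector<DenseTensor>& outputs,
                                       const ReduceOptions& opts) {
    // After the refactor, legacy overloads forward to the single-tensor form.
    return Reduce(&outputs[0], inputs[0], opts, /*sync_op=*/true);
  }

  // New style: one output pointer, one const input reference, explicit flag.
  virtual std::shared_ptr<Task> Reduce(DenseTensor* out_tensor,
                                       const DenseTensor& in_tensor,
                                       const ReduceOptions& opts,
                                       bool sync_op) {
    return std::make_shared<Task>();  // real backends launch the collective
  }
};
```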
paddle/fluid/distributed/collective/NCCLTools.h

```diff
@@ -47,7 +47,7 @@
 namespace paddle {
 namespace distributed {
 
-#define NCCLCHECK(cmd)                                  \
+#define NCCL_CHECK(cmd)                                 \
   do {                                                  \
     ncclResult_t r = cmd;                               \
     if (r != ncclSuccess) {                             \
@@ -60,6 +60,7 @@ namespace distributed {
   } while (0)
 
+ncclRedOp_t ToNCCLRedType(ReduceOp reduction);
 std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID);
 
 }  // namespace distributed
```
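The functional change in this header is the rename from `NCCLCHECK` to `NCCL_CHECK` (call sites in ProcessGroupNCCL.cc are updated to match) plus the newly exposed `ToNCCLRedType` declaration. For orientation, a check macro of this shape typically expands as sketched below; this is a generic stand-in with hypothetical NCCL type aliases, not Paddle's exact definition, which reports failures through Paddle's own error machinery:

```cpp
#include <cstdio>
#include <cstdlib>

// Hypothetical stand-ins so the sketch is self-contained.
typedef int ncclResult_t;
const ncclResult_t ncclSuccess = 0;

// Same shape as NCCL_CHECK: evaluate the call once, fail loudly on error.
#define NCCL_CHECK(cmd)                                  \
  do {                                                   \
    ncclResult_t r = (cmd);                              \
    if (r != ncclSuccess) {                              \
      std::fprintf(stderr, "NCCL error %d at %s:%d\n",   \
                   r, __FILE__, __LINE__);               \
      std::exit(EXIT_FAILURE);                           \
    }                                                    \
  } while (0)
```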
paddle/fluid/distributed/collective/ProcessGroup.h

```diff
@@ -150,6 +150,36 @@ class ProcessGroup {
         GetBackendName()));
   }
 
+  virtual std::shared_ptr<ProcessGroup::Task> Reduce(
+      phi::DenseTensor* out_tensor,
+      const phi::DenseTensor& in_tensor,
+      const ReduceOptions& opts,
+      bool sync_op) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "ProcessGroup%s does not support reduce with sync_op flag.",
+        GetBackendName()));
+  }
+
+  virtual std::shared_ptr<ProcessGroup::Task> ReduceScatter(
+      phi::DenseTensor* out_tensor,
+      const phi::DenseTensor& in_tensor,
+      const ReduceScatterOptions& opts,
+      bool sync_op) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "ProcessGroup%s does not support reduce_scatter with sync_op flag.",
+        GetBackendName()));
+  }
+
+  virtual std::shared_ptr<ProcessGroup::Task> Scatter(
+      phi::DenseTensor* out_tensor,
+      const phi::DenseTensor& in_tensor,
+      const ScatterOptions& opts,
+      bool sync_op) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "ProcessGroup%s does not support scatter with sync_op flag.",
+        GetBackendName()));
+  }
+
   virtual std::shared_ptr<ProcessGroup::Task> Recv(
       phi::DenseTensor* tensor,
       int src_rank,
       int64_t offset,
@@ -273,16 +303,6 @@ class ProcessGroup {
         "ProcessGroup%s does not support reduce",
         GetBackendName()));
   }
 
-  virtual std::shared_ptr<ProcessGroup::Task> Reduce(
-      std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
-      std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
-      const ReduceOptions&,
-      bool) {
-    PADDLE_THROW(platform::errors::InvalidArgument(
-        "ProcessGroup%s does not support reduce with sync_op flag",
-        GetBackendName()));
-  }
-
   virtual std::shared_ptr<ProcessGroup::Task> Scatter(
       std::vector<phi::DenseTensor>&,  // NOLINT
       std::vector<phi::DenseTensor>&,  // NOLINT
@@ -291,26 +311,6 @@ class ProcessGroup {
         "ProcessGroup%s does not support scatter",
         GetBackendName()));
   }
 
-  virtual std::shared_ptr<ProcessGroup::Task> Scatter(
-      std::vector<phi::DenseTensor>&,  // NOLINT
-      std::vector<phi::DenseTensor>&,  // NOLINT
-      const ScatterOptions&,
-      bool) {
-    PADDLE_THROW(platform::errors::InvalidArgument(
-        "ProcessGroup%s does not support scatter with sync_op flag",
-        GetBackendName()));
-  }
-
-  virtual std::shared_ptr<ProcessGroup::Task> ReduceScatter(
-      std::vector<phi::DenseTensor>&,  // NOLINT
-      std::vector<phi::DenseTensor>&,  // NOLINT
-      const ReduceScatterOptions&,
-      bool) {
-    PADDLE_THROW(platform::errors::InvalidArgument(
-        "ProcessGroup%s does not support reduce_scatter with sync_op flag",
-        GetBackendName()));
-  }
-
  protected:
   const int rank_;
   const int size_;
```
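Two details worth noting in this header: the new single-tensor overloads throw `Unimplemented` by default, so a backend opts in to each collective by overriding the corresponding virtual, whereas the legacy vector overloads being deleted had thrown `InvalidArgument`. The `// NOLINT` markers on the remaining vector overloads suppress lint complaints about non-const reference parameters.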
paddle/fluid/distributed/collective/ProcessGroupGloo.cc

```diff
@@ -234,8 +234,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Broadcast(
     const phi::DenseTensor& in_tensor,
     const BroadcastOptions& opts,
     bool sync_op) {
-  std::vector<phi::DenseTensor> in_wrapper = {in_tensor};
-  std::vector<phi::DenseTensor> out_wrapper = {*out_tensor};
+  std::vector<phi::DenseTensor> in_wrapper{in_tensor};
+  std::vector<phi::DenseTensor> out_wrapper{*out_tensor};
   return Broadcast(in_wrapper, out_wrapper, opts, true);
 }
@@ -396,8 +396,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::AllGather(
     int64_t offset,  // for compatibility, no use now
     int64_t numel,   // for compatibility, no use now
     bool sync_op) {
-  std::vector<phi::DenseTensor> in_wrapper = {in_tensor};
-  std::vector<phi::DenseTensor> out_wrapper = {*out_tensor};
+  std::vector<phi::DenseTensor> in_wrapper{in_tensor};
+  std::vector<phi::DenseTensor> out_wrapper{*out_tensor};
   return AllGather(in_wrapper, out_wrapper, true);
 }
@@ -475,26 +475,34 @@ class ReduceGlooTask : public ProcessGroupGloo::GlooTask {
 };
 
 std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Reduce(
-    std::vector<phi::DenseTensor>& inputs,
-    std::vector<phi::DenseTensor>& outputs,
-    const ReduceOptions& opts) {
-  return Reduce(inputs, outputs, opts, true);
-}
-
-std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Reduce(
-    std::vector<phi::DenseTensor>& inputs,
-    std::vector<phi::DenseTensor>& outputs,
+    phi::DenseTensor* out_tensor,
+    const phi::DenseTensor& in_tensor,
     const ReduceOptions& opts,
-    bool sync_op) {
+    bool sync_op  // for compatibility, no use now
+) {
   std::shared_ptr<ReduceGlooTask> task;
   auto tag = next_tag();
   auto context = get_context();
-  task = std::make_shared<ReduceGlooTask>(
-      rank_, context, inputs, outputs, opts.reduce_op, opts.root_rank, tag);
+  std::vector<phi::DenseTensor> in_wrapper{in_tensor};
+  std::vector<phi::DenseTensor> out_wrapper{*out_tensor};
+  task = std::make_shared<ReduceGlooTask>(rank_,
+                                          context,
+                                          in_wrapper,
+                                          out_wrapper,
+                                          opts.reduce_op,
+                                          opts.root_rank,
+                                          tag);
   task->Run();
   return task;
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Reduce(
+    std::vector<phi::DenseTensor>& inputs,
+    std::vector<phi::DenseTensor>& outputs,
+    const ReduceOptions& opts) {
+  return Reduce(&outputs[0], inputs[0], opts, true);
+}
+
 class ScatterGlooTask : public ProcessGroupGloo::GlooTask {
  public:
   ScatterGlooTask(int rank,
@@ -538,26 +546,28 @@ class ScatterGlooTask : public ProcessGroupGloo::GlooTask {
 };
 
 std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Scatter(
-    std::vector<phi::DenseTensor>& in_tensors,
-    std::vector<phi::DenseTensor>& out_tensors,
-    const ScatterOptions& opts) {
-  return Scatter(in_tensors, out_tensors, opts, true);
-}
-
-std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Scatter(
-    std::vector<phi::DenseTensor>& in_tensors,
-    std::vector<phi::DenseTensor>& out_tensors,
+    phi::DenseTensor* out_tensor,
+    const phi::DenseTensor& in_tensor,
     const ScatterOptions& opts,
     bool sync_op) {
   std::shared_ptr<ScatterGlooTask> task;
   auto tag = next_tag();
   auto context = get_context();
+  std::vector<phi::DenseTensor> in_wrapper{in_tensor};
+  std::vector<phi::DenseTensor> out_wrapper{*out_tensor};
   task = std::make_shared<ScatterGlooTask>(
-      rank_, context, in_tensors, out_tensors, opts.root_rank, size_, tag);
+      rank_, context, in_wrapper, out_wrapper, opts.root_rank, size_, tag);
   task->Run();
   return task;
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupGloo::Scatter(
+    std::vector<phi::DenseTensor>& in_tensors,
+    std::vector<phi::DenseTensor>& out_tensors,
+    const ScatterOptions& opts) {
+  return Scatter(&out_tensors[0], in_tensors[0], opts, true);
+}
+
 std::shared_ptr<::gloo::transport::Device>
 ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) {
   ::gloo::transport::tcp::attr attr;
```
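Both Gloo collectives follow the same recipe: the single-tensor overload becomes the real implementation, wrapping its arguments into one-element vectors (`in_wrapper{in_tensor}`, `out_wrapper{*out_tensor}`) so the existing GlooTask machinery is untouched, while the legacy vector overload shrinks to a one-line forward through `&outputs[0]` / `inputs[0]`. The wrapper initialization in `Broadcast` and `AllGather` also switches from `= {...}` copy-list-initialization to direct brace initialization, a stylistic cleanup with the same semantics here.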
paddle/fluid/distributed/collective/ProcessGroupGloo.h

```diff
@@ -120,6 +120,16 @@ class ProcessGroupGloo : public ProcessGroup {
       const BroadcastOptions& opts,
       bool sync_op) override;
 
+  std::shared_ptr<ProcessGroup::Task> Reduce(phi::DenseTensor* out_tensor,
+                                             const phi::DenseTensor& in_tensor,
+                                             const ReduceOptions& opts,
+                                             bool sync_op) override;
+
+  std::shared_ptr<ProcessGroup::Task> Scatter(phi::DenseTensor* out_tensor,
+                                              const phi::DenseTensor& in_tensor,
+                                              const ScatterOptions& opts,
+                                              bool sync_op) override;
+
   // TODO(sunyilun): methods below will be removed later
   std::shared_ptr<ProcessGroup::Task> Broadcast(
       std::vector<phi::DenseTensor>& inputs,
@@ -155,23 +165,11 @@ class ProcessGroupGloo : public ProcessGroup {
       std::vector<phi::DenseTensor>& out_tensors,
       bool sync_op) override;
 
-  std::shared_ptr<ProcessGroup::Task> Reduce(
-      std::vector<phi::DenseTensor>& in_tensors,
-      std::vector<phi::DenseTensor>& out_tensors,
-      const ReduceOptions& opts,
-      bool sync_op) override;
-
   std::shared_ptr<ProcessGroup::Task> Reduce(
       std::vector<phi::DenseTensor>& in_tensors,
       std::vector<phi::DenseTensor>& out_tensors,
       const ReduceOptions& opts) override;
 
-  std::shared_ptr<ProcessGroup::Task> Scatter(
-      std::vector<phi::DenseTensor>& in_tensors,
-      std::vector<phi::DenseTensor>& out_tensors,
-      const ScatterOptions&,
-      bool sync_op) override;
-
   std::shared_ptr<ProcessGroup::Task> Scatter(
       std::vector<phi::DenseTensor>& in_tensors,
       std::vector<phi::DenseTensor>& out_tensors,
```
paddle/fluid/distributed/collective/ProcessGroupNCCL.cc

```diff
@@ -87,11 +87,11 @@ ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr<Store>& store,
     : ProcessGroupStream(rank, size, gid), store_(store) {}
 
 void ProcessGroupNCCL::GroupStart() {
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
+  NCCL_CHECK(platform::dynload::ncclGroupStart());
 }
 
 void ProcessGroupNCCL::GroupEnd() {
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
+  NCCL_CHECK(platform::dynload::ncclGroupEnd());
 }
 
 const phi::DeviceContext& ProcessGroupNCCL::GetDeviceContext(
@@ -144,13 +144,13 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather(
           const phi::DenseTensor& input,
           ncclComm_t comm,
           gpuStream_t stream) {
-        return platform::dynload::ncclAllGather(
+        NCCL_CHECK(platform::dynload::ncclAllGather(
             input.data(),
             output->data(),
             input.numel(),
             platform::ToNCCLDataType(input.dtype()),
             comm,
-            stream);
+            stream));
       },
       CommType::ALLGATHER,
       sync_op,
@@ -170,14 +170,14 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllReduce(
           const phi::DenseTensor& input,
           ncclComm_t comm,
           gpuStream_t stream) {
-        return platform::dynload::ncclAllReduce(
+        NCCL_CHECK(platform::dynload::ncclAllReduce(
            input.data(),
            output->data(),
             input.numel(),
             platform::ToNCCLDataType(input.type()),
             ToNCCLRedType(opts.reduce_op),
             comm,
-            stream);
+            stream));
       },
       CommType::ALLREDUCE,
       sync_op,
@@ -231,7 +231,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll(
         for (auto i = 0; i < size_; i++) {
           in_numel = in_size_each_rank[i] * in_row_size;
           input_partial = GetPartialTensor(input, in_offset, in_numel);
-          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
+          NCCL_CHECK(platform::dynload::ncclSend(
              input_partial.data(),
              in_numel,
              platform::ToNCCLDataType(input.dtype()),
@@ -242,7 +242,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll(
           out_numel = out_size_each_rank[i] * out_row_size;
           output_partial = GetPartialTensor(*output, out_offset, out_numel);
-          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
+          NCCL_CHECK(platform::dynload::ncclRecv(
              output_partial.data(),
              out_numel,
              platform::ToNCCLDataType(output->dtype()),
@@ -294,20 +294,127 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Broadcast(
           ncclComm_t comm,
           gpuStream_t stream) {
         int root = opts.source_rank + opts.source_root;
-        return platform::dynload::ncclBroadcast(
+        NCCL_CHECK(platform::dynload::ncclBroadcast(
             input.data(),
             output->data(),
             input.numel(),
             platform::ToNCCLDataType(input.type()),
             root,
             comm,
-            stream);
+            stream));
       },
       CommType::BROADCAST,
       sync_op,
       use_calc_stream);
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Reduce(
+    phi::DenseTensor* out_tensor,
+    const phi::DenseTensor& in_tensor,
+    const ReduceOptions& opts,
+    bool sync_op,
+    bool use_calc_stream) {
+  return Collective(
+      out_tensor,
+      in_tensor,
+      [&](phi::DenseTensor* output,
+          const phi::DenseTensor& input,
+          ncclComm_t comm,
+          gpuStream_t stream) {
+        NCCL_CHECK(platform::dynload::ncclReduce(
+            input.data(),
+            output->data(),
+            input.numel(),
+            platform::ToNCCLDataType(input.dtype()),
+            ToNCCLRedType(opts.reduce_op),
+            opts.root_rank,
+            comm,
+            stream));
+      },
+      CommType::REDUCE,
+      sync_op,
+      use_calc_stream);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::ReduceScatter(
+    phi::DenseTensor* out_tensor,
+    const phi::DenseTensor& in_tensor,
+    const ReduceScatterOptions& opts,
+    bool sync_op,
+    bool use_calc_stream) {
+  return Collective(
+      out_tensor,
+      in_tensor,
+      [&](phi::DenseTensor* output,
+          const phi::DenseTensor& input,
+          ncclComm_t comm,
+          gpuStream_t stream) {
+        NCCL_CHECK(platform::dynload::ncclReduceScatter(
+            input.data(),
+            output->data(),
+            output->numel(),
+            platform::ToNCCLDataType(input.dtype()),
+            ToNCCLRedType(opts.reduce_op),
+            comm,
+            stream));
+      },
+      CommType::REDUCE_SCATTER,
+      sync_op,
+      use_calc_stream);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Scatter(
+    phi::DenseTensor* out_tensor,
+    const phi::DenseTensor& in_tensor,
+    const ScatterOptions& opts,
+    bool sync_op,
+    bool use_calc_stream) {
+  return Collective(
+      out_tensor,
+      in_tensor,
+      [&](phi::DenseTensor* output,
+          const phi::DenseTensor& input,
+          ncclComm_t comm,
+          gpuStream_t stream) {
+        int64_t numel = input.numel() / size_;
+        if (rank_ == opts.root_rank) {
+          int64_t offset = 0;
+          phi::DenseTensor partial_tensor;
+          GroupStart();
+          for (auto i = 0; i < size_; i++) {
+            partial_tensor = GetPartialTensor(input, offset, numel);
+            NCCL_CHECK(platform::dynload::ncclSend(
+                partial_tensor.data(),
+                numel,
+                platform::ToNCCLDataType(input.dtype()),
+                i,
+                comm,
+                stream));
+            offset += numel;
+          }
+          NCCL_CHECK(platform::dynload::ncclRecv(
+              output->data(),
+              numel,
+              platform::ToNCCLDataType(output->dtype()),
+              opts.root_rank,
+              comm,
+              stream));
+          GroupEnd();
+        } else {
+          NCCL_CHECK(platform::dynload::ncclRecv(
+              output->data(),
+              numel,
+              platform::ToNCCLDataType(output->dtype()),
+              opts.root_rank,
+              comm,
+              stream));
+        }
+      },
+      CommType::SCATTER,
+      sync_op,
+      use_calc_stream);
+}
+
 std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
     phi::DenseTensor* tensor,
     int src_rank,
@@ -328,13 +435,13 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
           int src,
           ncclComm_t comm,
           gpuStream_t stream) {
-        return platform::dynload::ncclRecv(
+        NCCL_CHECK(platform::dynload::ncclRecv(
             output->data(),
             output->numel(),
             platform::ToNCCLDataType(output->dtype()),
             src,
             comm,
-            stream);
+            stream));
       },
       CommType::RECV,
       sync_op,
@@ -361,13 +468,13 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send(
           int dst,
           ncclComm_t comm,
           gpuStream_t stream) {
-        return platform::dynload::ncclSend(
+        NCCL_CHECK(platform::dynload::ncclSend(
             input->data(),
             input->numel(),
             platform::ToNCCLDataType(input->dtype()),
             dst,
             comm,
-            stream);
+            stream));
       },
       CommType::SEND,
       sync_op,
@@ -406,7 +513,7 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place,
   ncclUniqueId nccl_id;
   if (rank_ == 0) {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id));
+    NCCL_CHECK(platform::dynload::ncclGetUniqueId(&nccl_id));
   }
   BroadcastUniqueNCCLID(&nccl_id);
@@ -418,7 +525,7 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place,
       platform::DeviceContextPool::Instance().Get(place));
   auto comm_ctx = std::make_unique<phi::GPUContext>(place);
   ncclComm_t nccl_comm;
-  NCCLCHECK(platform::dynload::ncclCommInitRank(
+  NCCL_CHECK(platform::dynload::ncclCommInitRank(
       &nccl_comm, GetSize(), nccl_id, GetRank()));
   comm_ctx->set_nccl_comm(nccl_comm);
@@ -611,7 +718,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache(
   ncclUniqueId nccl_id;
   if (rank_ == 0) {
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id));
+    NCCL_CHECK(platform::dynload::ncclGetUniqueId(&nccl_id));
   }
   BroadcastUniqueNCCLID(&nccl_id);
@@ -632,7 +739,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache(
     dev_ctx[i].reset(new phi::GPUContext(places[i]));
     ncclComm_t nccl_comm;
-    NCCLCHECK(platform::dynload::ncclCommInitRank(
+    NCCL_CHECK(platform::dynload::ncclCommInitRank(
         &nccl_comm, GetSize(), nccl_id, GetRank()));
     dev_ctx[i]->set_nccl_comm(nccl_comm);
     dev_ctx_raw[i] = dev_ctx[i].get();
@@ -1257,70 +1364,6 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Reduce(
       CommType::REDUCE);
 }
 
-std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Reduce(
-    std::vector<phi::DenseTensor>& in_tensors,
-    std::vector<phi::DenseTensor>& out_tensors,
-    const ReduceOptions& opts,
-    bool sync_op,
-    bool use_calc_stream) {
-  PADDLE_ENFORCE_EQ(
-      CheckTensorsInCudaPlace(in_tensors),
-      true,
-      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
-  return Collective(
-      in_tensors,
-      out_tensors,
-      [&](const phi::DenseTensor& input,
-          phi::DenseTensor& output,
-          ncclComm_t comm,
-          const gpuStream_t& stream) {
-        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce(
-            input.data(),
-            output.data(),
-            input.numel(),
-            platform::ToNCCLDataType(input.dtype()),
-            ToNCCLRedType(opts.reduce_op),
-            opts.root_rank,
-            comm,
-            stream));
-      },
-      CommType::REDUCE,
-      sync_op,
-      use_calc_stream);
-}
-
-std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::ReduceScatter(
-    std::vector<phi::DenseTensor>& in_tensors,
-    std::vector<phi::DenseTensor>& out_tensors,
-    const ReduceScatterOptions& opts,
-    bool sync_op,
-    bool use_calc_stream) {
-  return Collective(
-      in_tensors,
-      out_tensors,
-      [&](phi::DenseTensor& input,
-          phi::DenseTensor& output,
-          ncclComm_t comm,
-          const gpuStream_t& stream) {
-        if (FLAGS_use_stream_safe_cuda_allocator) {
-          platform::CUDADeviceGuard cuda_guard;
-          cuda_guard.SetDevice(output.place());
-          memory::RecordStream(output.Holder(), stream);
-        }
-        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter(
-            input.data(),
-            output.data(),
-            output.numel(),
-            platform::ToNCCLDataType(input.dtype()),
-            ToNCCLRedType(opts.reduce_op),
-            comm,
-            stream));
-      },
-      CommType::REDUCE_SCATTER,
-      sync_op,
-      use_calc_stream);
-}
-
 std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Scatter(
     std::vector<phi::DenseTensor>& in_tensors,
     std::vector<phi::DenseTensor>& out_tensors,
@@ -1374,67 +1417,5 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Scatter(
       CommType::SCATTER);
 }
 
-std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Scatter(
-    std::vector<phi::DenseTensor>& in_tensors,
-    std::vector<phi::DenseTensor>& out_tensors,
-    const ScatterOptions& opts,
-    bool sync_op,
-    bool use_calc_stream) {
-  PADDLE_ENFORCE_EQ(
-      CheckTensorsInCudaPlace(in_tensors),
-      true,
-      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
-  PADDLE_ENFORCE_EQ(
-      CheckTensorsInCudaPlace(out_tensors),
-      true,
-      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
-  return Collective(
-      in_tensors,
-      out_tensors,
-      [&](phi::DenseTensor& input,
-          phi::DenseTensor& output,
-          ncclComm_t comm,
-          const gpuStream_t& stream) {
-        PADDLE_ENFORCE_EQ(
-            output.numel(),
-            input.numel() / size_,
-            platform::errors::InvalidArgument(
-                "Input and output tensors should have the same shape."));
-        size_t offset = 0;
-        if (rank_ == opts.root_rank) {
-          GroupStart();
-          for (auto i = 0; i < size_; i++) {
-            PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
-                GetPointerByOffset(input.data(), offset, input.dtype()),
-                input.numel() / size_,
-                platform::ToNCCLDataType(input.dtype()),
-                i,
-                comm,
-                stream));
-            offset += input.numel() / size_;
-          }
-          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
-              output.data(),
-              input.numel() / size_,
-              platform::ToNCCLDataType(input.dtype()),
-              opts.root_rank,
-              comm,
-              stream));
-          GroupEnd();
-        } else {
-          PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
-              output.data(),
-              input.numel() / size_,
-              platform::ToNCCLDataType(input.dtype()),
-              opts.root_rank,
-              comm,
-              stream));
-        }
-      },
-      CommType::SCATTER,
-      sync_op,
-      use_calc_stream);
-}
-
 }  // namespace distributed
 }  // namespace paddle
```
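The new `Scatter` body is the one place in this file where a collective is composed from point-to-point calls: the root rank batches one `ncclSend` per peer plus its own `ncclRecv` inside a `GroupStart()`/`GroupEnd()` pair so NCCL can fuse them, while every other rank posts a single `ncclRecv`. A stripped-down sketch of that batching pattern in plain NCCL (error handling omitted; `comm`, `stream`, and the device buffers are assumed to be set up by the caller):

```cpp
#include <nccl.h>
#include <cuda_runtime.h>

// Scatter `count` floats from `root` to every rank; on the root,
// sendbuf holds nranks * count elements laid out rank by rank.
void ScatterSketch(const float* sendbuf, float* recvbuf, size_t count,
                   int root, int rank, int nranks,
                   ncclComm_t comm, cudaStream_t stream) {
  if (rank == root) {
    ncclGroupStart();  // batch all sends plus the root's own recv
    for (int peer = 0; peer < nranks; ++peer) {
      ncclSend(sendbuf + peer * count, count, ncclFloat, peer, comm, stream);
    }
    ncclRecv(recvbuf, count, ncclFloat, root, comm, stream);
    ncclGroupEnd();
  } else {
    ncclRecv(recvbuf, count, ncclFloat, root, comm, stream);
  }
}
```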
paddle/fluid/distributed/collective/ProcessGroupNCCL.h

```diff
@@ -127,6 +127,25 @@ class ProcessGroupNCCL final : public ProcessGroupStream {
       bool sync_op,
       bool use_calc_stream) override;
 
+  std::shared_ptr<ProcessGroup::Task> Reduce(phi::DenseTensor* out_tensor,
+                                             const phi::DenseTensor& in_tensor,
+                                             const ReduceOptions& opts,
+                                             bool sync_op,
+                                             bool use_calc_stream) override;
+
+  std::shared_ptr<ProcessGroup::Task> ReduceScatter(
+      phi::DenseTensor* out_tensor,
+      const phi::DenseTensor& in_tensor,
+      const ReduceScatterOptions& opts,
+      bool sync_op,
+      bool use_calc_stream) override;
+
+  std::shared_ptr<ProcessGroup::Task> Scatter(phi::DenseTensor* out_tensor,
+                                              const phi::DenseTensor& in_tensor,
+                                              const ScatterOptions& opts,
+                                              bool sync_op,
+                                              bool use_calc_stream) override;
+
   std::shared_ptr<ProcessGroup::Task> Recv(phi::DenseTensor* tensor,
                                            int src_rank,
                                            int64_t offset,
@@ -184,32 +203,11 @@ class ProcessGroupNCCL final : public ProcessGroupStream {
       std::vector<phi::DenseTensor>& out_tensors,
       const ReduceOptions& opts) override;
 
-  std::shared_ptr<ProcessGroup::Task> Reduce(
-      std::vector<phi::DenseTensor>& in_tensors,
-      std::vector<phi::DenseTensor>& out_tensors,
-      const ReduceOptions& opts,
-      bool sync_op,
-      bool use_calc_stream) override;
-
-  std::shared_ptr<ProcessGroup::Task> ReduceScatter(
-      std::vector<phi::DenseTensor>& in_tensors,
-      std::vector<phi::DenseTensor>& out_tensors,
-      const ReduceScatterOptions& opts,
-      bool sync_op,
-      bool use_calc_stream) override;
-
   std::shared_ptr<ProcessGroup::Task> Scatter(
       std::vector<phi::DenseTensor>& in_tensors,
       std::vector<phi::DenseTensor>& out_tensors,
       const ScatterOptions& opts) override;
 
-  std::shared_ptr<ProcessGroup::Task> Scatter(
-      std::vector<phi::DenseTensor>& in_tensors,
-      std::vector<phi::DenseTensor>& out_tensors,
-      const ScatterOptions& opts,
-      bool sync_op,
-      bool use_calc_stream) override;
-
  private:
   std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(const Place& place,
                                                          int rank,
```
paddle/fluid/distributed/collective/ProcessGroupStream.cc

```diff
@@ -120,6 +120,72 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Broadcast(
       "ProcessGroup%s does not support broadcast.", GetBackendName()));
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Reduce(
+    phi::DenseTensor* out_tensor,
+    const phi::DenseTensor& in_tensor,
+    const ReduceOptions& opts,
+    bool sync_op) {
+  return Reduce(out_tensor,
+                in_tensor,
+                opts,
+                sync_op,
+                /*use_calc_stream*/ false);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Reduce(
+    phi::DenseTensor* out_tensor,
+    const phi::DenseTensor& in_tensor,
+    const ReduceOptions& opts,
+    bool sync_op,
+    bool use_calc_stream) {
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "ProcessGroup%s does not support reduce.", GetBackendName()));
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::ReduceScatter(
+    phi::DenseTensor* out_tensor,
+    const phi::DenseTensor& in_tensor,
+    const ReduceScatterOptions& opts,
+    bool sync_op) {
+  return ReduceScatter(out_tensor,
+                       in_tensor,
+                       opts,
+                       sync_op,
+                       /*use_calc_stream*/ false);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::ReduceScatter(
+    phi::DenseTensor* out_tensor,
+    const phi::DenseTensor& in_tensor,
+    const ReduceScatterOptions& opts,
+    bool sync_op,
+    bool use_calc_stream) {
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "ProcessGroup%s does not support reduce_scatter.", GetBackendName()));
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Scatter(
+    phi::DenseTensor* out_tensor,
+    const phi::DenseTensor& in_tensor,
+    const ScatterOptions& opts,
+    bool sync_op) {
+  return Scatter(out_tensor,
+                 in_tensor,
+                 opts,
+                 sync_op,
+                 /*use_calc_stream*/ false);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Scatter(
+    phi::DenseTensor* out_tensor,
+    const phi::DenseTensor& in_tensor,
+    const ScatterOptions& opts,
+    bool sync_op,
+    bool use_calc_stream) {
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "ProcessGroup%s does not support scatter.", GetBackendName()));
+}
+
 std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv(
     phi::DenseTensor* tensor,
     int src_rank,
@@ -190,72 +256,6 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllToAll(
       "ProcessGroup%s does not support do alltoall", GetBackendName()));
 }
 
-std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Reduce(
-    std::vector<phi::DenseTensor>& in_tensors,
-    std::vector<phi::DenseTensor>& out_tensors,
-    const ReduceOptions& opts,
-    bool sync_op) {
-  return Reduce(in_tensors,
-                out_tensors,
-                opts,
-                sync_op,
-                /*use_calc_stream*/ false);
-}
-
-std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Reduce(
-    std::vector<phi::DenseTensor>& in_tensors,
-    std::vector<phi::DenseTensor>& out_tensors,
-    const ReduceOptions& opts,
-    bool sync_op,
-    bool use_calc_stream) {
-  PADDLE_THROW(platform::errors::InvalidArgument(
-      "ProcessGroup%s does not support do reduce", GetBackendName()));
-}
-
-std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::ReduceScatter(
-    std::vector<phi::DenseTensor>& in_tensors,
-    std::vector<phi::DenseTensor>& out_tensors,
-    const ReduceScatterOptions& opts,
-    bool sync_op) {
-  return ReduceScatter(
-      in_tensors, out_tensors, opts, sync_op, /*use_calc_stream*/ false);
-}
-
-std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::ReduceScatter(
-    std::vector<phi::DenseTensor>& in_tensors,
-    std::vector<phi::DenseTensor>& out_tensors,
-    const ReduceScatterOptions& opts,
-    bool sync_op,
-    bool use_calc_stream) {
-  PADDLE_THROW(platform::errors::InvalidArgument(
-      "ProcessGroup%s does not support do reduce_scatter", GetBackendName()));
-}
-
-std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Scatter(
-    std::vector<phi::DenseTensor>& in_tensors,
-    std::vector<phi::DenseTensor>& out_tensors,
-    const ScatterOptions& opts,
-    bool sync_op) {
-  return Scatter(in_tensors,
-                 out_tensors,
-                 opts,
-                 sync_op,
-                 /*use_calc_stream*/ false);
-}
-
-std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Scatter(
-    std::vector<phi::DenseTensor>& in_tensors,
-    std::vector<phi::DenseTensor>& out_tensors,
-    const ScatterOptions& opts,
-    bool sync_op,
-    bool use_calc_stream) {
-  PADDLE_THROW(platform::errors::InvalidArgument(
-      "ProcessGroup%s does not support do scatter", GetBackendName()));
-}
-
 std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv(
     std::vector<phi::DenseTensor>& tensors, int src_rank, bool sync_op) {
   return Recv(tensors,
```
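Each collective gets the same two-overload boilerplate here: the four-argument form pins `use_calc_stream` to `false` (run on the dedicated communication stream) and forwards to a five-argument virtual that concrete backends such as ProcessGroupNCCL override, with the base class only knowing how to fail. A minimal sketch of the pattern with placeholder types (the real class derives from `ProcessGroup` and reports errors via `PADDLE_THROW`):

```cpp
#include <memory>
#include <stdexcept>

struct DenseTensor {};    // stand-in for phi::DenseTensor
struct ReduceOptions {};  // stand-in for distributed::ReduceOptions
struct Task {};           // stand-in for ProcessGroup::Task

class ProcessGroupStream {
 public:
  virtual ~ProcessGroupStream() = default;

  // Entry point: defer to the stream-aware overload on the comm stream.
  std::shared_ptr<Task> Reduce(DenseTensor* out, const DenseTensor& in,
                               const ReduceOptions& opts, bool sync_op) {
    return Reduce(out, in, opts, sync_op, /*use_calc_stream=*/false);
  }

  // Backends override this; the base class only knows how to fail.
  virtual std::shared_ptr<Task> Reduce(DenseTensor* out, const DenseTensor& in,
                                       const ReduceOptions& opts, bool sync_op,
                                       bool use_calc_stream) {
    throw std::runtime_error("backend does not support reduce");
  }
};
```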
paddle/fluid/distributed/collective/ProcessGroupStream.h

```diff
@@ -117,6 +117,43 @@ class ProcessGroupStream : public ProcessGroup {
       bool sync_op,
       bool use_calc_stream);
 
+  std::shared_ptr<ProcessGroup::Task> Reduce(phi::DenseTensor* out_tensor,
+                                             const phi::DenseTensor& in_tensor,
+                                             const ReduceOptions& opts,
+                                             bool sync_op) override;
+
+  virtual std::shared_ptr<ProcessGroup::Task> Reduce(
+      phi::DenseTensor* out_tensor,
+      const phi::DenseTensor& in_tensor,
+      const ReduceOptions& opts,
+      bool sync_op,
+      bool use_calc_stream);
+
+  std::shared_ptr<ProcessGroup::Task> ReduceScatter(
+      phi::DenseTensor* out_tensor,
+      const phi::DenseTensor& in_tensor,
+      const ReduceScatterOptions& opts,
+      bool sync_op) override;
+
+  virtual std::shared_ptr<ProcessGroup::Task> ReduceScatter(
+      phi::DenseTensor* out_tensor,
+      const phi::DenseTensor& in_tensor,
+      const ReduceScatterOptions& opts,
+      bool sync_op,
+      bool use_calc_stream);
+
+  std::shared_ptr<ProcessGroup::Task> Scatter(phi::DenseTensor* out_tensor,
+                                              const phi::DenseTensor& in_tensor,
+                                              const ScatterOptions& opts,
+                                              bool sync_op) override;
+
+  virtual std::shared_ptr<ProcessGroup::Task> Scatter(
+      phi::DenseTensor* out_tensor,
+      const phi::DenseTensor& in_tensor,
+      const ScatterOptions& opts,
+      bool sync_op,
+      bool use_calc_stream);
+
   std::shared_ptr<ProcessGroup::Task> Recv(phi::DenseTensor* tensor,
                                            int src_rank,
                                            int64_t offset,
@@ -155,45 +192,6 @@ class ProcessGroupStream : public ProcessGroup {
       bool sync_op,
       bool use_calc_stream);
 
-  std::shared_ptr<ProcessGroup::Task> Reduce(
-      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
-      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
-      const ReduceOptions& opts,
-      bool sync_op) override;
-
-  virtual std::shared_ptr<ProcessGroup::Task> Reduce(
-      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
-      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
-      const ReduceOptions& opts,
-      bool sync_op,
-      bool use_calc_stream);
-
-  std::shared_ptr<ProcessGroup::Task> ReduceScatter(
-      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
-      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
-      const ReduceScatterOptions& opts,
-      bool sync_op) override;
-
-  virtual std::shared_ptr<ProcessGroup::Task> ReduceScatter(
-      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
-      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
-      const ReduceScatterOptions& opts,
-      bool sync_op,
-      bool use_calc_stream);
-
-  std::shared_ptr<ProcessGroup::Task> Scatter(
-      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
-      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
-      const ScatterOptions& opts,
-      bool sync_op) override;
-
-  virtual std::shared_ptr<ProcessGroup::Task> Scatter(
-      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
-      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
-      const ScatterOptions& opts,
-      bool sync_op,
-      bool use_calc_stream);
-
   std::shared_ptr<ProcessGroup::Task> Recv(
       std::vector<phi::DenseTensor>& tensors,  // NOLINT
       int src_rank,
```
paddle/fluid/pybind/distributed_py.cc

```diff
@@ -412,16 +412,17 @@ void BindDistributed(py::module *m) {
           .def(
               "reduce",
               [](distributed::ProcessGroup& self,
-                 py::handle py_in_tensor,
+                 py::handle py_tensor,
                  int dst,
                  distributed::ReduceOp op,
                  bool sync_op) {
-                auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
+                auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
+                auto p_dense =
+                    std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
+                auto *out_dense = p_dense.get();
+                auto in_dense = *p_dense;
                 distributed::ReduceOptions opts{op, dst};
-                auto dense = std::dynamic_pointer_cast<phi::DenseTensor>(
-                    in_tensor.impl());
-                std::vector<phi::DenseTensor> tensors = {*dense};
-                return self.Reduce(tensors, tensors, opts, sync_op);
+                return self.Reduce(out_dense, in_dense, opts, sync_op);
               },
               py::arg("tensor"),
               py::arg("dst"),
@@ -432,28 +433,27 @@ void BindDistributed(py::module *m) {
           .def(
               "reduce_scatter",
               [](distributed::ProcessGroup& self,
-                 py::handle py_in_tensor_list,
                  py::handle py_out_tensor,
+                 py::handle py_in_tensor_list,
                  distributed::ReduceOp op,
                  bool sync_op) {
+                auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
+                auto p_out_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
+                    out_tensor.impl());
+                auto out_dense = p_out_tensor.get();
+
                 auto in_tensor_list =
                     CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0);
                 Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0);
-                auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
+                auto p_in_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
                     concat_in_tensor.impl());
-                std::vector<phi::DenseTensor> in_wrapper = {*in_dense};
-
-                auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
-                auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
-                    out_tensor.impl());
-                std::vector<phi::DenseTensor> out_wrapper = {*out_dense};
+                auto in_dense = *p_in_tensor;
 
                 distributed::ReduceScatterOptions opts{op};
-                return self.ReduceScatter(in_wrapper, out_wrapper, opts, sync_op);
+                return self.ReduceScatter(out_dense, in_dense, opts, sync_op);
               },
-              py::arg("in"),
               py::arg("out"),
+              py::arg("in"),
               py::arg("op"),
               py::arg("sync_op"),
               py::call_guard<py::gil_scoped_release>())
@@ -461,26 +461,25 @@ void BindDistributed(py::module *m) {
           .def(
               "reduce_scatter_tensor",
               [](distributed::ProcessGroup& self,
-                 py::handle py_in_tensor,
                  py::handle py_out_tensor,
+                 py::handle py_in_tensor,
                  distributed::ReduceOp op,
                  bool sync_op) {
-                auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
-                auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
-                    in_tensor.impl());
-                std::vector<phi::DenseTensor> in_wrapper = {*in_dense};
-
                 auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
-                auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
+                auto p_out_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
                     out_tensor.impl());
-                std::vector<phi::DenseTensor> out_wrapper = {*out_dense};
+                auto out_dense = p_out_tensor.get();
+
+                auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
+                auto p_in_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
+                    in_tensor.impl());
+                auto in_dense = *p_in_tensor;
 
                 distributed::ReduceScatterOptions opts{op};
-                return self.ReduceScatter(in_wrapper, out_wrapper, opts, sync_op);
+                return self.ReduceScatter(out_dense, in_dense, opts, sync_op);
               },
-              py::arg("in"),
               py::arg("out"),
+              py::arg("in"),
               py::arg("op"),
               py::arg("sync_op"),
               py::call_guard<py::gil_scoped_release>())
@@ -488,27 +487,27 @@ void BindDistributed(py::module *m) {
          .def(
               "scatter",
               [](distributed::ProcessGroup& self,
-                 py::handle py_in_tensor_list,
                  py::handle py_out_tensor,
+                 py::handle py_in_tensor_list,
                  int src,
                  bool sync_op) {
+                auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
+                auto p_out_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
+                    out_tensor.impl());
+                auto *out_dense = p_out_tensor.get();
+
                 auto in_tensor_list =
                     CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0);
                 Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0);
-                auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
+                auto p_in_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
                     concat_in_tensor.impl());
-                std::vector<phi::DenseTensor> in_wrapper = {*in_dense};
-
-                auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
-                auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
-                    out_tensor.impl());
-                std::vector<phi::DenseTensor> out_wrapper = {*out_dense};
+                auto in_dense = *p_in_tensor;
 
                 distributed::ScatterOptions opts{src};
-                return self.Scatter(in_wrapper, out_wrapper, opts, sync_op);
+                return self.Scatter(out_dense, in_dense, opts, sync_op);
               },
-              py::arg("in"),
               py::arg("out"),
+              py::arg("in"),
               py::arg("src"),
               py::arg("sync_op"),
               py::call_guard<py::gil_scoped_release>())
@@ -516,25 +515,25 @@ void BindDistributed(py::module *m) {
           .def(
               "scatter_tensor",
               [](distributed::ProcessGroup& self,
-                 py::handle py_in_tensor,
                  py::handle py_out_tensor,
+                 py::handle py_in_tensor,
                  int src,
                  bool sync_op) {
-                auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
-                auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
-                    in_tensor.impl());
-                std::vector<phi::DenseTensor> in_wrapper = {*in_dense};
-
                 auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
-                auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
+                auto p_out_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
                     out_tensor.impl());
-                std::vector<phi::DenseTensor> out_wrapper = {*out_dense};
+                auto *out_dense = p_out_tensor.get();
+
+                auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
+                auto p_in_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
+                    in_tensor.impl());
+                auto in_dense = *p_in_tensor;
 
                 distributed::ScatterOptions opts{src};
-                return self.Scatter(in_wrapper, out_wrapper, opts, sync_op);
+                return self.Scatter(out_dense, in_dense, opts, sync_op);
               },
-              py::arg("in"),
               py::arg("out"),
+              py::arg("in"),
               py::arg("src"),
               py::arg("sync_op"),
               py::call_guard<py::gil_scoped_release>())
@@ -986,16 +985,17 @@ void BindDistributed(py::module *m) {
           .def(
               "reduce_on_calc_stream",
               [](distributed::ProcessGroupStream& self,
-                 py::handle py_in_tensor,
+                 py::handle py_tensor,
                  int dst,
                  distributed::ReduceOp op) {
-                auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
+                auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
+                auto p_dense =
+                    std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
+                auto *out_dense = p_dense.get();
+                auto in_dense = *p_dense;
                 distributed::ReduceOptions opts{op, dst};
-                auto dense = std::dynamic_pointer_cast<phi::DenseTensor>(
-                    in_tensor.impl());
-                std::vector<phi::DenseTensor> tensors = {*dense};
-                return self.Reduce(tensors,
-                                   tensors,
+                return self.Reduce(out_dense,
+                                   in_dense,
                                    opts,
                                    /*sync_op*/ true,
                                    /*use_calc_stream*/ true);
@@ -1008,116 +1008,116 @@ void BindDistributed(py::module *m) {
           .def(
               "reduce_scatter_on_calc_stream",
               [](distributed::ProcessGroupStream& self,
-                 py::handle py_in_tensor_list,
                  py::handle py_out_tensor,
+                 py::handle py_in_tensor_list,
                  distributed::ReduceOp op) {
+                auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
+                auto p_out_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
+                    out_tensor.impl());
+                auto out_dense = p_out_tensor.get();
+
                 auto in_tensor_list =
                     CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0);
                 Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0);
-                auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
+                auto p_in_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
                     concat_in_tensor.impl());
-                std::vector<phi::DenseTensor> in_wrapper = {*in_dense};
-
-                auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
-                auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
-                    out_tensor.impl());
-                std::vector<phi::DenseTensor> out_wrapper = {*out_dense};
+                auto in_dense = *p_in_tensor;
 
                 distributed::ReduceScatterOptions opts{op};
-                return self.ReduceScatter(in_wrapper,
-                                          out_wrapper,
+                return self.ReduceScatter(out_dense,
+                                          in_dense,
                                           opts,
                                           /*sync_op*/ true,
                                           /*use_calc_stream*/ true);
               },
-              py::arg("in"),
               py::arg("out"),
+              py::arg("in"),
               py::arg("op"),
               py::call_guard<py::gil_scoped_release>())
           .def(
               "reduce_scatter_tensor_on_calc_stream",
               [](distributed::ProcessGroupStream& self,
-                 py::handle py_in_tensor,
                  py::handle py_out_tensor,
+                 py::handle py_in_tensor,
                  distributed::ReduceOp op) {
-                auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
-                auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
-                    in_tensor.impl());
-                std::vector<phi::DenseTensor> in_wrapper = {*in_dense};
-
                 auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
-                auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
+                auto p_out_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
                     out_tensor.impl());
-                std::vector<phi::DenseTensor> out_wrapper = {*out_dense};
+                auto out_dense = p_out_tensor.get();
+
+                auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
+                auto p_in_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
+                    in_tensor.impl());
+                auto in_dense = *p_in_tensor;
 
                 distributed::ReduceScatterOptions opts{op};
-                return self.ReduceScatter(in_wrapper,
-                                          out_wrapper,
+                return self.ReduceScatter(out_dense,
+                                          in_dense,
                                           opts,
                                           /*sync_op*/ true,
                                           /*use_calc_stream*/ true);
               },
-              py::arg("in"),
               py::arg("out"),
+              py::arg("in"),
               py::arg("op"),
               py::call_guard<py::gil_scoped_release>())
           .def(
               "scatter_on_calc_stream",
               [](distributed::ProcessGroupStream& self,
-                 py::handle py_in_tensor_list,
                  py::handle py_out_tensor,
+                 py::handle py_in_tensor_list,
                  int src) {
+                auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
+                auto p_out_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
+                    out_tensor.impl());
+                auto *out_dense = p_out_tensor.get();
+
                 auto in_tensor_list =
                     CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0);
                 Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0);
-                auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
+                auto p_in_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
                     concat_in_tensor.impl());
-                std::vector<phi::DenseTensor> in_wrapper = {*in_dense};
-
-                auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
-                auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
-                    out_tensor.impl());
-                std::vector<phi::DenseTensor> out_wrapper = {*out_dense};
+                auto in_dense = *p_in_tensor;
 
                 distributed::ScatterOptions opts{src};
-                return self.Scatter(in_wrapper,
-                                    out_wrapper,
+                return self.Scatter(out_dense,
+                                    in_dense,
                                     opts,
                                     /*sync_op*/ true,
                                     /*use_calc_stream*/ true);
               },
-              py::arg("in"),
               py::arg("out"),
+              py::arg("in"),
               py::arg("src"),
               py::call_guard<py::gil_scoped_release>())
          .def(
               "scatter_tensor_on_calc_stream",
               [](distributed::ProcessGroupStream& self,
-                 py::handle py_in_tensor,
                  py::handle py_out_tensor,
+                 py::handle py_in_tensor,
                  int src) {
-                auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
-                auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
-                    in_tensor.impl());
-                std::vector<phi::DenseTensor> in_wrapper = {*in_dense};
-
                 auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
-                auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
+                auto p_out_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
                     out_tensor.impl());
-                std::vector<phi::DenseTensor> out_wrapper = {*out_dense};
+                auto *out_dense = p_out_tensor.get();
+
+                auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
+                auto p_in_tensor = std::dynamic_pointer_cast<phi::DenseTensor>(
+                    in_tensor.impl());
+                auto in_dense = *p_in_tensor;
 
                 distributed::ScatterOptions opts{src};
-                return self.Scatter(in_wrapper,
-                                    out_wrapper,
+                return self.Scatter(out_dense,
+                                    in_dense,
                                     opts,
                                     /*sync_op*/ true,
                                     /*use_calc_stream*/ true);
               },
-              py::arg("in"),
               py::arg("out"),
+              py::arg("in"),
               py::arg("src"),
               py::call_guard<py::gil_scoped_release>())
```
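Every rewritten binding follows the same output-first convention: the `py::handle` for the output tensor now precedes the input (or collapses to a single `py_tensor` for the in-place `reduce`), the lambda unwraps them to a raw `phi::DenseTensor*` (`p_out_tensor.get()`) and a value copy (`*p_in_tensor`), and the `py::arg` list is reordered to match so keyword callers keep working. A reduced sketch of the binding shape; `Backend` and its `Scatter` are hypothetical stand-ins, not the real Paddle binding:

```cpp
#include <pybind11/pybind11.h>

namespace py = pybind11;

struct Tensor {};

struct Backend {
  // Output-first, matching the refactored C++ API.
  void Scatter(Tensor* out, const Tensor& in, int src, bool sync_op) {}
};

PYBIND11_MODULE(example, m) {
  py::class_<Backend>(m, "Backend").def(
      "scatter",
      [](Backend& self, py::object py_out, py::object py_in, int src,
         bool sync_op) {
        Tensor out, in;  // real code casts the Python handles to tensors here
        self.Scatter(&out, in, src, sync_op);
      },
      py::arg("out"), py::arg("in"), py::arg("src"), py::arg("sync_op"));
}
```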
python/paddle/distributed/communication/stream/reduce_scatter.py

```diff
@@ -57,11 +57,11 @@ def _reduce_scatter_tensor_in_dygraph(
     if use_calc_stream:
         return group.process_group.reduce_scatter_tensor_on_calc_stream(
-            in_tensor, out_tensor, op_type
+            out_tensor, in_tensor, op_type
         )
 
     task = group.process_group.reduce_scatter_tensor(
-        in_tensor, out_tensor, op_type, sync_op
+        out_tensor, in_tensor, op_type, sync_op
     )
     if sync_op:
         task.wait()
@@ -78,11 +78,11 @@ def _reduce_scatter_in_dygraph(
     if use_calc_stream:
         return group.process_group.reduce_scatter_on_calc_stream(
-            tensor_list, tensor, op_type
+            tensor, tensor_list, op_type
         )
 
     task = group.process_group.reduce_scatter(
-        tensor_list, tensor, op_type, sync_op
+        tensor, tensor_list, op_type, sync_op
     )
     if sync_op:
         task.wait()
```
python/paddle/distributed/communication/stream/scatter.py

```diff
@@ -53,11 +53,11 @@ def _scatter_tensor_in_dygraph(
     if use_calc_stream:
         return group.process_group.scatter_tensor_on_calc_stream(
-            in_tensor, out_tensor, src_rank_in_group
+            out_tensor, in_tensor, src_rank_in_group
         )
 
     task = group.process_group.scatter_tensor(
-        in_tensor, out_tensor, src_rank_in_group, sync_op
+        out_tensor, in_tensor, src_rank_in_group, sync_op
     )
     if sync_op:
         task.wait()
@@ -80,11 +80,11 @@ def _scatter_in_dygraph(
     if use_calc_stream:
         return group.process_group.scatter_on_calc_stream(
-            tensor_list, tensor, src_rank_in_group
+            tensor, tensor_list, src_rank_in_group
         )
 
     task = group.process_group.scatter(
-        tensor_list, tensor, src_rank_in_group, sync_op
+        tensor, tensor_list, src_rank_in_group, sync_op
     )
     if sync_op:
         task.wait()
```
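Both Python wrappers change for the same reason: the underlying pybind methods now take the output tensor first, so every positional call flips from `(in_tensor, out_tensor, ...)` to `(out_tensor, in_tensor, ...)` and from `(tensor_list, tensor, ...)` to `(tensor, tensor_list, ...)`. The public signatures of the `paddle.distributed.communication.stream` helpers themselves are untouched; only the internal call order moves.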