Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
7242f40b
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
7242f40b
编写于
1月 18, 2023
作者:
J
jameszhang
提交者:
GitHub
1月 18, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
kunlun support p2p send/recv (#49896)
上级
6cd7fcaf
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
150 addition
and
24 deletion
+150
-24
paddle/fluid/distributed/collective/process_group_bkcl.cc
paddle/fluid/distributed/collective/process_group_bkcl.cc
+68
-0
paddle/fluid/distributed/collective/process_group_bkcl.h
paddle/fluid/distributed/collective/process_group_bkcl.h
+21
-7
python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py
...on/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py
+61
-17
未找到文件。
paddle/fluid/distributed/collective/process_group_bkcl.cc
浏览文件 @
7242f40b
...
...
@@ -16,6 +16,7 @@
#include "paddle/fluid/distributed/collective/bkcl_tools.h"
#include "paddle/fluid/distributed/collective/common.h"
#include "paddle/fluid/distributed/collective/utils.h"
#include "paddle/fluid/platform/device/xpu/bkcl_helper.h"
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/phi/core/device_context.h"
...
...
@@ -87,6 +88,73 @@ void ProcessGroupBKCL::GroupEnd() {
PADDLE_ENFORCE_XPU_SUCCESS
(
bkcl_group_end
());
}
std
::
shared_ptr
<
ProcessGroup
::
Task
>
ProcessGroupBKCL
::
Recv
(
phi
::
DenseTensor
*
tensor
,
int
src_rank
,
int64_t
offset
,
int64_t
numel
,
bool
sync_op
,
bool
use_calc_stream
)
{
// numel > 0 indicates the tensor need to be sliced
phi
::
DenseTensor
partial_tensor
;
if
(
numel
>
0
)
{
partial_tensor
=
GetPartialTensor
(
*
tensor
,
offset
,
numel
);
tensor
=
&
partial_tensor
;
}
return
Collective
(
tensor
,
// have to pass a tensor here
// TODO(zhangxiaoci) catch up with nccl's api
*
tensor
,
[
&
](
phi
::
DenseTensor
*
output
,
const
phi
::
DenseTensor
&
input
,
BKCLContext_t
comm
,
const
XPUStream
&
stream
)
{
return
bkcl_recv
(
comm
,
output
->
data
(),
output
->
numel
(),
src_rank
,
platform
::
ToBKCLDataType
(
framework
::
TransToProtoVarType
(
output
->
type
())),
stream
);
},
CommType
::
RECV
,
sync_op
,
use_calc_stream
);
}
std
::
shared_ptr
<
ProcessGroup
::
Task
>
ProcessGroupBKCL
::
Send
(
const
phi
::
DenseTensor
&
tensor
,
int
dst_rank
,
int64_t
offset
,
int64_t
numel
,
bool
sync_op
,
bool
use_calc_stream
)
{
// numel > 0 indicates the tensor need to be sliced
const
phi
::
DenseTensor
&
tensor_maybe_partial
=
numel
>
0
?
GetPartialTensor
(
tensor
,
offset
,
numel
)
:
tensor
;
return
Collective
(
nullptr
,
tensor_maybe_partial
,
[
&
](
phi
::
DenseTensor
*
output
,
const
phi
::
DenseTensor
&
input
,
BKCLContext_t
comm
,
const
XPUStream
&
stream
)
{
return
bkcl_send
(
comm
,
input
.
data
(),
input
.
numel
(),
dst_rank
,
platform
::
ToBKCLDataType
(
framework
::
TransToProtoVarType
(
input
.
type
())),
stream
);
},
CommType
::
SEND
,
sync_op
,
use_calc_stream
);
}
std
::
shared_ptr
<
ProcessGroupBKCL
::
BKCLTask
>
ProcessGroupBKCL
::
CreateTask
(
const
Place
&
place
,
int
rank
,
...
...
paddle/fluid/distributed/collective/process_group_bkcl.h
浏览文件 @
7242f40b
...
...
@@ -87,25 +87,25 @@ class ProcessGroupBKCL : public ProcessGroupWithStream {
phi
::
DeviceContext
*
GetDeviceContext
(
const
Place
&
place
,
bool
use_calc_stream
)
const
override
;
std
::
shared_ptr
<
ProcessGroup
::
Task
>
All
Reduce
(
std
::
shared_ptr
<
ProcessGroup
::
Task
>
All
Gather
(
phi
::
DenseTensor
*
out_tensor
,
const
phi
::
DenseTensor
&
in_tensor
,
const
AllreduceOptions
&
opts
,
int64_t
offset
,
// for compatibility, no use now
int64_t
numel
,
// for compatibility, no use now
bool
sync_op
,
bool
use_calc_stream
)
override
;
std
::
shared_ptr
<
ProcessGroup
::
Task
>
Broadcast
(
std
::
shared_ptr
<
ProcessGroup
::
Task
>
AllReduce
(
phi
::
DenseTensor
*
out_tensor
,
const
phi
::
DenseTensor
&
in_tensor
,
const
Broadcast
Options
&
opts
,
const
Allreduce
Options
&
opts
,
bool
sync_op
,
bool
use_calc_stream
)
override
;
std
::
shared_ptr
<
ProcessGroup
::
Task
>
AllGather
(
std
::
shared_ptr
<
ProcessGroup
::
Task
>
Broadcast
(
phi
::
DenseTensor
*
out_tensor
,
const
phi
::
DenseTensor
&
in_tensor
,
int64_t
offset
,
// for compatibility, no use now
int64_t
numel
,
// for compatibility, no use now
const
BroadcastOptions
&
opts
,
bool
sync_op
,
bool
use_calc_stream
)
override
;
...
...
@@ -115,6 +115,20 @@ class ProcessGroupBKCL : public ProcessGroupWithStream {
bool
sync_op
,
bool
use_calc_stream
)
override
;
std
::
shared_ptr
<
ProcessGroup
::
Task
>
Recv
(
phi
::
DenseTensor
*
tensor
,
int
src_rank
,
int64_t
offset
,
int64_t
numel
,
bool
sync_op
,
bool
use_calc_stream
)
override
;
std
::
shared_ptr
<
ProcessGroup
::
Task
>
Send
(
const
phi
::
DenseTensor
&
tensor
,
int
dst_rank
,
int64_t
offset
,
int64_t
numel
,
bool
sync_op
,
bool
use_calc_stream
)
override
;
std
::
shared_ptr
<
ProcessGroup
::
Task
>
Barrier
(
const
BarrierOptions
&
=
BarrierOptions
())
override
;
...
...
python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py
浏览文件 @
7242f40b
...
...
@@ -53,28 +53,26 @@ class TestProcessGroupFp32(unittest.TestCase):
)
sys
.
stdout
.
write
(
"rank {}: test new group api ok
\n
"
.
format
(
pg
.
rank
()))
# TODO(zhangxiaoci) allreduce unittest raise error
# test allreduce sum
# rank 0
x
=
np
.
random
.
random
(
self
.
shape
).
astype
(
self
.
dtype
)
tensor_x
=
paddle
.
to_tensor
(
x
)
#
x = np.random.random(self.shape).astype(self.dtype)
#
tensor_x = paddle.to_tensor(x)
# rank 1
y
=
np
.
random
.
random
(
self
.
shape
).
astype
(
self
.
dtype
)
tensor_y
=
paddle
.
to_tensor
(
y
)
#
y = np.random.random(self.shape).astype(self.dtype)
#
tensor_y = paddle.to_tensor(y)
sum_result
=
tensor_x
+
tensor_y
if
pg
.
rank
()
==
0
:
task
=
dist
.
all_reduce
(
tensor_x
)
assert
np
.
array_equal
(
tensor_x
,
sum_result
)
else
:
task
=
dist
.
all_reduce
(
tensor_y
)
assert
np
.
array_equal
(
tensor_y
,
sum_result
)
sys
.
stdout
.
write
(
"rank {}: test allreduce sum api ok
\n
"
.
format
(
pg
.
rank
())
)
# sum_result = tensor_x + tensor_y
# if pg.rank() == 0:
# task = dist.all_reduce(tensor_x)
# assert np.array_equal(tensor_x, sum_result)
# else:
# task = dist.all_reduce(tensor_y)
# assert np.array_equal(tensor_y, sum_result)
# TODO
# test allreduce max/min/prod
# sys.stdout.write(
# "rank {}: test allreduce sum api ok\n".format(pg.rank())
# )
# test broadcast
# rank 0
...
...
@@ -178,6 +176,52 @@ class TestProcessGroupFp32(unittest.TestCase):
assert
np
.
array_equal
(
tensor_y
,
old_tensor_y
)
sys
.
stdout
.
write
(
"rank {}: test reduce sum api ok
\n
"
.
format
(
pg
.
rank
()))
# test send async api
# rank 0
x
=
np
.
random
.
random
(
self
.
shape
).
astype
(
self
.
dtype
)
tensor_x
=
paddle
.
to_tensor
(
x
)
# rank 1
y
=
np
.
random
.
random
(
self
.
shape
).
astype
(
self
.
dtype
)
tensor_y
=
paddle
.
to_tensor
(
y
)
if
pg
.
rank
()
==
0
:
task
=
dist
.
send
(
tensor_x
,
1
,
sync_op
=
False
)
task
.
wait
()
else
:
task
=
dist
.
recv
(
tensor_y
,
0
,
sync_op
=
False
)
task
.
wait
()
assert
np
.
array_equal
(
tensor_y
,
tensor_x
)
# test send sync api
# rank 0
x
=
np
.
random
.
random
(
self
.
shape
).
astype
(
self
.
dtype
)
tensor_x
=
paddle
.
to_tensor
(
x
)
# rank 1
y
=
np
.
random
.
random
(
self
.
shape
).
astype
(
self
.
dtype
)
tensor_y
=
paddle
.
to_tensor
(
y
)
if
pg
.
rank
()
==
0
:
task
=
dist
.
send
(
tensor_x
,
1
,
sync_op
=
True
)
else
:
task
=
dist
.
recv
(
tensor_y
,
0
,
sync_op
=
True
)
assert
np
.
array_equal
(
tensor_y
,
tensor_x
)
# test send 0-d tensor
# rank 0
x
=
np
.
random
.
uniform
(
-
1
,
1
,
[]).
astype
(
self
.
dtype
)
tensor_x
=
paddle
.
to_tensor
(
x
)
# rank 1
y
=
np
.
array
(
0.2022
).
astype
(
self
.
dtype
)
tensor_y
=
paddle
.
to_tensor
(
y
)
if
pg
.
rank
()
==
0
:
task
=
dist
.
send
(
tensor_x
,
1
,
sync_op
=
True
)
else
:
task
=
dist
.
recv
(
tensor_y
,
0
,
sync_op
=
True
)
assert
np
.
array_equal
(
tensor_y
,
tensor_x
)
and
tensor_y
.
shape
==
[]
sys
.
stdout
.
write
(
"rank {}: test send api ok
\n
"
.
format
(
pg
.
rank
()))
class
TestProcessGroupFp16
(
TestProcessGroupFp32
):
def
setUp
(
self
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录