BaiXuePrincess / Paddle · commit e83f5f33 (forked from PaddlePaddle / Paddle; in sync with the upstream project)
Commit e83f5f33 (unverified)
Authored by 姜永久 on Dec 09, 2022; committed via GitHub on Dec 09, 2022.
remove xpu eager guard tests (#48786)
Parent: 25dafc58
Showing 2 changed files with 176 additions and 192 deletions (+176 -192):

python/paddle/fluid/tests/unittests/xpu/parallel_dygraph_gradient_check_in_eager_mode.py (+45 -47)
python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py (+131 -145)
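Both files apply the same mechanical change: the _test_eager_guard import is dropped and each with _test_eager_guard(): block is removed, with its body de-indented one level, since eager dygraph is now the default execution mode. A minimal before/after sketch of the pattern (illustrative names only, not lines from this diff):

# Before (old style): the test body ran inside the legacy guard that
# forced eager dygraph mode.
def test_something_old(self):
    from paddle.fluid.framework import _test_eager_guard  # import removed by this commit
    with _test_eager_guard():
        run_checks(self)  # hypothetical test body

# After (this commit): eager mode is the default, so the guard and one
# indentation level are dropped; the body itself is unchanged.
def test_something_new(self):
    run_checks(self)  # hypothetical test body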
python/paddle/fluid/tests/unittests/xpu/parallel_dygraph_gradient_check_in_eager_mode.py
View file @ e83f5f33
...
@@ -19,7 +19,6 @@ import numpy as np
 import paddle
 import paddle.distributed as dist
 import paddle.fluid as fluid
-from paddle.fluid.framework import _test_eager_guard
 from paddle.nn import Linear

 paddle.seed(1024)
...
@@ -69,58 +68,57 @@ class SimpleNet(fluid.Layer):
In this hunk the with _test_eager_guard(): wrapper around the body of test_multiple_xpus is removed and the body is de-indented by one level; the test logic is otherwise unchanged. The resulting code:

class TestDistTraning(unittest.TestCase):
    def test_multiple_xpus(self):
        self.trainer_id = dist.get_rank()
        self.pg = dist.init_parallel_env()

        model_a = SimpleNet(self.trainer_id)
        model_b = SimpleNet(self.trainer_id)

        state_dict = model_a.state_dict()
        model_b.set_state_dict(state_dict)

        model_a = paddle.DataParallel(
            model_a, find_unused_parameters=True, group=self.pg
        )
        model_b = paddle.DataParallel(
            model_b, find_unused_parameters=True, group=self.pg
        )

        ones_input = paddle.ones(shape=(batch, in_dim))
        ones_input.stop_gradient = True

        w1_grad_sum = np.zeros((in_dim, out_dim), dtype='float32')
        w2_grad_sum = np.zeros((in_dim, out_dim), dtype='float32')

        for step_id in range(5):
            random_input = paddle.rand(shape=(batch, in_dim))
            random_input.stop_gradient = True

            if step_id % 2 == 0:
                out_a = model_a(random_input)
                out_b = model_b(random_input)
            else:
                out_a = model_a(ones_input)
                out_b = model_b(ones_input)

            out_a.sum().backward()
            out_b.sum().backward()

            self.check_gradient(model_a.parameters())
            self.check_gradient(model_b.parameters())

            # test acc gradient
            w1_grad_sum = self.check_acc(
                model_a._layers.w1.grad,
                w1_grad_sum,
                model_b._layers.w1.grad,
            )
            w2_grad_sum = self.check_acc(
                model_a._layers.w2.grad,
                w2_grad_sum,
                model_b._layers.w2.grad,
            )

            model_a.clear_gradients()

    def check_acc(self, grad, grad_sum, acc_grad):
        if grad is not None:
...
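The view above truncates check_acc after its first line, so its body is not part of this commit's visible context. For orientation only, a hypothetical standalone version of the kind of gradient comparison the test performs between the two DataParallel replicas, assuming numpy tolerance-based checking (an illustrative sketch, not code from the file):

import numpy as np

def compare_replica_gradients(params_a, params_b, rtol=1e-5, atol=1e-8):
    # Hypothetical helper: both replicas saw identical inputs, so their
    # parameter gradients should agree within tolerance.
    for p_a, p_b in zip(params_a, params_b):
        if p_a.grad is None and p_b.grad is None:
            continue
        np.testing.assert_allclose(
            p_a.grad.numpy(), p_b.grad.numpy(), rtol=rtol, atol=atol
        )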
python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py
View file @ e83f5f33
...
@@ -21,7 +21,6 @@ import numpy as np
 import paddle
 import paddle.distributed as dist
 from paddle.fluid.dygraph.parallel import ParallelEnv
-from paddle.fluid.framework import _test_eager_guard


 def init_process_group(strategy=None):
...
@@ -45,150 +44,137 @@ class TestProcessGroupFp32(unittest.TestCase):
As in the first file, the with _test_eager_guard(): wrapper around the body of test_create_process_group_bkcl is removed and the body is de-indented by one level; the collective checks themselves are unchanged. The resulting code:

        self.shape = (2, 10, 5)

    def test_create_process_group_bkcl(self):
        device_id = paddle.distributed.ParallelEnv().dev_id
        paddle.set_device('xpu:%d' % device_id)

        pg = init_process_group()
        sys.stdout.write(
            "rank {}: size {} name {}\n".format(pg.rank(), pg.size(), pg.name())
        )
        sys.stdout.write("rank {}: test new group api ok\n".format(pg.rank()))

        # test allreduce sum
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)

        sum_result = tensor_x + tensor_y
        if pg.rank() == 0:
            task = dist.all_reduce(tensor_x)
            assert np.array_equal(tensor_x, sum_result)
        else:
            task = dist.all_reduce(tensor_y)
            assert np.array_equal(tensor_y, sum_result)

        sys.stdout.write("rank {}: test allreduce sum api ok\n".format(pg.rank()))

        # TODO
        # test allreduce max/min/prod

        # test broadcast
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)

        broadcast_result = paddle.assign(tensor_x)
        if pg.rank() == 0:
            # XPU don't support event query by now, so just use sync op here
            task = dist.broadcast(tensor_x, 0)
            paddle.device.xpu.synchronize()
            assert np.array_equal(broadcast_result, tensor_x)
        else:
            task = dist.broadcast(tensor_y, 0)
            paddle.device.xpu.synchronize()
            assert np.array_equal(broadcast_result, tensor_y)

        sys.stdout.write("rank {}: test broadcast api ok\n".format(pg.rank()))

        # test barrier
        # rank 0
        if pg.rank() == 0:
            pg.barrier(device_id)
        # rank 1
        else:
            task = pg.barrier(device_id)
            task.wait()

        sys.stdout.write("rank {}: test barrier api ok\n".format(pg.rank()))

        # test allgather
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        tensor_y = paddle.to_tensor(y)
        out_shape = list(self.shape)
        out_shape[0] *= 2
        out = np.random.random(out_shape).astype(self.dtype)
        tensor_out = paddle.to_tensor(out)
        if pg.rank() == 0:
            task = pg.all_gather(tensor_x, tensor_out)
            task.wait()
            paddle.device.xpu.synchronize()
        # rank 1
        else:
            tensor_out_list = [
                paddle.empty_like(tensor_x),
                paddle.empty_like(tensor_x),
            ]
            task = dist.all_gather(tensor_out_list, tensor_y)
            paddle.device.xpu.synchronize()
            tensor_out = paddle.concat(tensor_out_list)
        out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2])
        out_2 = paddle.slice(
            tensor_out, [0], [out_shape[0] // 2], [out_shape[0]]
        )
        assert np.array_equal(tensor_x, out_1)
        assert np.array_equal(tensor_y, out_2)
        sys.stdout.write("rank {}: test allgather api ok\n".format(pg.rank()))

        if pg.rank() == 0:
            task = pg.all_gather(tensor_x, tensor_out)
            task.wait()
            paddle.device.xpu.synchronize()
        # rank 1
        else:
            tensor_out_list = []
            task = dist.all_gather(tensor_out_list, tensor_y)
            paddle.device.xpu.synchronize()
            tensor_out = paddle.concat(tensor_out_list)
        out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2])
        out_2 = paddle.slice(
            tensor_out, [0], [out_shape[0] // 2], [out_shape[0]]
        )
        assert np.array_equal(tensor_x, out_1)
        assert np.array_equal(tensor_y, out_2)
        sys.stdout.write("rank {}: test allgather api2 ok\n".format(pg.rank()))

        # test Reduce
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        tensor_y = paddle.to_tensor(y)
        sum_result = tensor_x + tensor_y
        if pg.rank() == 0:
            task = dist.reduce(tensor_x, 0, sync_op=True)
            paddle.device.xpu.synchronize()
        # rank 1
        else:
            task = dist.reduce(tensor_y, 0, sync_op=False)
            task.wait()
            paddle.device.xpu.synchronize()
        if pg.rank() == 0:
            assert np.array_equal(tensor_x, sum_result)
        sys.stdout.write(
            "rank {}: test reduce sum api ok\n".format(pg.rank())
        )


class TestProcessGroupFp16(TestProcessGroupFp32):
...
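This file is a worker script: it expects to be started once per XPU card, with ranks 0 and 1 taking the two branches above; the launcher itself is not part of this diff. A hypothetical two-process launch sketch, assuming the paddle.distributed.spawn entry point and that the script is importable from the working directory (names here are illustrative):

# Hypothetical launcher (not part of this commit): run the BKCL process
# group checks on two XPU ranks via spawn.
import unittest

import paddle.distributed as dist

from process_group_bkcl import TestProcessGroupFp32  # assumed import path


def run_one_rank():
    # Each spawned process runs the unittest body on its own rank.
    suite = unittest.TestLoader().loadTestsFromTestCase(TestProcessGroupFp32)
    unittest.TextTestRunner().run(suite)


if __name__ == "__main__":
    # nprocs=2 gives ranks 0 and 1, matching the rank-0/rank-1 branches above.
    dist.spawn(run_one_rank, nprocs=2)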