Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
7158061a
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
7158061a
编写于
1月 22, 2021
作者:
G
gongweibao
提交者:
GitHub
1月 22, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add startup bash files of test_ascend_group. (#30645)
Add startup bash files of test_ascend_group
上级
e4287ca6
变更
3
显示空白变更内容
内联
并排
Showing
3 changed file
with
150 addition
and
0 deletion
+150
-0
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+2
-0
python/paddle/fluid/tests/unittests/ascend_group.py
python/paddle/fluid/tests/unittests/ascend_group.py
+118
-0
python/paddle/fluid/tests/unittests/test_ascend_group.sh
python/paddle/fluid/tests/unittests/test_ascend_group.sh
+30
-0
未找到文件。
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
7158061a
...
...
@@ -39,6 +39,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_run_random_port)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_launch_async
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_launch_cloud
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_launch_ascend
)
list
(
APPEND MIXED_DIST_TEST_OPS test_ascend_group
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_launch_nproc
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_api_input
)
list
(
APPEND MIXED_DIST_TEST_OPS test_collective_optimizer
)
...
...
@@ -524,6 +525,7 @@ if(WITH_DISTRIBUTE)
bash_test_modules
(
test_fleet_launch_cloud START_BASH test_fleet_launch_cloud.sh ENVS PADDLE_BINARY_DIR=
${
PADDLE_BINARY_DIR
}
)
bash_test_modules
(
test_fleet_launch_ascend START_BASH test_fleet_launch_ascend.sh ENVS PADDLE_BINARY_DIR=
${
PADDLE_BINARY_DIR
}
)
bash_test_modules
(
test_fleet_launch_nproc START_BASH test_fleet_launch_nproc.sh ENVS PADDLE_BINARY_DIR=
${
PADDLE_BINARY_DIR
}
)
bash_test_modules
(
test_ascend_group START_BASH test_ascend_group.sh ENVS PADDLE_BINARY_DIR=
${
PADDLE_BINARY_DIR
}
)
# port range (20000, 23000) is reserved for dist-ops
set
(
dist_ut_port 20001
)
...
...
python/paddle/fluid/tests/unittests/ascend_group.py
0 → 100644
浏览文件 @
7158061a
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
sys
import
time
import
paddle.fluid
as
fluid
from
paddle.fluid
import
unique_name
import
paddle.fluid.core
as
core
import
paddle
from
paddle.fluid.layer_helper
import
LayerHelper
from
paddle.distributed
import
fleet
paddle
.
enable_static
()
OpRole
=
core
.
op_proto_and_checker_maker
.
OpRole
OP_ROLE_KEY
=
core
.
op_proto_and_checker_maker
.
kOpRoleAttrName
()
OP_ROLE_VAR_KEY
=
core
.
op_proto_and_checker_maker
.
kOpRoleVarAttrName
()
role
=
fleet
.
PaddleCloudRoleMaker
(
is_collective
=
True
)
fleet
.
init
(
role
)
def
init_communicator
(
startup_program
,
main_program
,
current_endpoint
,
endpoints
,
ring_id
):
nranks
=
len
(
endpoints
)
other_endpoints
=
endpoints
[:]
other_endpoints
.
remove
(
current_endpoint
)
group_rank
=
endpoints
.
index
(
current_endpoint
)
assert
group_rank
>=
0
block
=
startup_program
.
global_block
()
nccl_id_var
=
block
.
create_var
(
name
=
unique_name
.
generate
(
'nccl_id'
),
persistable
=
True
,
type
=
core
.
VarDesc
.
VarType
.
RAW
)
block
.
append_op
(
type
=
'c_gen_nccl_id'
,
inputs
=
{},
outputs
=
{
'Out'
:
nccl_id_var
},
attrs
=
{
'rank'
:
group_rank
,
'endpoint'
:
current_endpoint
,
'other_endpoints'
:
other_endpoints
,
OP_ROLE_KEY
:
OpRole
.
Forward
,
})
block
.
append_op
(
type
=
'c_comm_init'
,
inputs
=
{
'X'
:
nccl_id_var
},
outputs
=
{},
attrs
=
{
'nranks'
:
nranks
,
'rank'
:
group_rank
,
'ring_id'
:
ring_id
,
OP_ROLE_KEY
:
OpRole
.
Forward
,
})
block
.
create_var
(
name
=
"data"
,
persistable
=
True
,
dtype
=
'float32'
)
with
fluid
.
program_guard
(
main_program
):
op_type
=
"c_allreduce_sum"
data
=
fluid
.
layers
.
fill_constant
(
shape
=
[
1
],
dtype
=
'float32'
,
value
=
2.5
)
helper
=
LayerHelper
(
op_type
,
**
locals
())
helper
.
append_op
(
type
=
op_type
,
inputs
=
{
'X'
:
[
data
]},
outputs
=
{
'Out'
:
[
data
]},
attrs
=
{
'ring_id'
:
ring_id
,
'use_calc_stream'
:
True
})
def
train
(
world_endpoints
,
world_device_ids
,
local_device_ids
,
local_rank
):
startup_programs
=
[]
main_programs
=
[]
#trainer_endpoints=["127.0.0.1:6071","127.0.0.1:6072","127.0.0.1:6073","127.0.0.1:6074"]
trainer_endpoints
=
world_endpoints
groups
=
[[],
[],
[]]
groups
[
0
]
=
[
trainer_endpoints
[
0
],
trainer_endpoints
[
1
]]
groups
[
1
]
=
[
trainer_endpoints
[
2
],
trainer_endpoints
[
3
]]
groups
[
2
]
=
[
trainer_endpoints
[
0
],
trainer_endpoints
[
2
]]
for
i
in
range
(
len
(
trainer_endpoints
)):
startup_programs
.
append
(
fluid
.
Program
())
main_programs
.
append
(
fluid
.
Program
())
for
idx
,
group
in
enumerate
(
groups
):
for
te
in
group
:
te_idx
=
trainer_endpoints
.
index
(
te
)
startup_program
=
startup_programs
[
te_idx
]
main_program
=
main_programs
[
te_idx
]
init_communicator
(
startup_program
,
main_program
,
te
,
group
,
idx
)
print
(
len
(
startup_programs
))
print
(
startup_programs
[
local_rank
])
print
(
main_programs
[
local_rank
])
worker_endpoints
=
fleet
.
worker_endpoints
()
world_device_ids
=
fleet
.
world_device_ids
()
local_device_ids
=
fleet
.
local_device_ids
()
local_rank
=
int
(
fleet
.
local_rank
())
print
(
"worker_endpoints:"
,
worker_endpoints
)
print
(
"world_device_ids:"
,
world_device_ids
)
print
(
"local_device_ids:"
,
local_device_ids
)
print
(
"local_rank:"
,
local_rank
)
train
(
worker_endpoints
,
world_device_ids
,
local_device_ids
,
local_rank
)
python/paddle/fluid/tests/unittests/test_ascend_group.sh
0 → 100644
浏览文件 @
7158061a
#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set
-e
cluster_node_ips
=
"127.0.0.1"
export
PADDLE_TRAINERS_NUM
=
4
export
POD_IP
=
127.0.0.1
export
PADDLE_TRAINERS
=
127.0.0.1
export
PADDLE_TRAINER_ID
=
0
export
PADDLE_PORT
=
35789
export
TRAINER_PORTS_NUM
=
4
distributed_args
=
"--ips=
${
cluster_node_ips
}
--ascend_npus=0,1,2,3 --log_dir=testlog"
python
-m
paddle.distributed.fleet.launch
${
distributed_args
}
\
ascend_group.py fleetascendgroup
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录