机器未来 / Paddle
Forked from PaddlePaddle / Paddle
Commit ad2bc0c3
Authored Jan 21, 2020 by gongweibao
Committed by Tao Luo, Jan 21, 2020
Fix a distribution bug and cleanup some not need logs. (#22381)
Parent: 7b0692a6
Showing 3 changed files with 20 additions and 7 deletions (+20 -7)
python/paddle/distributed/launch.py (+10 -0)
python/paddle/fluid/tests/unittests/CMakeLists.txt (+1 -3)
python/paddle/fluid/tests/unittests/test_launch.sh (+9 -4)
python/paddle/distributed/launch.py

@@ -202,6 +202,16 @@ paddlecloud environment.".format(args.cluster_node_ips, node_ips))
     ]
     selected_gpus_num = len(selected_gpus)
 
+    if args.use_paddlecloud and num_nodes > 1:
+        cloud_paddle_port = os.getenv("PADDLE_PORT", "")
+        cloud_paddle_port_num = os.getenv("PADDLE_PORTS_NUM", "")
+        if cloud_paddle_port != "" and cloud_paddle_port_num != "":
+            cloud_paddle_port_num = int(cloud_paddle_port_num)
+            if cloud_paddle_port_num >= selected_gpus_num:
+                args.started_port = int(cloud_paddle_port)
+                logger.warning("Use Cloud specified port:{}.".format(
+                    cloud_paddle_port))
+
     trainers_endpoints = ""
     for ip in node_ips:
         for i in range(selected_gpus_num):
...
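The added block makes the launcher honor ports pre-allocated by PaddleCloud: when running on the cloud across more than one node, and PADDLE_PORT / PADDLE_PORTS_NUM advertise at least one port per selected GPU, args.started_port switches to the cloud-specified port. A minimal sketch of that decision as a standalone function (resolve_started_port is a hypothetical name, not part of launch.py):

import os

def resolve_started_port(default_port, selected_gpus_num, use_paddlecloud, num_nodes):
    # Mirrors the rule added above: prefer the PaddleCloud-provided port
    # when the cloud reserved at least one port per selected GPU.
    cloud_port = os.getenv("PADDLE_PORT", "")
    cloud_port_num = os.getenv("PADDLE_PORTS_NUM", "")
    if use_paddlecloud and num_nodes > 1:
        if cloud_port != "" and cloud_port_num != "":
            if int(cloud_port_num) >= selected_gpus_num:
                return int(cloud_port)  # cloud-specified starting port
    return default_port  # otherwise keep the launcher's default

# With PADDLE_PORT=35019 and PADDLE_PORTS_NUM=2, two GPUs per node and two
# nodes, the started port becomes 35019, so each node serves 35019 and 35020.
os.environ["PADDLE_PORT"] = "35019"
os.environ["PADDLE_PORTS_NUM"] = "2"
assert resolve_started_port(6170, 2, True, 2) == 35019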
python/paddle/fluid/tests/unittests/CMakeLists.txt

@@ -7,13 +7,13 @@ file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py")
 string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}")
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)
+list(APPEND DIST_TEST_OPS test_listen_and_serv_op)
 set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
 #remove distribute unittests.
 list(APPEND MIXED_DIST_TEST_OPS test_dgc_op)
 list(APPEND MIXED_DIST_TEST_OPS test_dgc_momentum_op)
 list(APPEND MIXED_DIST_TEST_OPS test_dgc_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_simple_dist_transpiler)
-list(APPEND MIXED_DIST_TEST_OPS test_listen_and_serv_op)
 list(APPEND MIXED_DIST_TEST_OPS test_nce_remote_table_op)
 list(APPEND MIXED_DIST_TEST_OPS test_recv_save_op)
 list(APPEND MIXED_DIST_TEST_OPS test_transpiler_ops)
...

@@ -288,7 +288,6 @@ if(WITH_DISTRIBUTE)
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_se_resnext_dgc")
   endif()
   if(NOT APPLE)
-    bash_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv.sh)
     if(WITH_GPU)
       # NOTE. test_launch only work in gpu collective mode
       bash_test_modules(test_launch MODULES test_launch.sh)
...

@@ -297,7 +296,6 @@ if(WITH_DISTRIBUTE)
     set(dist_ut_port 1000)
     foreach(TEST_OP ${DIST_TEST_OPS})
-      message(STATUS "set dist_ut_port=${dist_ut_port} on ${TEST_OP}")
       bash_test_modules(${TEST_OP} MODULES dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}")
       MATH(EXPR dist_ut_port "${dist_ut_port}+50")
     endforeach(TEST_OP)
...
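Besides deleting the per-test message() log, this moves test_listen_and_serv_op from a one-off bash_test_modules() call into DIST_TEST_OPS, so it now runs through the dist_test.sh loop and, like every other distributed test, gets its own PADDLE_DIST_UT_PORT, spaced 50 ports apart from a base of 1000. An illustrative Python rendering of that assignment scheme (assign_dist_ut_ports is not project code):

def assign_dist_ut_ports(test_ops, base_port=1000, stride=50):
    # Each distributed test gets a private port window, mirroring
    # MATH(EXPR dist_ut_port "${dist_ut_port}+50") in the loop above.
    ports = {}
    for test_op in test_ops:
        ports[test_op] = base_port  # exported as PADDLE_DIST_UT_PORT
        base_port += stride
    return ports

print(assign_dist_ut_ports(["test_listen_and_serv_op", "test_dist_mnist"]))
# {'test_listen_and_serv_op': 1000, 'test_dist_mnist': 1050}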
python/paddle/fluid/tests/unittests/test_launch.sh

@@ -11,12 +11,14 @@ export POD_IP=127.0.0.1
 export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
 export PADDLE_TRAINER_ID=0
+export PADDLE_PORT=35019
+export PADDLE_PORTS_NUM=2
 distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
 CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} multi_process.py
-str1="selected_gpus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0"
-str2="selected_gpus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6171 trainer_id:1"
+str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
+str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
 file_0="multi_process.check_0.log"
 file_1="multi_process.check_1.log"
...

@@ -43,6 +45,9 @@ if [ -f $file_1 ]; then
     rm $file_1
 fi
 
+unset PADDLE_PORT
+unset PADDLE_PORTS_NUM
+
 echo ""
 echo "paddle.distributed.launch async poll process test"
 if ! CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch ${distributed_args} multi_process.py abort; then
...
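With PADDLE_PORT=35019 and PADDLE_PORTS_NUM=2 exported, the launcher starts from port 35019 instead of the default 6170, so the expected strings change accordingly; the new unset lines restore the defaults before the async poll test. A short sketch of how the expected worker_endpoints follow from the environment (build_worker_endpoints is a hypothetical helper, not test code):

def build_worker_endpoints(node_ips, started_port, gpus_per_node):
    # Ports run consecutively from the started port; endpoints are
    # enumerated node by node, matching str1/str2 above.
    return [
        "{}:{}".format(ip, started_port + i)
        for ip in node_ips
        for i in range(gpus_per_node)
    ]

endpoints = ",".join(build_worker_endpoints(["127.0.0.1", "127.0.0.2"], 35019, 2))
assert endpoints == "127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020"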