Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
2c7870e0
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
2c7870e0
编写于
11月 12, 2021
作者:
Y
Yuang Liu
提交者:
GitHub
11月 12, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[fleet_executor] handle empty addr for single card train (#37150)
上级
742378f4
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
72 addition
and
21 deletion
+72
-21
paddle/fluid/distributed/fleet_executor/fleet_executor.cc
paddle/fluid/distributed/fleet_executor/fleet_executor.cc
+15
-9
paddle/fluid/distributed/fleet_executor/message_bus.cc
paddle/fluid/distributed/fleet_executor/message_bus.cc
+8
-0
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+1
-0
python/paddle/fluid/tests/unittests/test_fleet_executor.py
python/paddle/fluid/tests/unittests/test_fleet_executor.py
+1
-12
python/paddle/fluid/tests/unittests/test_fleet_executor_multi_devices.py
...luid/tests/unittests/test_fleet_executor_multi_devices.py
+47
-0
未找到文件。
paddle/fluid/distributed/fleet_executor/fleet_executor.cc
浏览文件 @
2c7870e0
...
@@ -43,24 +43,30 @@ void FleetExecutor::InitMessageBus() {
...
@@ -43,24 +43,30 @@ void FleetExecutor::InitMessageBus() {
std
::
unordered_map
<
int64_t
,
std
::
string
>
rank_to_addr
;
std
::
unordered_map
<
int64_t
,
std
::
string
>
rank_to_addr
;
std
::
string
addr
;
std
::
string
addr
;
for
(
const
auto
&
rank_info
:
exe_desc_
.
cluster_info
())
{
for
(
const
auto
&
rank_info
:
exe_desc_
.
cluster_info
())
{
// init the dns map
int64_t
rank
=
rank_info
.
rank
();
int64_t
rank
=
rank_info
.
rank
();
std
::
string
ip_port
=
rank_info
.
ip_port
();
std
::
string
ip_port
=
rank_info
.
ip_port
();
ss
<<
rank
<<
"
\t
->
\t
"
<<
ip_port
<<
"
\n
"
;
ss
<<
rank
<<
"
\t
->
\t
"
<<
ip_port
<<
"
\n
"
;
// TODO(Yuang):
replace the first 'rank' with real interceptor id
// TODO(Yuang):
init interceptor_id_to_rank out of this loop
interceptor_id_to_rank
.
insert
(
std
::
make_pair
(
rank
,
rank
));
interceptor_id_to_rank
.
insert
(
std
::
make_pair
(
rank
,
rank
));
rank_to_addr
.
insert
(
std
::
make_pair
(
rank
,
ip_port
));
rank_to_addr
.
insert
(
std
::
make_pair
(
rank
,
ip_port
));
if
(
rank
==
cur_rank
)
{
if
(
rank
==
cur_rank
)
{
addr
=
ip_port
;
addr
=
ip_port
;
}
}
}
}
PADDLE_ENFORCE_NE
(
if
(
addr
==
""
)
{
addr
,
""
,
PADDLE_ENFORCE_EQ
(
platform
::
errors
::
NotFound
(
rank_to_addr
.
size
(),
0
,
"Current rank is %s, which ip_port cannot be found in the config."
,
platform
::
errors
::
NotFound
(
"Empty address is not valid for "
cur_rank
));
"paddle.distributed.launch method."
));
VLOG
(
3
)
<<
"Current rank is "
<<
cur_rank
<<
" and the ip_port is "
<<
addr
PADDLE_ENFORCE_EQ
(
<<
"."
;
cur_rank
,
0
,
VLOG
(
3
)
<<
"The number of ranks are "
<<
interceptor_id_to_rank
.
size
()
<<
"."
;
platform
::
errors
::
NotFound
(
"Address is empty but cur rank is not 0."
));
}
VLOG
(
3
)
<<
"Current rank is "
<<
cur_rank
<<
" and the ip_port is "
<<
(
addr
==
""
?
"empty"
:
addr
)
<<
"."
;
VLOG
(
3
)
<<
"The number of ranks are "
<<
(
rank_to_addr
.
size
()
==
0
?
1
:
rank_to_addr
.
size
())
<<
"."
;
VLOG
(
5
)
<<
ss
.
str
();
VLOG
(
5
)
<<
ss
.
str
();
MessageBus
&
message_bus_instance
=
MessageBus
::
Instance
();
MessageBus
&
message_bus_instance
=
MessageBus
::
Instance
();
if
(
!
message_bus_instance
.
IsInit
())
{
if
(
!
message_bus_instance
.
IsInit
())
{
...
...
paddle/fluid/distributed/fleet_executor/message_bus.cc
浏览文件 @
2c7870e0
...
@@ -85,6 +85,10 @@ bool MessageBus::Send(const InterceptorMessage& interceptor_message) {
...
@@ -85,6 +85,10 @@ bool MessageBus::Send(const InterceptorMessage& interceptor_message) {
}
}
void
MessageBus
::
ListenPort
()
{
void
MessageBus
::
ListenPort
()
{
if
(
addr_
==
""
)
{
VLOG
(
3
)
<<
"No need listen to port since training on single card."
;
return
;
}
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \
!defined(PADDLE_WITH_ASCEND_CL)
!defined(PADDLE_WITH_ASCEND_CL)
// function keep listen the port and handle the message
// function keep listen the port and handle the message
...
@@ -121,6 +125,10 @@ bool MessageBus::IsSameRank(int64_t src_id, int64_t dst_id) {
...
@@ -121,6 +125,10 @@ bool MessageBus::IsSameRank(int64_t src_id, int64_t dst_id) {
dst_rank
,
interceptor_id_to_rank_
.
end
(),
dst_rank
,
interceptor_id_to_rank_
.
end
(),
platform
::
errors
::
NotFound
(
platform
::
errors
::
NotFound
(
"Cannot find rank for dst interceptor id %lld. Init error."
,
dst_id
));
"Cannot find rank for dst interceptor id %lld. Init error."
,
dst_id
));
if
(
addr_
==
""
)
{
// single card training, must be same rank
return
true
;
}
const
auto
&
src_ip
=
rank_to_addr_
.
find
(
src_rank
->
second
);
const
auto
&
src_ip
=
rank_to_addr_
.
find
(
src_rank
->
second
);
PADDLE_ENFORCE_NE
(
src_ip
,
rank_to_addr_
.
end
(),
PADDLE_ENFORCE_NE
(
src_ip
,
rank_to_addr_
.
end
(),
platform
::
errors
::
NotFound
(
platform
::
errors
::
NotFound
(
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
2c7870e0
...
@@ -142,6 +142,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32)
...
@@ -142,6 +142,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32)
LIST
(
REMOVE_ITEM TEST_OPS test_fleet_gradient_scale
)
LIST
(
REMOVE_ITEM TEST_OPS test_fleet_gradient_scale
)
LIST
(
REMOVE_ITEM TEST_OPS test_disable_signal_handler
)
LIST
(
REMOVE_ITEM TEST_OPS test_disable_signal_handler
)
LIST
(
REMOVE_ITEM TEST_OPS test_fleet_executor
)
LIST
(
REMOVE_ITEM TEST_OPS test_fleet_executor
)
LIST
(
REMOVE_ITEM TEST_OPS test_fleet_executor_multi_devices
)
endif
()
endif
()
# Temporally disable test_deprecated_decorator
# Temporally disable test_deprecated_decorator
...
...
python/paddle/fluid/tests/unittests/test_fleet_executor.py
浏览文件 @
2c7870e0
...
@@ -13,7 +13,6 @@
...
@@ -13,7 +13,6 @@
# limitations under the License.
# limitations under the License.
import
unittest
import
unittest
import
os
import
paddle
import
paddle
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
...
@@ -32,17 +31,7 @@ class TestFleetExecutor(unittest.TestCase):
...
@@ -32,17 +31,7 @@ class TestFleetExecutor(unittest.TestCase):
}
}
exe
.
run
(
empty_program
,
feed
=
{
'x'
:
[
1
]})
exe
.
run
(
empty_program
,
feed
=
{
'x'
:
[
1
]})
def
test_executor_on_multi_devices
(
self
):
def
test_executor_on_single_device
(
self
):
places
=
[
fluid
.
CPUPlace
()]
if
fluid
.
is_compiled_with_cuda
():
places
.
append
(
fluid
.
CUDAPlace
(
0
))
for
place
in
places
:
self
.
run_fleet_executor
(
place
)
def
test_dist_executor_on_multi_devices
(
self
):
os
.
environ
[
"PADDLE_TRAINER_ID"
]
=
"0"
os
.
environ
[
"PADDLE_TRAINER_ENDPOINTS"
]
=
"127.0.0.1:7000,127.0.0.1:7001,127.0.0.1:7002"
places
=
[
fluid
.
CPUPlace
()]
places
=
[
fluid
.
CPUPlace
()]
if
fluid
.
is_compiled_with_cuda
():
if
fluid
.
is_compiled_with_cuda
():
places
.
append
(
fluid
.
CUDAPlace
(
0
))
places
.
append
(
fluid
.
CUDAPlace
(
0
))
...
...
python/paddle/fluid/tests/unittests/test_fleet_executor_multi_devices.py
0 → 100644
浏览文件 @
2c7870e0
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
os
import
paddle
import
paddle.fluid
as
fluid
paddle
.
enable_static
()
class
TestFleetExecutor
(
unittest
.
TestCase
):
def
run_fleet_executor
(
self
,
place
):
exe
=
paddle
.
static
.
Executor
(
place
)
empty_program
=
paddle
.
static
.
Program
()
with
fluid
.
program_guard
(
empty_program
,
empty_program
):
x
=
fluid
.
layers
.
data
(
name
=
'x'
,
shape
=
[
1
],
dtype
=
paddle
.
float32
)
empty_program
.
_pipeline_opt
=
{
"fleet_opt"
:
True
,
"section_program"
:
empty_program
}
exe
.
run
(
empty_program
,
feed
=
{
'x'
:
[
1
]})
def
test_dist_executor_on_multi_devices
(
self
):
os
.
environ
[
"PADDLE_TRAINER_ID"
]
=
"0"
os
.
environ
[
"PADDLE_TRAINER_ENDPOINTS"
]
=
"127.0.0.1:7000,127.0.0.1:7001,127.0.0.1:7002"
places
=
[
fluid
.
CPUPlace
()]
if
fluid
.
is_compiled_with_cuda
():
places
.
append
(
fluid
.
CUDAPlace
(
0
))
for
place
in
places
:
self
.
run_fleet_executor
(
place
)
if
__name__
==
"__main__"
:
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录