Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
f85bd5c9
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
f85bd5c9
编写于
11月 18, 2021
作者:
Y
Yuang Liu
提交者:
GitHub
11月 18, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[fleet_executor] Parse runtime graph to start carrier (#37282)
上级
38141036
变更
6
隐藏空白更改
内联
并排
Showing
6 changed files
with
64 additions
and
5 deletions
+64
-5
paddle/fluid/distributed/fleet_executor/carrier.cc
paddle/fluid/distributed/fleet_executor/carrier.cc
+22
-0
paddle/fluid/distributed/fleet_executor/carrier.h
paddle/fluid/distributed/fleet_executor/carrier.h
+5
-0
paddle/fluid/distributed/fleet_executor/fleet_executor.cc
paddle/fluid/distributed/fleet_executor/fleet_executor.cc
+21
-5
paddle/fluid/distributed/fleet_executor/fleet_executor.h
paddle/fluid/distributed/fleet_executor/fleet_executor.h
+1
-0
paddle/fluid/distributed/fleet_executor/interceptor.cc
paddle/fluid/distributed/fleet_executor/interceptor.cc
+10
-0
python/paddle/fluid/executor.py
python/paddle/fluid/executor.py
+5
-0
未找到文件。
paddle/fluid/distributed/fleet_executor/carrier.cc
浏览文件 @
f85bd5c9
...
...
@@ -15,6 +15,7 @@
#include "paddle/fluid/distributed/fleet_executor/carrier.h"
#include "paddle/fluid/distributed/fleet_executor/interceptor.h"
#include "paddle/fluid/distributed/fleet_executor/interceptor_message_service.h"
#include "paddle/fluid/distributed/fleet_executor/message_bus.h"
#include "paddle/fluid/distributed/fleet_executor/task_node.h"
namespace
paddle
{
...
...
@@ -22,8 +23,11 @@ namespace distributed {
// Initializes the carrier from an interceptor-id -> TaskNode* mapping.
//
// Steps, in order:
//   1. Enforce that the carrier has not been initialized yet; otherwise a
//      platform::errors::AlreadyExists error is raised.
//   2. Assign the given mapping to interceptor_id_to_node_.
//   3. Call CreateInterceptors().
//   4. Mark the carrier as initialized.
void Carrier::Init(
    const std::unordered_map<int64_t, TaskNode*>& interceptor_id_to_node) {
  PADDLE_ENFORCE_EQ(
      is_init_, false,
      platform::errors::AlreadyExists("Carrier is already init."));

  interceptor_id_to_node_ = interceptor_id_to_node;
  CreateInterceptors();

  // Flip the flag last, once the interceptors have been created.
  is_init_ = true;
}
bool
Carrier
::
EnqueueInterceptorMessage
(
...
...
@@ -63,6 +67,24 @@ Interceptor* Carrier::GetInterceptor(int64_t interceptor_id) {
return
iter
->
second
.
get
();
}
// Sends a DATA_IS_READY message to every interceptor registered in
// interceptor_idx_to_interceptor_. Each message uses the interceptor's own id
// as both source and destination and is sent through the MessageBus singleton.
//
// Raises platform::errors::PreconditionNotMet if there is at least one
// interceptor and the message bus has not been initialized.
//
// The message bus lookup and its precondition check do not depend on the loop
// variable, so they are performed once up front instead of once per
// interceptor. The empty() guard keeps the behavior for a carrier without
// interceptors unchanged: the message bus is neither fetched nor checked.
void Carrier::Start() {
  // TODO(fleet_executor dev): this start is a fake one and needs to be
  // replaced.
  if (interceptor_idx_to_interceptor_.empty()) {
    return;
  }

  MessageBus& message_bus_instance = MessageBus::Instance();
  PADDLE_ENFORCE_EQ(message_bus_instance.IsInit(), true,
                    platform::errors::PreconditionNotMet(
                        "Message bus has not been initialized."));

  for (const auto& pair : interceptor_idx_to_interceptor_) {
    VLOG(3) << "Fake run is sending start to interceptor " << pair.first
            << ".";
    InterceptorMessage tmp_msg;
    // The fake start message is addressed from the interceptor to itself.
    tmp_msg.set_src_id(pair.first);
    tmp_msg.set_dst_id(pair.first);
    tmp_msg.set_message_type(DATA_IS_READY);
    message_bus_instance.Send(tmp_msg);
  }
}
// Reports the current value of the is_init_ flag, which Init() sets to true
// once it has finished.
bool Carrier::IsInit() const { return is_init_; }
Interceptor
*
Carrier
::
SetInterceptor
(
int64_t
interceptor_id
,
std
::
unique_ptr
<
Interceptor
>
interceptor
)
{
auto
iter
=
interceptor_idx_to_interceptor_
.
find
(
interceptor_id
);
...
...
paddle/fluid/distributed/fleet_executor/carrier.h
浏览文件 @
f85bd5c9
...
...
@@ -56,6 +56,10 @@ class Carrier final {
void
SetCreatingFlag
(
bool
flag
);
void
Start
();
bool
IsInit
()
const
;
DISABLE_COPY_AND_ASSIGN
(
Carrier
);
private:
...
...
@@ -75,6 +79,7 @@ class Carrier final {
std
::
vector
<
InterceptorMessage
>
message_tmp_
{};
bool
creating_interceptors_
{
true
};
bool
is_init_
{
false
};
};
}
// namespace distributed
...
...
paddle/fluid/distributed/fleet_executor/fleet_executor.cc
浏览文件 @
f85bd5c9
...
...
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h"
#include "paddle/fluid/distributed/fleet_executor/carrier.h"
#include "paddle/fluid/distributed/fleet_executor/message_bus.h"
#include "paddle/fluid/distributed/fleet_executor/runtime_graph.h"
#include "paddle/fluid/distributed/fleet_executor/task_node.h"
...
...
@@ -34,14 +35,21 @@ FleetExecutor::~FleetExecutor() {
// Prepares the executor for running `program_desc`.
//
// Builds the RuntimeGraph from `program_desc` and exe_desc_, then initializes
// the carrier followed by the message bus. Both initializers read
// runtime_graph_, so the graph has to be created first.
void FleetExecutor::Init(const paddle::framework::ProgramDesc& program_desc) {
  runtime_graph_ = std::make_unique<RuntimeGraph>(program_desc, exe_desc_);

  InitCarrier();
  InitMessageBus();
}
void
FleetExecutor
::
InitCarrier
()
{
Carrier
&
carrier_instance
=
Carrier
::
Instance
();
if
(
!
carrier_instance
.
IsInit
())
{
carrier_instance
.
Init
(
runtime_graph_
->
intercepter_id_to_node
());
}
}
void
FleetExecutor
::
InitMessageBus
()
{
std
::
stringstream
ss
;
ss
<<
"
\n
The DNS table of the message bus is:
\n
"
;
int64_t
cur_rank
=
exe_desc_
.
cur_rank
();
std
::
unordered_map
<
int64_t
,
int64_t
>
interceptor_id_to_rank
;
std
::
unordered_map
<
int64_t
,
std
::
string
>
rank_to_addr
;
std
::
string
addr
;
for
(
const
auto
&
rank_info
:
exe_desc_
.
cluster_info
())
{
...
...
@@ -49,8 +57,6 @@ void FleetExecutor::InitMessageBus() {
int64_t
rank
=
rank_info
.
rank
();
std
::
string
ip_port
=
rank_info
.
ip_port
();
ss
<<
rank
<<
"
\t
->
\t
"
<<
ip_port
<<
"
\n
"
;
// TODO(Yuang): init interceptor_id_to_rank out of this loop
interceptor_id_to_rank
.
insert
(
std
::
make_pair
(
rank
,
rank
));
rank_to_addr
.
insert
(
std
::
make_pair
(
rank
,
ip_port
));
if
(
rank
==
cur_rank
)
{
addr
=
ip_port
;
...
...
@@ -58,7 +64,7 @@ void FleetExecutor::InitMessageBus() {
}
if
(
addr
==
""
)
{
PADDLE_ENFORCE_EQ
(
rank_to_addr
.
size
(),
0
,
rank_to_addr
.
size
(),
1
,
platform
::
errors
::
NotFound
(
"Empty address is not valid for "
"paddle.distributed.launch method."
));
PADDLE_ENFORCE_EQ
(
...
...
@@ -72,12 +78,22 @@ void FleetExecutor::InitMessageBus() {
VLOG
(
5
)
<<
ss
.
str
();
MessageBus
&
message_bus_instance
=
MessageBus
::
Instance
();
if
(
!
message_bus_instance
.
IsInit
())
{
message_bus_instance
.
Init
(
interceptor_id_to_rank
,
rank_to_addr
,
addr
);
message_bus_instance
.
Init
(
runtime_graph_
->
intercepter_id_to_rank
(),
rank_to_addr
,
addr
);
}
}
// Starts execution through the Carrier singleton.
//
// Both the Carrier and the MessageBus singletons must already be initialized;
// otherwise a platform::errors::Unavailable error is raised (the carrier is
// checked first). When both checks pass, Carrier::Start() is called.
void FleetExecutor::Run() {
  Carrier& carrier_instance = Carrier::Instance();
  MessageBus& message_bus_instance = MessageBus::Instance();

  PADDLE_ENFORCE_EQ(
      carrier_instance.IsInit(), true,
      platform::errors::Unavailable("Carrier has not been init yet."));
  PADDLE_ENFORCE_EQ(
      message_bus_instance.IsInit(), true,
      platform::errors::Unavailable("MessageBus has not been init yet."));

  carrier_instance.Start();
}
void
FleetExecutor
::
Release
()
{
...
...
paddle/fluid/distributed/fleet_executor/fleet_executor.h
浏览文件 @
f85bd5c9
...
...
@@ -43,6 +43,7 @@ class FleetExecutor final {
FleetExecutorDesc
exe_desc_
;
std
::
unique_ptr
<
RuntimeGraph
>
runtime_graph_
;
void
InitMessageBus
();
void
InitCarrier
();
};
}
// namespace distributed
...
...
paddle/fluid/distributed/fleet_executor/interceptor.cc
浏览文件 @
f85bd5c9
...
...
@@ -33,6 +33,16 @@ void Interceptor::RegisterMsgHandle(MsgHandle handle) { handle_ = handle; }
void
Interceptor
::
Handle
(
const
InterceptorMessage
&
msg
)
{
if
(
handle_
)
{
handle_
(
msg
);
}
else
{
VLOG
(
3
)
<<
"Interceptor is using default message handler. This handler is "
"only used for test purpose. Check whether you init interceptor "
"in the proper way."
;
if
(
msg
.
message_type
()
==
DATA_IS_READY
)
{
VLOG
(
3
)
<<
"Fake handler is sending stop message to it self."
;
InterceptorMessage
msg
;
msg
.
set_message_type
(
STOP
);
Send
(
interceptor_id_
,
msg
);
}
}
}
...
...
python/paddle/fluid/executor.py
浏览文件 @
f85bd5c9
...
...
@@ -1958,6 +1958,11 @@ class Executor(object):
fleet_exe_desc
.
cluster_info
.
append
(
rank_info
)
nrank
=
len
(
trainer_endpoints
)
else
:
fleet_exe_desc
.
cur_rank
=
0
rank_info
=
fleet_executor_desc_pb2
.
RankInfo
()
rank_info
.
rank
=
0
rank_info
.
ip_port
=
''
fleet_exe_desc
.
cluster_info
.
append
(
rank_info
)
logging
.
warning
(
"Fleet Executor will run on single device only."
)
fleet_opt
=
program
.
_pipeline_opt
[
"fleet_opt"
]
if
"dist_strategy"
in
fleet_opt
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录