Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
7eb121df
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
7eb121df
编写于
12月 14, 2021
作者:
Y
Yuang Liu
提交者:
GitHub
12月 14, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[fleet_executor] Take task node from python side (#38083)
上级
f5b1fd7c
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
31 addition
and
6 deletion
+31
-6
paddle/fluid/distributed/fleet_executor/fleet_executor.cc
paddle/fluid/distributed/fleet_executor/fleet_executor.cc
+16
-4
paddle/fluid/distributed/fleet_executor/fleet_executor.h
paddle/fluid/distributed/fleet_executor/fleet_executor.h
+4
-1
paddle/fluid/distributed/fleet_executor/runtime_graph.h
paddle/fluid/distributed/fleet_executor/runtime_graph.h
+8
-0
python/paddle/fluid/executor.py
python/paddle/fluid/executor.py
+3
-1
未找到文件。
paddle/fluid/distributed/fleet_executor/fleet_executor.cc
浏览文件 @
7eb121df
...
...
@@ -33,10 +33,22 @@ FleetExecutor::FleetExecutor(const std::string& exe_desc_str) {
FleetExecutor
::~
FleetExecutor
()
{
root_scope_
->
DropKids
();
}
void
FleetExecutor
::
Init
(
const
framework
::
ProgramDesc
&
program_desc
,
framework
::
Scope
*
scope
,
const
platform
::
Place
&
place
)
{
runtime_graph_
=
std
::
make_shared
<
RuntimeGraph
>
(
program_desc
,
exe_desc_
);
void
FleetExecutor
::
Init
(
const
framework
::
ProgramDesc
&
program_desc
,
framework
::
Scope
*
scope
,
const
platform
::
Place
&
place
,
const
std
::
vector
<
TaskNode
*>&
task_nodes
,
const
std
::
unordered_map
<
int64_t
,
int64_t
>&
task_id_to_rank
)
{
if
(
task_nodes
.
size
()
==
0
)
{
runtime_graph_
=
std
::
make_shared
<
RuntimeGraph
>
(
program_desc
,
exe_desc_
);
}
else
{
runtime_graph_
=
std
::
make_shared
<
RuntimeGraph
>
();
std
::
unordered_map
<
int64_t
,
TaskNode
*>
interceptor_id_to_task
;
for
(
auto
task_node
:
task_nodes
)
{
int64_t
interceptor_id
=
task_node
->
task_id
();
interceptor_id_to_task
.
emplace
(
interceptor_id
,
task_node
);
}
runtime_graph_
->
SetInterceptorIdToRank
(
task_id_to_rank
);
runtime_graph_
->
SetInterceptorIdToNode
(
interceptor_id_to_task
);
}
root_scope_
=
scope
;
place_
=
place
;
PADDLE_ENFORCE_NOT_NULL
(
root_scope_
,
platform
::
errors
::
InvalidArgument
(
...
...
paddle/fluid/distributed/fleet_executor/fleet_executor.h
浏览文件 @
7eb121df
...
...
@@ -30,6 +30,7 @@ namespace distributed {
class
RuntimeGraph
;
class
Carrier
;
class
MessageBus
;
class
TaskNode
;
class
FleetExecutor
final
{
public:
...
...
@@ -37,7 +38,9 @@ class FleetExecutor final {
explicit
FleetExecutor
(
const
std
::
string
&
exe_desc_str
);
~
FleetExecutor
();
void
Init
(
const
framework
::
ProgramDesc
&
program_desc
,
framework
::
Scope
*
scope
,
const
platform
::
Place
&
place
);
const
platform
::
Place
&
place
,
const
std
::
vector
<
TaskNode
*>&
task_nodes
,
const
std
::
unordered_map
<
int64_t
,
int64_t
>&
task_id_to_rank
);
void
Run
();
private:
...
...
paddle/fluid/distributed/fleet_executor/runtime_graph.h
浏览文件 @
7eb121df
...
...
@@ -44,6 +44,14 @@ class RuntimeGraph final {
const
std
::
unordered_map
<
int64_t
,
int64_t
>&
intercepter_id_to_rank
()
const
{
return
intercepter_id_to_rank_
;
}
void
SetInterceptorIdToRank
(
const
std
::
unordered_map
<
int64_t
,
int64_t
>&
intercepter_id_to_rank
)
{
intercepter_id_to_rank_
=
intercepter_id_to_rank
;
}
void
SetInterceptorIdToNode
(
const
std
::
unordered_map
<
int64_t
,
TaskNode
*>&
intercepter_id_to_node
)
{
intercepter_id_to_node_
=
intercepter_id_to_node
;
}
std
::
string
DebugString
()
const
;
private:
...
...
python/paddle/fluid/executor.py
浏览文件 @
7eb121df
...
...
@@ -1979,10 +1979,12 @@ class Executor(object):
fleet_exe_desc
.
num_micro_batches
=
fleet_opt
[
"num_micro_batches"
]
num_of_gpu
=
fleet_exe_desc
.
dp_degree
*
fleet_exe_desc
.
mp_degree
*
fleet_exe_desc
.
pp_degree
assert
nrank
==
num_of_gpu
,
"The number of rank is not equal to the number of gpu."
task_id_to_rank
=
fleet_opt
.
get
(
"task_id_to_rank"
,
{})
tasks
=
fleet_opt
.
get
(
"tasks"
,
[])
fleet_exe
=
core
.
FleetExecutor
(
fleet_exe_desc
.
SerializeToString
())
place
=
core
.
Place
()
place
.
set_place
(
self
.
place
)
fleet_exe
.
init
(
program
.
desc
,
scope
,
place
)
fleet_exe
.
init
(
program
.
desc
,
scope
,
place
,
tasks
,
task_id_to_rank
)
return
fleet_exe
def
_run_using_fleet_executor
(
self
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录