PaddlePaddle / Paddle
Commit 2f188341 (unverified)
Authored by Yuang Liu on Dec 20, 2021; committed via GitHub on Dec 20, 2021
[fleet_executor] Remove runtime graph, all scheduler on python side (#38261)
Parent: 8c9c81cc

Showing 10 changed files with 165 additions and 366 deletions (+165 -366)
    paddle/fluid/distributed/fleet_executor/carrier.cc                             +6   -7
    paddle/fluid/distributed/fleet_executor/fleet_executor.cc                       +21  -24
    paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto               +1   -5
    paddle/fluid/distributed/fleet_executor/runtime_graph.cc                        +2   -287
    paddle/fluid/distributed/fleet_executor/runtime_graph.h                         +0   -19
    python/paddle/distributed/fleet/fleet_executor_utils.py                         +18  -1
    python/paddle/fluid/executor.py                                                 +28  -22
    python/paddle/fluid/tests/unittests/CMakeLists.txt                              +1   -0
    python/paddle/fluid/tests/unittests/test_fleet_executor.py                      +1   -1
    python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py     +87  -0
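With this commit the task graph is always assembled on the Python side and handed to the C++ FleetExecutor; callers pick the scheduler through the `scheduler` key of `fleet_opt` instead of the old `python_side` key. Below is a minimal sketch of the new entry point, modeled on the unit tests touched in this commit; the model program itself is elided and assumed, and running it still requires a launched fleet job.

    import paddle
    import paddle.distributed.fleet as fleet

    paddle.enable_static()

    strategy = fleet.DistributedStrategy()
    strategy.sharding_configs = {"dp_degree": 1, "mp_degree": 1, "pp_degree": 1}
    strategy.pipeline_configs = {"accumulate_steps": 1}

    # 'scheduler' replaces the old 'python_side' key; the diff accepts '1F1B' or 'Origin'.
    fleet_opt = {
        "dist_strategy": strategy.sharding_configs,
        "num_micro_batches": strategy.pipeline_configs["accumulate_steps"],
        "scheduler": "1F1B",
    }

    program = paddle.static.Program()  # build the real model inside this program
    # Per the tests in this commit: section_program will be removed in the future.
    program._pipeline_opt = {"fleet_opt": fleet_opt, "section_program": program}
    # exe.run(program, ...) then routes through the Python-side scheduler in executor.py.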
paddle/fluid/distributed/fleet_executor/carrier.cc
@@ -240,13 +240,12 @@ void Carrier::CreateInterceptors() {
                       task_node->run_at_offset(), task_node->run_per_steps()));
     std::unique_ptr<Interceptor> interceptor;
-    if (task_node->type().empty()) {
-      // TODO(wangxi): delete this in future
-      interceptor.reset(new Interceptor(interceptor_id, task_node));
-    } else {
-      interceptor = InterceptorFactory::Create(task_node->type(),
-                                               interceptor_id, task_node);
-    }
+    PADDLE_ENFORCE_NE(task_node->type().empty(), true,
+                      platform::errors::NotFound(
+                          "Cannot found type for task node with id %lld",
+                          task_node->task_id()));
+    interceptor = InterceptorFactory::Create(task_node->type(),
+                                             interceptor_id, task_node);
     interceptor->SetPlace(place_);
     interceptor->SetMiniBatchScope(minibatch_scope_);
     interceptor->SetMicroBatchScope(microbatch_scopes_);
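The practical effect of this hunk is that a task node with an empty `type()` is rejected instead of silently falling back to the base `Interceptor`, so every node built on the Python side must name its interceptor type before `FleetExecutor::Init` runs. A hedged sketch of what that looks like from Python, following the `origin` scheduler added later in this diff (the empty `Program` and rank 0 are stand-ins):

    import paddle
    from paddle.fluid import core

    paddle.enable_static()
    program = paddle.static.Program()  # stand-in; a real model program in practice
    cur_rank = 0

    # Arguments mirror TaskNode(program, cur_rank, max_run_times, max_slot_nums) in this commit.
    task_node = core.TaskNode(program.desc, cur_rank, 1, 1)
    task_node.set_type("Compute")  # an empty type now fails Carrier::CreateInterceptors with NotFound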
paddle/fluid/distributed/fleet_executor/fleet_executor.cc
@@ -48,32 +48,29 @@ void FleetExecutor::Init(
     const framework::ProgramDesc& program_desc, framework::Scope* scope,
     const platform::Place& place, const std::vector<TaskNode*>& task_nodes,
     const std::unordered_map<int64_t, int64_t>& task_id_to_rank) {
-  if (task_nodes.size() == 0) {
-    LOG(INFO) << "fleet executor will use c++ side scheduler construction.";
-    runtime_graph_ = std::make_shared<RuntimeGraph>(program_desc, exe_desc_);
-  } else {
-    LOG(INFO) << "fleet executor has been set dependency on python side.";
-    // TODO(fleet_exe devs): the unused_vars should be got from run time graph
-    std::vector<std::unique_ptr<framework::OperatorBase>> ops;
-    for (auto task_node : task_nodes) {
-      for (auto op : task_node->ops()) {
-        ops.emplace_back(std::unique_ptr<framework::OperatorBase>(op));
-      }
-    }
-    auto unused_vars = framework::GetUnusedVars(program_desc.Block(0), ops, {});
-    runtime_graph_ = std::make_shared<RuntimeGraph>();
-    std::unordered_map<int64_t, TaskNode*> interceptor_id_to_task;
-    for (auto task_node : task_nodes) {
-      task_node->SetUnusedVars(unused_vars);
-      int64_t interceptor_id = task_node->task_id();
-      interceptor_id_to_task.emplace(interceptor_id, task_node);
-    }
-    runtime_graph_->SetInterceptorIdToRank(task_id_to_rank);
-    runtime_graph_->SetInterceptorIdToNode(interceptor_id_to_task);
-    for (auto& unique_op : ops) {
-      unique_op.release();
-    }
-  }
+  PADDLE_ENFORCE_GT(task_nodes.size(), 0,
+                    platform::errors::InvalidArgument(
+                        "Fleet executor is inited with empty task node"));
+  // TODO(fleet_exe devs): the unused_vars should be got from run time graph
+  std::vector<std::unique_ptr<framework::OperatorBase>> ops;
+  for (auto task_node : task_nodes) {
+    for (auto op : task_node->ops()) {
+      ops.emplace_back(std::unique_ptr<framework::OperatorBase>(op));
+    }
+  }
+  auto unused_vars = framework::GetUnusedVars(program_desc.Block(0), ops, {});
+  runtime_graph_ = std::make_shared<RuntimeGraph>();
+  std::unordered_map<int64_t, TaskNode*> interceptor_id_to_task;
+  for (auto task_node : task_nodes) {
+    task_node->SetUnusedVars(unused_vars);
+    int64_t interceptor_id = task_node->task_id();
+    interceptor_id_to_task.emplace(interceptor_id, task_node);
+  }
+  runtime_graph_->SetInterceptorIdToRank(task_id_to_rank);
+  runtime_graph_->SetInterceptorIdToNode(interceptor_id_to_task);
+  for (auto& unique_op : ops) {
+    unique_op.release();
+  }
   root_scope_ = scope;
   place_ = place;
   PADDLE_ENFORCE_NOT_NULL(root_scope_, platform::errors::InvalidArgument(
paddle/fluid/distributed/fleet_executor/fleet_executor_desc.proto
@@ -23,9 +23,5 @@ message RankInfo {
 message FleetExecutorDesc {
   optional int64 cur_rank = 1 [ default = 0 ];  // Rank id of current processor
   repeated RankInfo cluster_info = 2;
-  optional int32 dp_degree = 3 [ default = 1 ];
-  optional int32 mp_degree = 4 [ default = 1 ];
-  optional int32 pp_degree = 5 [ default = 1 ];
-  optional int64 num_micro_batches = 6 [ default = 1 ];
-  optional int64 num_slots = 7 [ default = 1 ];
+  optional int64 num_micro_batches = 3 [ default = 1 ];
 }
paddle/fluid/distributed/fleet_executor/runtime_graph.cc
@@ -14,300 +14,15 @@
 #include "paddle/fluid/distributed/fleet_executor/runtime_graph.h"
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
-#include "paddle/fluid/framework/executor_gc_helper.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
 
 namespace paddle {
 namespace distributed {
-namespace {
-
-using OperatorBase = RuntimeGraph::OperatorBase;
-using OpRole = paddle::framework::OpRole;
-using OpRegistry = paddle::framework::OpRegistry;
-using ProgramDesc = paddle::framework::ProgramDesc;
-
-bool IsForward(int32_t op_role) {
-  return (op_role == static_cast<int32_t>(OpRole::kForward)) ||
-         (op_role == (static_cast<int32_t>(OpRole::kForward) |
-                      static_cast<int32_t>(OpRole::kLoss)));
-}
-
-bool IsLRSched(int32_t op_role) {
-  return op_role == static_cast<int32_t>(OpRole::kLRSched);
-}
-
-bool IsBackward(int32_t op_role) {
-  return (op_role == static_cast<int32_t>(OpRole::kBackward)) ||
-         (op_role == (static_cast<int32_t>(OpRole::kBackward) |
-                      static_cast<int32_t>(OpRole::kLoss)));
-}
-
-bool IsOptimize(int32_t op_role) {
-  return op_role == static_cast<int32_t>(OpRole::kOptimize);
-}
-
-struct DistCoord {
-  int32_t dp_idx;
-  int32_t pp_idx;
-  int32_t mp_idx;
-};
-
-class DistCoordSys final {
- public:
-  DistCoordSys(int32_t dp_degree, int32_t pp_degree, int32_t mp_degree)
-      : dp_degree_(dp_degree), pp_degree_(pp_degree), mp_degree_(mp_degree) {}
-  DistCoord RankToCoord(int64_t rank) const;
-  int64_t CoordToRank(const DistCoord& coord) const;
-
- private:
-  DISABLE_COPY_AND_ASSIGN(DistCoordSys);
-  bool InvalidCoord(const DistCoord& coord) const;
-  int32_t dp_degree_;
-  int32_t pp_degree_;
-  int32_t mp_degree_;
-};
-
-DistCoord DistCoordSys::RankToCoord(int64_t rank) const {
-  DistCoord coord;
-  coord.mp_idx = rank % mp_degree_;
-  rank /= mp_degree_;
-  coord.pp_idx = rank % pp_degree_;
-  rank /= pp_degree_;
-  coord.dp_idx = rank % dp_degree_;
-  return coord;
-}
-
-int64_t DistCoordSys::CoordToRank(const DistCoord& coord) const {
-  if (InvalidCoord(coord)) {
-    return -1;
-  }
-  return coord.dp_idx * pp_degree_ * mp_degree_ + coord.pp_idx * mp_degree_ +
-         coord.mp_idx;
-}
-
-bool DistCoordSys::InvalidCoord(const DistCoord& coord) const {
-  return coord.mp_idx < 0 || coord.mp_idx >= mp_degree_ || coord.pp_idx < 0 ||
-         coord.pp_idx >= pp_degree_ || coord.dp_idx < 0 ||
-         coord.dp_idx >= dp_degree_;
-}
-
-}  // namespace
-
-std::vector<OpRole> RuntimeGraph::functionality_order = {
-    OpRole::kLRSched, OpRole::kForward, OpRole::kBackward, OpRole::kOptimize};
-
-RuntimeGraph::RuntimeGraph(const ProgramDesc& program,
-                           const FleetExecutorDesc& exe_desc)
-    : exe_desc_(exe_desc) {
-  if (exe_desc.pp_degree() == 1) {
-    OriginProgramCompile(program);
-  } else {
-    SplitProgramBasedFunctionality(program);
-    AssignTaskToIntercepter();
-    FakeDependence();
-    FakeRuntimeInfo();
-  }
-}
-
-void RuntimeGraph::OriginProgramCompile(const ProgramDesc& program) {
-  int64_t cur_rank = exe_desc_.cur_rank();
-  int64_t max_run_times = exe_desc_.num_micro_batches();
-  int64_t max_slot_nums = exe_desc_.num_slots();
-
-  auto task_node = std::make_unique<TaskNode>(program, cur_rank, max_run_times,
-                                              max_slot_nums);
-  // TODO(wangxi): add skip vars
-  auto unused_vars =
-      framework::GetUnusedVars(program.Block(0), task_node->unique_ops(), {});
-  task_node->SetType("Compute");
-  task_node->SetUnusedVars(unused_vars);
-
-  task_nodes_.emplace_back(std::move(task_node));
-  int64_t task_id = task_nodes_[0]->task_id();
-  intercepter_id_to_rank_.insert({task_id, cur_rank});
-  intercepter_id_to_node_.insert({task_id, task_nodes_[0].get()});
-}
-
-void RuntimeGraph::SplitProgramBasedFunctionality(const ProgramDesc& program) {
-  for (const auto& op_desc : program.Block(0).AllOps()) {
-    ops_.emplace_back(OpRegistry::CreateOp(*op_desc));
-  }
-  // TODO(wangxi): how to gc pipeline backward send
-  auto unused_vars = framework::GetUnusedVars(program.Block(0), ops_, {});
-
-  std::unordered_map<int32_t, std::vector<OperatorBase*>> role_to_ops;
-  for (const auto& op : ops_) {
-    int32_t op_role = op->Attr<int32_t>("op_role");
-    OpRole new_op_role;
-    if (IsLRSched(op_role)) {
-      new_op_role = OpRole::kLRSched;
-    } else if (IsForward(op_role)) {
-      new_op_role = OpRole::kForward;
-    } else if (IsBackward(op_role)) {
-      new_op_role = OpRole::kBackward;
-    } else if (IsOptimize(op_role)) {
-      new_op_role = OpRole::kOptimize;
-    } else {
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "The op %s is None of LRSched, Forward, Backward or Optimize.",
-          op->Type()));
-    }
-    int32_t new_op_role_id = static_cast<int32_t>(new_op_role);
-    if (role_to_ops.find(new_op_role_id) == role_to_ops.end()) {
-      role_to_ops.insert({new_op_role_id, {}});
-    }
-    role_to_ops.at(new_op_role_id).emplace_back(op.get());
-  }
-
-  int64_t cur_rank = exe_desc_.cur_rank();
-  DistCoordSys coord_sys(exe_desc_.dp_degree(), exe_desc_.pp_degree(),
-                         exe_desc_.mp_degree());
-  const auto& coord = coord_sys.RankToCoord(cur_rank);
-  int pipeline_stage = coord.pp_idx;
-  int64_t num_pipeline_stages = exe_desc_.pp_degree();
-
-  // TODO(fleet_executor dev): start up steps should be a config `num_slots`
-  int64_t start_up_steps = num_pipeline_stages - pipeline_stage;
-  int64_t num_micro_batches = exe_desc_.num_micro_batches();
-  int64_t task_id = cur_rank * functionality_order.size();
-  for (std::size_t i = 0; i < functionality_order.size(); ++i) {
-    VLOG(3) << "Runtime graph is creating task node for: " << task_id << ".";
-    OpRole role = functionality_order[i];
-    int32_t role_id = static_cast<int64_t>(role);
-    int64_t max_run_times = num_micro_batches;
-    int64_t max_slot_nums = start_up_steps;
-    // NOTE: use short path, each interceptor should run for max_run_times
-    std::vector<OperatorBase*> task_ops{};
-    if (role_to_ops.find(role_id) != role_to_ops.end()) {
-      task_ops = role_to_ops.at(role_id);
-    }
-    std::unique_ptr<TaskNode> task_node = std::make_unique<TaskNode>(
-        role_id, task_ops, cur_rank, task_id, max_run_times, max_slot_nums);
-    if (IsLRSched(role_id) || IsOptimize(role_id)) {
-      task_node->SetType("Amplifier");
-      if (IsLRSched(role_id)) {
-        task_node->SetRunPerSteps(max_run_times);
-      } else {
-        task_node->SetRunAtOffset(max_run_times - 1);
-        task_node->SetRunPerSteps(max_run_times);
-      }
-    } else {
-      task_node->SetType("Compute");
-    }
-    task_node->SetUnusedVars(unused_vars);
-    task_nodes_.emplace_back(std::move(task_node));
-    ++task_id;
-  }
-}
-
-void RuntimeGraph::FakeDependence() {
-  int64_t cur_rank = exe_desc_.cur_rank();
-  DistCoordSys coord_sys(exe_desc_.dp_degree(), exe_desc_.pp_degree(),
-                         exe_desc_.mp_degree());
-  const auto& coord = coord_sys.RankToCoord(cur_rank);
-  DistCoord upstream_coord = coord, downstream_coord = coord;
-  upstream_coord.pp_idx -= 1;
-  downstream_coord.pp_idx += 1;
-  int64_t pp_upstream = coord_sys.CoordToRank(upstream_coord);
-  int64_t pp_downstream = coord_sys.CoordToRank(downstream_coord);
-  bool is_first_stage = (pp_upstream == -1);
-  bool is_last_stage = (pp_downstream == -1);
-
-  int32_t num_of_functionality = functionality_order.size();
-  // lr(1:m) -> forward -> backward -> (m:1)optimize
-  //               ↑          ↓
-  // lr(1:m) -> forward -> backward -> (m:1)optimize
-  //               ↑          ↓
-  // lr(1:m) -> forward -> backward -> (m:1)optimize
-  for (std::size_t i = 0; i < task_nodes_.size(); ++i) {
-    auto& node = task_nodes_[i];
-    bool is_forward = IsForward(node->role());
-    bool is_backward = IsBackward(node->role());
-
-    int64_t cur_id = cur_rank * num_of_functionality + i;
-    int64_t prev_id = cur_id - 1;
-    int64_t next_id = cur_id + 1;
-    int64_t upstream_id = pp_upstream * num_of_functionality + i;
-    int64_t downstream_id = pp_downstream * num_of_functionality + i;
-
-    // 1F1B, last stage pp_buff_size should be 1, while first stage
-    // pp_buff_size should be pp_degree
-    int64_t pp_buff_size = exe_desc_.pp_degree() - coord.pp_idx;
-
-    std::vector<std::pair<int64_t, int64_t>> ups;
-    std::vector<std::pair<int64_t, int64_t>> downs;
-
-    if (i != 0) {  // not lr
-      int64_t buff_size = is_backward ? pp_buff_size : 2;
-      ups.emplace_back(prev_id, buff_size);
-    }
-    if (i != task_nodes_.size() - 1) {  // not optimize
-      int64_t buff_size = is_forward ? pp_buff_size : 2;
-      downs.emplace_back(next_id, buff_size);
-    }
-
-    if (is_forward) {
-      if (!is_first_stage) {
-        ups.emplace_back(upstream_id, 2);
-      }
-      if (!is_last_stage) {
-        downs.emplace_back(downstream_id, 2);
-      }
-    } else if (is_backward) {
-      if (!is_last_stage) {
-        ups.emplace_back(downstream_id, 2);
-      }
-      if (!is_first_stage) {
-        downs.emplace_back(upstream_id, 2);
-      }
-    }
-
-    for (auto up : ups) {
-      VLOG(3) << "Task(" << cur_id << ") AddUpstream Task(" << up.first
-              << ") with buff_size=" << up.second;
-      node->AddUpstreamTask(up.first, up.second);
-    }
-    for (auto down : downs) {
-      VLOG(3) << "Task(" << cur_id << ") AddDownstream Task(" << down.first
-              << ") with buff_size=" << down.second;
-      node->AddDownstreamTask(down.first, down.second);
-    }
-  }
-}
-
-void RuntimeGraph::AssignTaskToIntercepter() {
-  for (const auto& task : task_nodes_) {
-    int64_t intercepter_id = task->task_id();
-    VLOG(3) << "Runtime graph is assigning task to interceptor: "
-            << intercepter_id << " with type: " << task->type() << ".";
-    if (intercepter_id_to_node_.find(intercepter_id) !=
-        intercepter_id_to_node_.end()) {
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
-          "Repeated intercepter id: %d", intercepter_id));
-    }
-    intercepter_id_to_node_.insert({intercepter_id, task.get()});
-  }
-}
-
-void RuntimeGraph::FakeRuntimeInfo() {
-  int64_t nrank = exe_desc_.cluster_info().size();
-  int32_t num_of_functionality = functionality_order.size();
-  for (int64_t i = 0; i < nrank; ++i) {
-    for (int32_t j = 0; j < num_of_functionality; ++j) {
-      int64_t intercepter_id = i * num_of_functionality + j;
-      intercepter_id_to_rank_.insert({intercepter_id, i});
-    }
-  }
-}
-
 std::string RuntimeGraph::DebugString() const {
   std::ostringstream os;
   os << "\nRuntime Graph Debug:\n";
-  for (const auto& task : task_nodes_) {
-    os << task->DebugString();
+  for (const auto& pair : intercepter_id_to_node_) {
+    os << pair.second->DebugString();
     os << "\n";
   }
   return os.str();
paddle/fluid/distributed/fleet_executor/runtime_graph.h
@@ -22,21 +22,12 @@
 #include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
-namespace framework {
-class ProgramDesc;
-class OperatorBase;
-}
 namespace distributed {
 class TaskNode;
 
 class RuntimeGraph final {
  public:
-  using ProgramDesc = paddle::framework::ProgramDesc;
-  using OperatorBase = paddle::framework::OperatorBase;
   RuntimeGraph() = default;
-  explicit RuntimeGraph(const ProgramDesc& program,
-                        const FleetExecutorDesc& exe_desc);
   ~RuntimeGraph() = default;
   const std::unordered_map<int64_t, TaskNode*>& intercepter_id_to_node() const {
     return intercepter_id_to_node_;
@@ -56,18 +47,8 @@ class RuntimeGraph final {
  private:
   DISABLE_COPY_AND_ASSIGN(RuntimeGraph);
-  void SplitProgramBasedFunctionality(const ProgramDesc& program);
-  void FakeDependence();
-  void AssignTaskToIntercepter();
-  void FakeRuntimeInfo();
-  void OriginProgramCompile(const ProgramDesc& program);
-  // LRSched, Forward, Backward, Optimize
-  static std::vector<paddle::framework::OpRole> functionality_order;
-  std::vector<std::unique_ptr<TaskNode>> task_nodes_;
-  std::vector<std::unique_ptr<OperatorBase>> ops_;
   std::unordered_map<int64_t, TaskNode*> intercepter_id_to_node_;
   std::unordered_map<int64_t, int64_t> intercepter_id_to_rank_;
-  FleetExecutorDesc exe_desc_;
 };
 }  // namespace distributed
python/paddle/distributed/fleet/fleet_executor_utils.py
@@ -89,7 +89,7 @@ def is_backward_op(op_role):
            (op_role == (int(OpRole.Backward) ^ int(OpRole.Loss)))


-def one_f_one_b(program, cur_rank, max_run_times, dist_opt, nrank):
+def run1f1b(program, cur_rank, max_run_times, dist_opt, nrank):
     """
     Split the program to support 1f1b pipeline scheduler.
     This funct will split the program based on the op_role.
@@ -201,3 +201,20 @@ def one_f_one_b(program, cur_rank, max_run_times, dist_opt, nrank):
         for j in range(num_of_functionality):
             task_id_to_rank[int(i * num_of_functionality + j)] = i
     return task_nodes, task_id_to_rank
+
+
+def origin(program, cur_rank):
+    """
+    Origin scheduler for fleet executor, supports non-pp mode
+    :param program: The origin program.
+    :param cur_rank: Current rank (can be got from fleet.worker_index()).
+    :return:
+        task_nodes (list): four task nodes for current rank
+        task_id_to_rank (dict): a fake dict, since there is no upstream or downstream, this dict won't be used
+    """
+    print("fleet executor will use python side origin scheduler.")
+    task_node = core.TaskNode(program.desc, cur_rank, 1, 1)
+    task_node.set_type("Compute")
+    task_id = task_node.task_id()
+    task_id_to_rank = {task_id: cur_rank}
+    return [task_node], task_id_to_rank
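For reference, a hedged sketch of calling the new `origin` helper directly; the empty `Program` and rank 0 are stand-ins, and in a launched job the rank would come from `fleet.worker_index()` as the docstring suggests:

    import paddle
    from paddle.distributed.fleet.fleet_executor_utils import origin

    paddle.enable_static()
    program = paddle.static.Program()  # assumed: the program you would normally train
    cur_rank = 0  # stand-in for fleet.worker_index()

    task_nodes, task_id_to_rank = origin(program, cur_rank)
    # Returns one "Compute" task node and a one-entry {task_id: rank} map;
    # executor.py stores both into fleet_opt['tasks'] / fleet_opt['task_id_to_rank'].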
python/paddle/fluid/executor.py
@@ -1972,31 +1972,37 @@ class Executor(object):
                 rank_info.rank = rank
                 rank_info.ip_port = endpoint
                 fleet_exe_desc.cluster_info.append(rank_info)
-            if "dist_strategy" in fleet_opt:
-                fleet_exe_desc.dp_degree = fleet_opt["dist_strategy"]["dp_degree"]
-                fleet_exe_desc.mp_degree = fleet_opt["dist_strategy"]["mp_degree"]
-                fleet_exe_desc.pp_degree = fleet_opt["dist_strategy"]["pp_degree"]
             if "num_micro_batches" in fleet_opt:
                 fleet_exe_desc.num_micro_batches = fleet_opt["num_micro_batches"]
-            num_of_gpu = fleet_exe_desc.dp_degree * fleet_exe_desc.mp_degree * fleet_exe_desc.pp_degree
-            assert nrank == num_of_gpu, "The number of rank is not equal to the number of gpu."
-            if 'python_side' in fleet_opt:
-                strategy = fleet_opt['python_side']
-                if strategy == '1F1B':
-                    from paddle.distributed.fleet.fleet_executor_utils import one_f_one_b
-                    tasks, task_id_to_rank = one_f_one_b(
-                        program, cur_rank, fleet_opt.get('num_micro_batches', 1),
-                        fleet_opt.get('dist_strategy', {}), nrank)
-                    # NOTE: have to hold these vars, otherwise will be destructed
-                    fleet_opt['tasks'] = tasks
-                    fleet_opt['task_id_to_rank'] = task_id_to_rank
-                else:
-                    raise "Fleet_executor only supports 1F1B scheduler if you choose python side split, " \
-                          "but received " + str(strategy) + "."
-            else:
-                task_id_to_rank = fleet_opt.get("task_id_to_rank", {})
-                tasks = fleet_opt.get("tasks", [])
+            assert 'scheduler' in fleet_opt, \
+                "Fleet executor need configuration for scheduler, you can choose from 1F1B or Origin."
+            scheduler = fleet_opt['scheduler']
+            if scheduler == '1F1B':
+                from paddle.distributed.fleet.fleet_executor_utils import run1f1b
+                if "dist_strategy" not in fleet_opt or \
+                        "pp_degree" not in fleet_opt["dist_strategy"] or \
+                        fleet_opt["dist_strategy"]["pp_degree"] == 1:
+                    warnings.warn("Using 1F1B scheduler with pp_degree == 1.")
+                tasks, task_id_to_rank = run1f1b(
+                    program, cur_rank, fleet_opt.get('num_micro_batches', 1),
+                    fleet_opt.get('dist_strategy', {}), nrank)
+            elif scheduler == 'Origin':
+                from paddle.distributed.fleet.fleet_executor_utils import origin
+                if "dist_strategy" in fleet_opt and \
+                        "pp_degree" in fleet_opt["dist_strategy"]:
+                    assert fleet_opt["dist_strategy"]["pp_degree"] == 1, \
+                        "For pipeline mode, the scheduler should be 1F1B instead of Origin."
+                if "num_micro_batches" in fleet_opt:
+                    assert fleet_opt["num_micro_batches"] == 1, \
+                        "For origin scheduler mode, the num micro batches should be 1."
+                tasks, task_id_to_rank = origin(program, cur_rank)
+            else:
+                raise "Fleet_executor only supports 1F1B and Origin scheduler, " \
+                      "but received " + str(scheduler) + "."
+            # NOTE: have to hold these vars, otherwise will be destructed
+            fleet_opt['tasks'] = tasks
+            fleet_opt['task_id_to_rank'] = task_id_to_rank
             fleet_exe = core.FleetExecutor(fleet_exe_desc.SerializeToString())
             place = core.Place()
             place.set_place(self.place)
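To summarize the dispatch this hunk introduces, here are the two accepted `fleet_opt['scheduler']` values and the checks applied to each, written as a small sketch; the dict shapes follow the unit tests in this commit, and the concrete degree/batch numbers are only illustrative:

    # '1F1B': meant for pipeline runs; a warning is emitted when pp_degree is 1 or absent.
    fleet_opt_1f1b = {
        "dist_strategy": {"dp_degree": 1, "mp_degree": 1, "pp_degree": 2},
        "num_micro_batches": 4,
        "scheduler": "1F1B",
    }

    # 'Origin': non-pipeline runs only; pp_degree (if given) and num_micro_batches must both be 1.
    fleet_opt_origin = {
        "dist_strategy": {"dp_degree": 1, "mp_degree": 1, "pp_degree": 1},
        "num_micro_batches": 1,
        "scheduler": "Origin",
    }

    # Any other scheduler value falls into the final else branch and is rejected.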
python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -146,6 +146,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_disable_signal_handler)
     LIST(REMOVE_ITEM TEST_OPS test_fleet_executor)
     LIST(REMOVE_ITEM TEST_OPS test_fleet_executor_multi_devices)
+    LIST(REMOVE_ITEM TEST_OPS test_fleet_executor_origin_scheduler)
     LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_mapper)
     LIST(REMOVE_ITEM TEST_OPS test_fleet_executor_task_node)
 endif()
python/paddle/fluid/tests/unittests/test_fleet_executor.py
@@ -34,7 +34,7 @@ class TestFleetExecutor(unittest.TestCase):
         fleet_opt = {
             "dist_strategy": strategy.sharding_configs,
             "num_micro_batches": strategy.pipeline_configs["accumulate_steps"],
-            "python_side": "1F1B"
+            "scheduler": "1F1B"
         }
         return fleet_opt
python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py
new file mode 100644
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()


class TestFleetExecutor(unittest.TestCase):
    def fake_fleet_opt(self):
        # TODO: Fake for coverage will be removed in the future
        import paddle.distributed.fleet as fleet
        strategy = fleet.DistributedStrategy()
        strategy.sharding_configs = {
            "dp_degree": 1,
            "mp_degree": 1,
            "pp_degree": 1
        }
        strategy.pipeline_configs = {"accumulate_steps": 1}
        fleet_opt = {
            "dist_strategy": strategy.sharding_configs,
            "num_micro_batches": strategy.pipeline_configs["accumulate_steps"],
            "scheduler": "Origin"
        }
        return fleet_opt

    def run_fleet_executor(self, place, x_data, y_data):
        exe = paddle.static.Executor(place)
        empty_program = paddle.static.Program()
        with fluid.program_guard(empty_program, empty_program):
            x = fluid.layers.data(
                name='x', shape=x_data.shape, dtype=x_data.dtype)
            y = fluid.layers.data(
                name='y', shape=y_data.shape, dtype=y_data.dtype)
            z = x + y
            a = 2 * x + 3 * y
            loss = paddle.mean(a)
            base_lr = 0.1
            passes = [30, 60, 80, 90]
            steps_per_pass = 10
            bd = [steps_per_pass * p for p in passes]
            lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
            lr_val = paddle.optimizer.lr.PiecewiseDecay(
                boundaries=bd, values=lr)
            opt = paddle.optimizer.AdamW(
                learning_rate=lr_val,
                grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
            opt.minimize(loss)
        # TODO: section_program will be removed in the future
        empty_program._pipeline_opt = {
            "fleet_opt": self.fake_fleet_opt(),
            "section_program": empty_program
        }
        res = exe.run(empty_program,
                      feed={'x': x_data,
                            'y': y_data},
                      fetch_list=[z.name, a.name])
        return res

    def test_executor_on_single_device(self):
        if fluid.is_compiled_with_cuda():
            shape = (10000, 3462)
            x_data = np.random.rand(*shape)
            y_data = np.random.rand(*shape)
            z_data = x_data + y_data
            a_data = 2 * x_data + 3 * y_data
            res = self.run_fleet_executor(fluid.CUDAPlace(0), x_data, y_data)
            self.assertTrue(np.allclose(res[0], z_data))
            self.assertTrue(np.allclose(res[1], a_data))


if __name__ == "__main__":
    unittest.main()