Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
6b48dfe9
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
6b48dfe9
编写于
3年前
作者:
W
WangXi
提交者:
GitHub
3年前
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[fleet_executor] Add interceptor gc (#37889)
上级
55b87742
develop
Ligoml-patch-1
ZHUI-patch-1
add_some_yaml_config
dingjiaweiww-patch-1
dy2static
enable_eager_model_test
final_state_gen_python_c
final_state_intermediate
fix-numpy-issue
fix_concat_slice
fix_rnn_docs
fix_tensor_type
incubate/infrt
inplace_addto
move_embedding_to_phi
move_histogram_to_pten
move_sgd_to_phi
move_slice_to_pten
move_temporal_shift_to_phi
move_yolo_box_to_phi
npu_fix_alloc
preln_ernie
prv-md-even-more
prv-onednn-2.5
pten_tensor_refactor
release/2.3
release/2.3-fc-ernie-fix
release/2.4
revert-37926-eager_coreops_500
revert-37956-revert-37727-pylayer_support_tuple
revert-38100-mingdong
revert-38301-allocation_rearrange_pr
revert-38703-numpy_bf16_package_reupload
revert-38732-remove_useless_header_in_elementwise_mul_grad
revert-38959-Reduce_Grad
revert-39143-adjust_empty
revert-39227-move_trace_op_to_pten
revert-39268-dev/remove_concat_fluid_kernel
revert-40170-support_partial_grad
revert-41056-revert-40727-move_some_activaion_to_phi
revert-41065-revert-40993-mv_ele_floordiv_pow
revert-41068-revert-40790-phi_new
revert-41944-smaller_inference_api_test
revert-42149-do-not-reset-default-stream-for-stream-safe-cuda-allocator
revert-43155-fix_ut_tempfile
revert-43882-revert-41944-smaller_inference_api_test
revert-45808-phi/simplify_size_op
revert-46827-deform_comment
support_weight_transpose
zhiqiu-patch-1
v2.4.0-rc0
v2.3.2
v2.3.1
v2.3.0
v2.3.0-rc0
无相关合并请求
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
111 addition
and
48 deletion
+111
-48
paddle/fluid/distributed/fleet_executor/CMakeLists.txt
paddle/fluid/distributed/fleet_executor/CMakeLists.txt
+1
-1
paddle/fluid/distributed/fleet_executor/carrier.cc
paddle/fluid/distributed/fleet_executor/carrier.cc
+59
-37
paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
...e/fluid/distributed/fleet_executor/compute_interceptor.cc
+6
-0
paddle/fluid/distributed/fleet_executor/interceptor.h
paddle/fluid/distributed/fleet_executor/interceptor.h
+5
-0
paddle/fluid/distributed/fleet_executor/runtime_graph.cc
paddle/fluid/distributed/fleet_executor/runtime_graph.cc
+24
-10
paddle/fluid/distributed/fleet_executor/runtime_graph.h
paddle/fluid/distributed/fleet_executor/runtime_graph.h
+1
-0
paddle/fluid/distributed/fleet_executor/task_node.h
paddle/fluid/distributed/fleet_executor/task_node.h
+15
-0
未找到文件。
paddle/fluid/distributed/fleet_executor/CMakeLists.txt
浏览文件 @
6b48dfe9
...
...
@@ -13,7 +13,7 @@ endif()
cc_library
(
fleet_executor SRCS fleet_executor.cc carrier.cc task_node.cc runtime_graph.cc
interceptor.cc compute_interceptor.cc amplifier_interceptor.cc interceptor_message_service.cc message_bus.cc
DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto collective_helper op_registry
${
BRPC_DEPS
}
)
executor_gc_helper
${
BRPC_DEPS
}
)
if
(
WITH_DISTRIBUTE
)
set
(
DISTRIBUTE_COMPILE_FLAGS
"-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"
)
...
...
This diff is collapsed.
Click to expand it.
paddle/fluid/distributed/fleet_executor/carrier.cc
浏览文件 @
6b48dfe9
...
...
@@ -18,6 +18,7 @@
#include "paddle/fluid/distributed/fleet_executor/message_bus.h"
#include "paddle/fluid/distributed/fleet_executor/runtime_graph.h"
#include "paddle/fluid/distributed/fleet_executor/task_node.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/scope.h"
namespace
paddle
{
...
...
@@ -191,49 +192,70 @@ void Carrier::HandleTmpMessages() {
message_tmp_
.
clear
();
}
// Builds the garbage collector handed to the interceptors created by the
// carrier.
//
// Returns an empty shared_ptr unless all of the following hold:
//   * the eager deletion threshold is non-negative,
//   * the build has PADDLE_WITH_CUDA or PADDLE_WITH_HIP defined,
//   * `place` is a GPU place, and
//   * fast eager deletion mode is enabled.
// In that case an UnsafeFastGPUGarbageCollector is created for the CUDA place
// with the threshold as its maximum memory size.
static std::shared_ptr<framework::GarbageCollector> GetGC(
    const platform::Place& place) {
  const int64_t eager_deletion_threshold =
      framework::GetEagerDeletionThreshold();
  std::shared_ptr<framework::GarbageCollector> collector;

  // A negative threshold means no collector is created at all.
  if (eager_deletion_threshold < 0) {
    return collector;
  }

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (platform::is_gpu_place(place) &&
      framework::IsFastEagerDeletionModeEnabled()) {
    collector.reset(new framework::UnsafeFastGPUGarbageCollector(
        BOOST_GET_CONST(platform::CUDAPlace, place),
        eager_deletion_threshold));
  }
#endif

  return collector;
}
void
Carrier
::
CreateInterceptors
()
{
if
(
runtime_graph_
->
intercepter_id_to_node
().
empty
())
return
;
auto
gc
=
GetGC
(
place_
);
// create each Interceptor
if
(
!
(
runtime_graph_
->
intercepter_id_to_node
().
empty
()))
{
// no auto init since there is no config
for
(
const
auto
&
item
:
runtime_graph_
->
intercepter_id_to_node
())
{
int64_t
interceptor_id
=
item
.
first
;
TaskNode
*
task_node
=
item
.
second
;
PADDLE_ENFORCE_LT
(
task_node
->
run_at_offset
(),
task_node
->
run_per_steps
(),
platform
::
errors
::
InvalidArgument
(
"Interceptor's run_at_offset must < run_per_steps, must now "
"run_at_offset=%ld run_per_steps=%ld"
,
task_node
->
run_at_offset
(),
task_node
->
run_per_steps
()));
std
::
unique_ptr
<
Interceptor
>
interceptor
;
if
(
task_node
->
type
().
empty
())
{
// TODO(wangxi): delete this in future
interceptor
.
reset
(
new
Interceptor
(
interceptor_id
,
task_node
));
}
else
{
interceptor
=
InterceptorFactory
::
Create
(
task_node
->
type
(),
interceptor_id
,
task_node
);
}
interceptor
->
SetPlace
(
place_
);
interceptor
->
SetMiniBatchScope
(
minibatch_scope_
);
interceptor
->
SetMicroBatchScope
(
microbatch_scopes_
);
interceptor
->
SetRootScope
(
root_scope_
);
// no auto init since there is no config
for
(
const
auto
&
item
:
runtime_graph_
->
intercepter_id_to_node
())
{
int64_t
interceptor_id
=
item
.
first
;
TaskNode
*
task_node
=
item
.
second
;
SetInterceptor
(
interceptor_id
,
std
::
move
(
interceptor
));
VLOG
(
3
)
<<
"Create Interceptor with interceptor id: "
<<
interceptor_id
<<
" with type: "
<<
task_node
->
type
()
<<
"."
;
PADDLE_ENFORCE_LT
(
task_node
->
run_at_offset
(),
task_node
->
run_per_steps
(),
platform
::
errors
::
InvalidArgument
(
"Interceptor's run_at_offset must < run_per_steps, must now "
"run_at_offset=%ld run_per_steps=%ld"
,
task_node
->
run_at_offset
(),
task_node
->
run_per_steps
()));
if
(
task_node
->
upstream
().
empty
())
{
source_interceptor_ids_
.
emplace_back
(
interceptor_id
);
}
std
::
unique_ptr
<
Interceptor
>
interceptor
;
if
(
task_node
->
type
().
empty
())
{
// TODO(wangxi): delete this in future
interceptor
.
reset
(
new
Interceptor
(
interceptor_id
,
task_node
));
}
else
{
interceptor
=
InterceptorFactory
::
Create
(
task_node
->
type
(),
interceptor_id
,
task_node
);
}
interceptor
->
SetPlace
(
place_
);
interceptor
->
SetMiniBatchScope
(
minibatch_scope_
);
interceptor
->
SetMicroBatchScope
(
microbatch_scopes_
);
interceptor
->
SetRootScope
(
root_scope_
);
interceptor
->
SetGC
(
gc
);
SetInterceptor
(
interceptor_id
,
std
::
move
(
interceptor
));
VLOG
(
3
)
<<
"Create Interceptor with interceptor id: "
<<
interceptor_id
<<
" with type: "
<<
task_node
->
type
()
<<
"."
;
if
(
task_node
->
upstream
().
empty
())
{
source_interceptor_ids_
.
emplace_back
(
interceptor_id
);
}
// The carrier will be always waiting for outside initializer
// since there is no interceptor has been created during auto init
creating_flag_mutex_
.
lock
();
creating_interceptors_
=
false
;
creating_flag_mutex_
.
unlock
();
HandleTmpMessages
();
}
// The carrier will be always waiting for outside initializer
// since there is no interceptor has been created during auto init
creating_flag_mutex_
.
lock
();
creating_interceptors_
=
false
;
creating_flag_mutex_
.
unlock
();
HandleTmpMessages
();
}
}
// namespace distributed
...
...
This diff is collapsed.
Click to expand it.
paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
浏览文件 @
6b48dfe9
...
...
@@ -15,6 +15,7 @@
#include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h"
#include "paddle/fluid/distributed/fleet_executor/task_node.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/operator.h"
namespace
paddle
{
...
...
@@ -172,6 +173,11 @@ void ComputeInterceptor::RunOps() {
<<
step_
+
1
<<
" time."
;
for
(
auto
op
:
node_
->
ops
())
{
op
->
Run
(
*
microbatch_scopes_
[
step_
%
node_
->
max_run_times
()],
place_
);
if
(
gc_
)
{
framework
::
DeleteUnusedTensors
(
*
microbatch_scopes_
[
step_
%
node_
->
max_run_times
()],
op
,
node_
->
unused_vars
(),
gc_
.
get
());
}
}
}
...
...
This diff is collapsed.
Click to expand it.
paddle/fluid/distributed/fleet_executor/interceptor.h
浏览文件 @
6b48dfe9
...
...
@@ -31,6 +31,7 @@
namespace
paddle
{
namespace
framework
{
class
Scope
;
class
GarbageCollector
;
}
namespace
distributed
{
...
...
@@ -73,6 +74,9 @@ class Interceptor {
// Replaces this interceptor's micro-batch scope list with a copy of `scopes`.
void SetMicroBatchScope(const std::vector<framework::Scope*>& scopes) {
  this->microbatch_scopes_ = scopes;
}
// Shares ownership of the given garbage collector with this interceptor.
// `gc` may be an empty pointer; it is stored as-is.
void SetGC(const std::shared_ptr<framework::GarbageCollector>& gc) {
  this->gc_ = gc;
}
// Returns the task node pointer held by this interceptor.
TaskNode* GetTaskNode() const { return this->node_; }
...
...
@@ -94,6 +98,7 @@ class Interceptor {
framework
::
Scope
*
root_scope_
{
nullptr
};
framework
::
Scope
*
minibatch_scope_
{
nullptr
};
std
::
vector
<
framework
::
Scope
*>
microbatch_scopes_
{};
std
::
shared_ptr
<
framework
::
GarbageCollector
>
gc_
{
nullptr
};
private:
// pool the local mailbox, parse the Message
...
...
This diff is collapsed.
Click to expand it.
paddle/fluid/distributed/fleet_executor/runtime_graph.cc
浏览文件 @
6b48dfe9
...
...
@@ -14,6 +14,7 @@
#include "paddle/fluid/distributed/fleet_executor/runtime_graph.h"
#include "paddle/fluid/distributed/fleet_executor/task_node.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
...
...
@@ -101,16 +102,7 @@ RuntimeGraph::RuntimeGraph(const ProgramDesc& program,
const
FleetExecutorDesc
&
exe_desc
)
:
exe_desc_
(
exe_desc
)
{
if
(
exe_desc
.
pp_degree
()
==
1
)
{
int64_t
cur_rank
=
exe_desc_
.
cur_rank
();
int64_t
max_run_times
=
exe_desc_
.
num_micro_batches
();
int64_t
max_slot_nums
=
exe_desc_
.
num_slots
();
auto
task_node
=
std
::
make_unique
<
TaskNode
>
(
program
,
cur_rank
,
max_run_times
,
max_slot_nums
);
task_node
->
SetType
(
"Compute"
);
task_nodes_
.
emplace_back
(
std
::
move
(
task_node
));
int64_t
task_id
=
task_nodes_
[
0
]
->
task_id
();
intercepter_id_to_rank_
.
insert
({
task_id
,
cur_rank
});
intercepter_id_to_node_
.
insert
({
task_id
,
task_nodes_
[
0
].
get
()});
OriginProgramCompile
(
program
);
}
else
{
SplitProgramBasedFunctionality
(
program
);
AssignTaskToIntercepter
();
...
...
@@ -119,10 +111,31 @@ RuntimeGraph::RuntimeGraph(const ProgramDesc& program,
}
}
void
RuntimeGraph
::
OriginProgramCompile
(
const
ProgramDesc
&
program
)
{
int64_t
cur_rank
=
exe_desc_
.
cur_rank
();
int64_t
max_run_times
=
exe_desc_
.
num_micro_batches
();
int64_t
max_slot_nums
=
exe_desc_
.
num_slots
();
auto
task_node
=
std
::
make_unique
<
TaskNode
>
(
program
,
cur_rank
,
max_run_times
,
max_slot_nums
);
// TODO(wangxi): add skip vars
auto
unused_vars
=
framework
::
GetUnusedVars
(
program
.
Block
(
0
),
task_node
->
unique_ops
(),
{});
task_node
->
SetType
(
"Compute"
);
task_node
->
SetUnusedVars
(
unused_vars
);
task_nodes_
.
emplace_back
(
std
::
move
(
task_node
));
int64_t
task_id
=
task_nodes_
[
0
]
->
task_id
();
intercepter_id_to_rank_
.
insert
({
task_id
,
cur_rank
});
intercepter_id_to_node_
.
insert
({
task_id
,
task_nodes_
[
0
].
get
()});
}
void
RuntimeGraph
::
SplitProgramBasedFunctionality
(
const
ProgramDesc
&
program
)
{
for
(
const
auto
&
op_desc
:
program
.
Block
(
0
).
AllOps
())
{
ops_
.
emplace_back
(
OpRegistry
::
CreateOp
(
*
op_desc
));
}
// TODO(wangxi): how to gc pipeline backward send
auto
unused_vars
=
framework
::
GetUnusedVars
(
program
.
Block
(
0
),
ops_
,
{});
std
::
unordered_map
<
int32_t
,
std
::
vector
<
OperatorBase
*>>
role_to_ops
;
for
(
const
auto
&
op
:
ops_
)
{
...
...
@@ -183,6 +196,7 @@ void RuntimeGraph::SplitProgramBasedFunctionality(const ProgramDesc& program) {
}
else
{
task_node
->
SetType
(
"Compute"
);
}
task_node
->
SetUnusedVars
(
unused_vars
);
task_nodes_
.
emplace_back
(
std
::
move
(
task_node
));
++
task_id
;
}
...
...
This diff is collapsed.
Click to expand it.
paddle/fluid/distributed/fleet_executor/runtime_graph.h
浏览文件 @
6b48dfe9
...
...
@@ -52,6 +52,7 @@ class RuntimeGraph final {
void
FakeDependence
();
void
AssignTaskToIntercepter
();
void
FakeRuntimeInfo
();
void
OriginProgramCompile
(
const
ProgramDesc
&
program
);
// LRSched, Forward, Backward, Optimize
static
std
::
vector
<
paddle
::
framework
::
OpRole
>
functionality_order
;
std
::
vector
<
std
::
unique_ptr
<
TaskNode
>>
task_nodes_
;
...
...
This diff is collapsed.
Click to expand it.
paddle/fluid/distributed/fleet_executor/task_node.h
浏览文件 @
6b48dfe9
...
...
@@ -57,12 +57,24 @@ class TaskNode final {
const
std
::
string
&
type
()
const
{
return
type_
;
}
// Returns a read-only reference to the program description stored in this
// task node.
const paddle::framework::ProgramDesc& program() const {
  return this->program_;
}
const
std
::
vector
<
OperatorBase
*>&
ops
()
const
{
return
ops_
;
}
const
std
::
vector
<
std
::
unique_ptr
<
OperatorBase
>>&
unique_ops
()
const
{
return
ops_vec_
;
}
const
std
::
unordered_map
<
const
OperatorBase
*
,
std
::
vector
<
std
::
string
>>&
unused_vars
()
const
{
return
unused_vars_
;
}
void
SetRunPerSteps
(
int64_t
value
);
void
SetRunAtOffset
(
int64_t
value
);
void
SetReplyUpPerSteps
(
int64_t
value
);
void
SetSendDownPerSteps
(
int64_t
value
);
void
SetType
(
const
std
::
string
&
type
)
{
type_
=
type
;
}
// Stores a copy of the operator -> variable-name-list table on this task
// node; it is later exposed through unused_vars().
void SetUnusedVars(
    const std::unordered_map<const OperatorBase*, std::vector<std::string>>&
        unused_vars) {
  this->unused_vars_ = unused_vars;
}
// upstream need buffs?
bool
AddUpstreamTask
(
int64_t
task_id
,
int64_t
buff_size
=
1
);
...
...
@@ -79,6 +91,9 @@ class TaskNode final {
std
::
unordered_map
<
int64_t
,
int64_t
>
downstream_
;
framework
::
ProgramDesc
program_
;
std
::
vector
<
std
::
unique_ptr
<
OperatorBase
>>
ops_vec_
;
std
::
unordered_map
<
const
OperatorBase
*
,
std
::
vector
<
std
::
string
>>
unused_vars_
;
int32_t
role_
;
int64_t
rank_
;
int64_t
task_id_
;
...
...
This diff is collapsed.
Click to expand it.
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录
新手
引导
客服
返回
顶部