Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
3e9d8548
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
3e9d8548
编写于
2月 01, 2023
作者:
L
LiYuRio
提交者:
GitHub
2月 01, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix gc and infinite buffer size (#50122)
上级
9f231147
变更
5
隐藏空白更改
内联
并排
Showing
5 changed files
with
56 additions
and
39 deletions
+56
-39
paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
...e/fluid/distributed/fleet_executor/compute_interceptor.cc
+25
-17
paddle/fluid/distributed/fleet_executor/compute_interceptor.h
...le/fluid/distributed/fleet_executor/compute_interceptor.h
+2
-0
paddle/fluid/distributed/fleet_executor/fleet_executor.cc
paddle/fluid/distributed/fleet_executor/fleet_executor.cc
+14
-13
python/paddle/fluid/executor.py
python/paddle/fluid/executor.py
+4
-3
python/paddle/fluid/tests/unittests/test_fleet_executor_cond_interceptor.py
...d/tests/unittests/test_fleet_executor_cond_interceptor.py
+11
-6
未找到文件。
paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
浏览文件 @
3e9d8548
...
...
@@ -50,14 +50,17 @@ void ComputeInterceptor::IncreaseReady(int64_t up_id) {
auto
max_ready_size
=
it
->
second
.
first
;
auto
ready_size
=
it
->
second
.
second
;
ready_size
+=
1
;
PADDLE_ENFORCE_LE
(
ready_size
,
max_ready_size
,
platform
::
errors
::
OutOfRange
(
"upstream=%lld ready_size must <= max_ready_size, but "
"now ready_size=%lld, max_ready_size=%lld"
,
up_id
,
ready_size
,
max_ready_size
));
if
(
max_ready_size
!=
INFINITE_BUFFER_SIZE
)
{
PADDLE_ENFORCE_LE
(
ready_size
,
max_ready_size
,
platform
::
errors
::
OutOfRange
(
"upstream=%lld ready_size must <= max_ready_size, but "
"now ready_size=%lld, max_ready_size=%lld"
,
up_id
,
ready_size
,
max_ready_size
));
}
it
->
second
.
second
=
ready_size
;
}
...
...
@@ -96,6 +99,9 @@ bool ComputeInterceptor::CanWriteOutput() {
for
(
auto
&
outs
:
out_buffs_
)
{
auto
max_buffer_size
=
outs
.
second
.
first
;
auto
used_size
=
outs
.
second
.
second
;
if
(
max_buffer_size
==
INFINITE_BUFFER_SIZE
)
{
continue
;
}
// full, return false
if
(
used_size
==
max_buffer_size
)
{
VLOG
(
3
)
<<
"Interceptor "
<<
GetInterceptorId
()
...
...
@@ -112,15 +118,17 @@ void ComputeInterceptor::SendDataReadyToDownStream() {
auto
max_buff_size
=
outs
.
second
.
first
;
auto
used_size
=
outs
.
second
.
second
;
used_size
+=
1
;
PADDLE_ENFORCE_LE
(
used_size
,
max_buff_size
,
platform
::
errors
::
OutOfRange
(
"downstream=%lld used buff size must <= "
"max_buff_size, but now used_size=%lld, "
"max_buff_size=%lld"
,
down_id
,
used_size
,
max_buff_size
));
if
(
max_buff_size
!=
INFINITE_BUFFER_SIZE
)
{
PADDLE_ENFORCE_LE
(
used_size
,
max_buff_size
,
platform
::
errors
::
OutOfRange
(
"downstream=%lld used buff size must <= "
"max_buff_size, but now used_size=%lld, "
"max_buff_size=%lld"
,
down_id
,
used_size
,
max_buff_size
));
}
outs
.
second
.
second
=
used_size
;
InterceptorMessage
ready_msg
;
...
...
paddle/fluid/distributed/fleet_executor/compute_interceptor.h
浏览文件 @
3e9d8548
...
...
@@ -22,6 +22,8 @@
namespace
paddle
{
namespace
distributed
{
const
int64_t
INFINITE_BUFFER_SIZE
=
-
1
;
class
ComputeInterceptor
:
public
Interceptor
{
public:
ComputeInterceptor
(
int64_t
interceptor_id
,
TaskNode
*
node
);
...
...
paddle/fluid/distributed/fleet_executor/fleet_executor.cc
浏览文件 @
3e9d8548
...
...
@@ -111,21 +111,22 @@ void FleetExecutor::Init(
task_node
->
SetUnusedVars
(
unused_vars
);
if
(
task_node
->
type
()
==
"Cond"
)
{
std
::
vector
<
std
::
string
>
while_block_vars
;
std
::
vector
<
std
::
string
>
vars_in_parent
;
std
::
vector
<
std
::
string
>
vars_in_sub
;
for
(
auto
&
var
:
program_desc
.
Block
(
0
).
AllVars
())
{
vars_in_parent
.
emplace_back
(
var
->
Name
());
}
VLOG
(
3
)
<<
"Vars in while sub block:"
;
for
(
auto
&
var
:
program_desc
.
Block
(
1
).
AllVars
())
{
vars_in_sub
.
emplace_back
(
var
->
Name
());
VLOG
(
3
)
<<
var
->
Name
();
while_block_vars
.
emplace_back
(
var
->
Name
());
}
for
(
const
auto
&
pair
:
unused_vars
)
{
if
(
pair
.
first
->
Type
()
==
"while"
)
{
for
(
const
auto
&
var_name
:
pair
.
second
)
{
while_block_vars
.
emplace_back
(
var_name
);
}
}
}
VLOG
(
3
)
<<
"Vars below will be removed after while:"
;
for
(
const
auto
&
name
:
while_block_vars
)
{
VLOG
(
3
)
<<
name
;
}
std
::
sort
(
vars_in_parent
.
begin
(),
vars_in_parent
.
end
());
std
::
sort
(
vars_in_sub
.
begin
(),
vars_in_sub
.
end
());
std
::
set_difference
(
vars_in_sub
.
begin
(),
vars_in_sub
.
end
(),
vars_in_parent
.
begin
(),
vars_in_parent
.
end
(),
std
::
back_inserter
(
while_block_vars
));
task_node
->
SetWhileBlockVars
(
while_block_vars
);
}
int64_t
interceptor_id
=
task_node
->
task_id
();
...
...
python/paddle/fluid/executor.py
浏览文件 @
3e9d8548
...
...
@@ -2534,8 +2534,9 @@ class Executor:
place
=
core
.
Place
()
place
.
set_place
(
self
.
place
)
# NOTE: the last argument is used to force create some vars in root scope,
# won't be used during train.
inference_root_scope_vars
=
(
fleet_opt
[
"fetch_var"
]
if
"fetch_var"
in
fleet_opt
else
[]
)
self
.
_fleet_executor
.
init
(
carrier_id
,
program
.
desc
,
...
...
@@ -2544,7 +2545,7 @@ class Executor:
num_micro_batches
,
tasks
,
task_id_to_rank
,
[]
,
inference_root_scope_vars
,
micro_scope_list
,
)
...
...
python/paddle/fluid/tests/unittests/test_fleet_executor_cond_interceptor.py
浏览文件 @
3e9d8548
...
...
@@ -165,19 +165,24 @@ class TestFleetExecutor(unittest.TestCase):
lazy_initialize
=
True
,
)
infinite_buff_size
=
-
1
task_a
.
add_downstream_task
(
task_b
.
task_id
(),
2
)
task_b
.
add_upstream_task
(
task_a
.
task_id
(),
2
)
task_b
.
add_downstream_task
(
task_c
.
task_id
(),
100
)
task_c
.
add_upstream_task
(
task_b
.
task_id
(),
100
)
task_b
.
add_downstream_task
(
task_c
.
task_id
(),
infinite_buff_size
)
task_c
.
add_upstream_task
(
task_b
.
task_id
(),
infinite_buff_size
)
task_c
.
add_downstream_task
(
task_d
.
task_id
(),
2
)
task_d
.
add_upstream_task
(
task_c
.
task_id
(),
2
)
task_d
.
add_downstream_task
(
task_b
.
task_id
(),
100
,
core
.
DependType
.
LOOP
)
task_b
.
add_upstream_task
(
task_d
.
task_id
(),
100
,
core
.
DependType
.
LOOP
)
task_d
.
add_downstream_task
(
task_b
.
task_id
(),
infinite_buff_size
,
core
.
DependType
.
LOOP
)
task_b
.
add_upstream_task
(
task_d
.
task_id
(),
infinite_buff_size
,
core
.
DependType
.
LOOP
)
task_b
.
add_downstream_task
(
task_e
.
task_id
(),
100
,
core
.
DependType
.
STOP_LOOP
task_e
.
task_id
(),
infinite_buff_size
,
core
.
DependType
.
STOP_LOOP
)
task_e
.
add_upstream_task
(
task_b
.
task_id
(),
100
,
core
.
DependType
.
STOP_LOOP
task_b
.
task_id
(),
infinite_buff_size
,
core
.
DependType
.
STOP_LOOP
)
main_program
.
_pipeline_opt
=
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录