Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
98c7191d
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
98c7191d
编写于
7月 09, 2021
作者:
Y
Yuang Liu
提交者:
GitHub
7月 09, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[hybrid performance] pipeline cache trainer (#33998)
上级
dfff52ea
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
31 addition
and
14 deletion
+31
-14
paddle/fluid/framework/device_worker.h
paddle/fluid/framework/device_worker.h
+3
-0
paddle/fluid/framework/pipeline_trainer.cc
paddle/fluid/framework/pipeline_trainer.cc
+14
-5
paddle/fluid/framework/section_worker.cc
paddle/fluid/framework/section_worker.cc
+5
-1
python/paddle/fluid/executor.py
python/paddle/fluid/executor.py
+9
-8
未找到文件。
paddle/fluid/framework/device_worker.h
浏览文件 @
98c7191d
...
...
@@ -581,6 +581,7 @@ class SectionWorker : public DeviceWorker {
void
RunUpdate
(
std
::
unique_ptr
<
GarbageCollector
>&
,
std
::
unordered_map
<
const
OperatorBase
*
,
std
::
vector
<
std
::
string
>>&
);
void
PrepareUnusedVar
();
protected:
int
section_id_
;
...
...
@@ -595,6 +596,8 @@ class SectionWorker : public DeviceWorker {
std
::
vector
<
std
::
unique_ptr
<
OperatorBase
>>
ops_
;
std
::
shared_ptr
<
framework
::
ProgramDesc
>
program_
;
std
::
unordered_map
<
const
OperatorBase
*
,
std
::
vector
<
std
::
string
>>
unused_vars_
;
static
uint64_t
batch_id_
;
platform
::
DeviceContext
*
dev_ctx_
=
nullptr
;
...
...
paddle/fluid/framework/pipeline_trainer.cc
浏览文件 @
98c7191d
...
...
@@ -113,19 +113,28 @@ void PipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
this_worker
->
SetRootScope
(
root_scope_
);
this_worker
->
SetMinibatchScope
(
minibatch_scope_
);
this_worker
->
SetMicrobatchScopes
(
microbatch_scopes_
);
this_worker
->
PrepareUnusedVar
();
}
void
PipelineTrainer
::
Run
()
{
VLOG
(
5
)
<<
"Going to run PipelineTrainer::Run()"
;
section_thread_
=
std
::
async
(
&
DeviceWorker
::
TrainFiles
,
worker_
.
get
());
}
void
PipelineTrainer
::
Finalize
()
{
try
{
section_thread_
.
get
();
worker_
->
TrainFiles
();
}
catch
(
platform
::
EOFException
&
e
)
{
std
::
rethrow_exception
(
std
::
current_exception
());
}
for
(
auto
*
micro_scop
:
microbatch_scopes_
)
{
// By default, we should delete all kid scopes after run executor because
// some operators may create local scope when running, such as while_op.
// But when while_op also create a local executor to run it's sub block,
// the sub scopes it created should not be dropped immediately, because
// while_grad_op will use some variables created during while_op run, so
// we need to keep the kids and wait for the outer executor to drop them.
micro_scop
->
DropKids
();
}
}
void
PipelineTrainer
::
Finalize
()
{
if
(
need_dump_field_
)
{
FinalizeDumpEnv
();
}
...
...
paddle/fluid/framework/section_worker.cc
浏览文件 @
98c7191d
...
...
@@ -96,12 +96,16 @@ void SectionWorker::RunUpdate(
}
}
void
SectionWorker
::
PrepareUnusedVar
()
{
VLOG
(
5
)
<<
"begin prepare the unsed vars"
;
unused_vars_
=
GetUnusedVars
(
program_
->
Block
(
0
),
ops_
,
skip_vars_
);
}
void
SectionWorker
::
TrainFiles
()
{
VLOG
(
5
)
<<
"begin section_worker TrainFiles"
;
int64_t
max_memory_size
=
GetEagerDeletionThreshold
();
std
::
unique_ptr
<
GarbageCollector
>
gc
;
auto
unused_vars_
=
GetUnusedVars
(
program_
->
Block
(
0
),
ops_
,
skip_vars_
);
if
(
max_memory_size
>=
0
)
{
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if
(
platform
::
is_gpu_place
(
place_
))
{
...
...
python/paddle/fluid/executor.py
浏览文件 @
98c7191d
...
...
@@ -1638,8 +1638,12 @@ class Executor(object):
dataset
.
_dynamic_adjust_before_train
(
trainer
.
proto_desc
.
thread_num
)
trainer_desc
=
trainer
.
_desc
()
# slow, cache
ctx
=
[
trainer_desc
,
dataset
,
scope
,
real_fetch_list
]
trainer_instance
=
self
.
_default_executor
.
init_for_dataset
(
program
.
desc
,
trainer_desc
,
scope
,
dataset
.
dataset
)
ctx
=
[
scope
,
real_fetch_list
,
trainer_instance
]
if
use_program_cache
:
self
.
_add_ctx_cache
(
cache_key
,
ctx
)
return
ctx
def
_run_pipeline
(
self
,
...
...
@@ -1654,20 +1658,17 @@ class Executor(object):
print_period
=
100
,
fetch_handler
=
None
,
use_program_cache
=
False
):
trainer_desc
,
dataset
,
scope
,
real_fetch_list
=
\
scope
,
real_fetch_list
,
trainer_instance
=
\
self
.
_prepare_pipeline_ctx
(
program
,
dataset
,
scope
,
thread
,
is_infer
,
debug
,
fetch_list
,
fetch_info
,
print_period
,
fetch_handler
,
use_program_cache
)
trainer_instance
=
self
.
_default_executor
.
init_for_dataset
(
program
.
desc
,
trainer_desc
,
scope
,
dataset
.
dataset
)
self
.
_default_executor
.
run_from_dataset
(
trainer_instance
)
self
.
_default_executor
.
release_trainer
(
trainer_instance
)
dataset
.
_dynamic_adjust_after_train
()
dataset
.
_finish_to_run
()
if
not
use_program_cache
:
self
.
_default_executor
.
release_trainer
(
trainer_instance
)
if
real_fetch_list
:
arr
=
scope
.
find_var
(
'fetch'
).
get_fetch_list
()
tensors
=
arr
.
_move_to_list
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录