PaddlePaddle / Paddle
Commit e39aa70e (unverified)
Authored on Jul 07, 2020 by lilong12; committed via GitHub on Jul 07, 2020

add the support for pipeline (#24560)
* add device_worker for pipeline, test=develop
Parent: 0dcb8754
Showing 8 changed files with 720 additions and 706 deletions (+720 / -706).
paddle/fluid/framework/device_worker.h                  +19   -65
paddle/fluid/framework/pipeline_trainer.cc              +153  -220
paddle/fluid/framework/section_worker.cc                +460  -340
paddle/fluid/framework/trainer.h                        +19   -37
paddle/fluid/framework/trainer_desc.proto               +2    -0
python/paddle/fluid/device_worker.py                    +3    -10
python/paddle/fluid/optimizer.py                        +1    -1
python/paddle/fluid/tests/unittests/test_pipeline.py    +63   -33
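As the commit message says, this change adds pipeline-parallel training support through a new SectionWorker device worker. A minimal sketch of the user-facing flow, modeled on the updated test_pipeline.py in this commit; the small fc network stands in for the ResNet used by the test, and the exact PipelineOptimizer call signature is assumed from the test's call site:

import numpy as np
import paddle.fluid as fluid

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    with fluid.device_guard("cpu"):            # data feeding section on CPU
        image = fluid.layers.data(name="image", shape=[3, 224, 224], dtype="float32")
        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
        loader = fluid.io.DataLoader.from_generator(feed_list=[image, label], capacity=64)
    with fluid.device_guard("gpu:0"):          # compute section on GPU 0
        fc = fluid.layers.fc(input=image, size=10)
        loss = fluid.layers.mean(
            fluid.layers.softmax_with_cross_entropy(logits=fc, label=label))
    opt = fluid.optimizer.MomentumOptimizer(learning_rate=0.01, momentum=0.9)
    # Wrap the base optimizer; each minibatch is split into 2 microbatches.
    opt = fluid.optimizer.PipelineOptimizer(opt, num_microbatches=2)
    opt.minimize(loss)

def train_reader():
    for _ in range(4):
        yield (np.random.random([3, 224, 224]).astype("float32"),
               np.random.random([1]).astype("int64"))

loader.set_sample_generator(train_reader, batch_size=1)

# A placeholder dataset is still required by train_from_dataset (see the test).
dataset = fluid.DatasetFactory().create_dataset("FileInstantDataset")
dataset.set_batch_size(1)
dataset.set_use_var([image, label])

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup_prog)
loader.start()
exe.train_from_dataset(main_prog, dataset)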
paddle/fluid/framework/device_worker.h
@@ -51,10 +51,6 @@ bool CheckValidOutput(LoDTensor* tensor, size_t batch_size);
class FleetWrapper;

#define SEC_LOG                                                               \
  VLOG(3) << "[s" << section_id_ << "p" << pipeline_id_ << "t" << thread_id_ \
          << "]: "

class PullDenseWorker {
 public:
  virtual ~PullDenseWorker() {}
@@ -311,40 +307,9 @@ class DownpourWorkerOpt : public DownpourWorker {
};

#if defined(PADDLE_WITH_NCCL)
using ScopeQueue = operators::reader::BlockingQueue<Scope*>;

class SyncFunctor {
 public:
  SyncFunctor(int rank_id, int rank_num, int sync_steps);
  virtual ~SyncFunctor() {}

  void SetSyncParam(const std::vector<std::string>& sync_param) {
    sync_param_ = &sync_param;
  }
  void SetNcclCtxMap(platform::NCCLContextMap* nccl_ctx_map) {
    nccl_ctx_map_ = nccl_ctx_map;
  }

  int operator()(Scope* scope);
  static std::vector<Scope*> pipeline_scopes_;
  static uint64_t sync_flag_;

 protected:
  const int rank_id_;
  const int rank_num_;
  const std::vector<std::string>* sync_param_ = nullptr;
  platform::NCCLContextMap* nccl_ctx_map_ = nullptr;

  uint64_t sync_signal_;
  const int sync_steps_;
  int counter_;

  void Synchronize();
};

class SectionWorker : public DeviceWorker {
 public:
  SectionWorker() {}
  SectionWorker() { local_batch_id_ = 0; }
  ~SectionWorker() override {}

  void Initialize(const TrainerDesc& desc) override;
@@ -360,50 +325,39 @@ class SectionWorker : public DeviceWorker {
  const platform::Place& place() const { return place_; }

  void SetSectionIndex(int section_id) { section_id_ = section_id; }
  void SetDeviceIndex(int tid) override { pipeline_id_ = tid; }
  void SetDeviceIndex(int tid) override {}
  void SetThreadIndex(int thread_id) { thread_id_ = thread_id; }
  void SetVarNames(const std::vector<std::string>& in_var_names,
                   const std::vector<std::string>& out_var_names) {
    in_var_names_ = &in_var_names;
    out_var_names_ = &out_var_names;
  }
  void SetScopeQueue(ScopeQueue* in_scope_queue, ScopeQueue* out_scope_queue) {
    in_scope_queue_ = in_scope_queue;
    out_scope_queue_ = out_scope_queue;
  }
  void SetMicrobatchNum(int num) { num_microbatches_ = num; }
  void SetMicrobatchScopes(const std::vector<Scope*>& scope) {
    microbatch_scopes_ = scope;
  }
  void SetCountMutex(std::mutex* mutex) { worker_count_mutex_ = mutex; }
  void SetWorkerCount(int* worker_count) { worker_count_ = worker_count; }
  void SetSectionNum(int section_num) { section_num_ = section_num; }
  void SetPipelineNum(int pipeline_num) { pipeline_num_ = pipeline_num; }
  void SetNextSectionPlace(const paddle::platform::Place& place) {
    next_section_place_ = place;
  }
  void SetMinibatchScope(const Scope* scope) { minibatch_scope_ = scope; }
  void SetSkipVars(const std::vector<std::string>& skip_vars) {
    skip_vars_ = skip_vars;
  }
  SyncFunctor* sync_func_ = nullptr;
  void SetSyncFunctor(SyncFunctor* sync_func) { sync_func_ = sync_func; }

  static std::atomic<int> cpu_id_;

 protected:
  void AutoSetCPUAffinity(bool reuse);
  int section_id_;
  int pipeline_id_;
  int section_num_;
  int pipeline_num_;
  int thread_id_;
  // This worker will consume scope from in_scope_queue_
  // and produce scope to out_scope_queue_
  ScopeQueue* in_scope_queue_ = nullptr;
  ScopeQueue* out_scope_queue_ = nullptr;
  const std::vector<std::string>* in_var_names_ = nullptr;
  const std::vector<std::string>* out_var_names_ = nullptr;
  std::mutex* worker_count_mutex_ = nullptr;
  int* worker_count_ = nullptr;
  paddle::platform::Place next_section_place_;

  int num_microbatches_;
  std::vector<Scope*> microbatch_scopes_;
  std::vector<std::string> skip_vars_;
  const Scope* minibatch_scope_;

  std::vector<std::unique_ptr<OperatorBase>> ops_;
  static std::mutex thread_mutex;
  static std::condition_variable thread_condition;
  static bool threads_completed;
  std::shared_ptr<framework::ProgramDesc> program_;
  static uint64_t batch_id_;
  uint64_t local_batch_id_;

  platform::DeviceContext* dev_ctx_ = nullptr;
};
#endif

}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/pipeline_trainer.cc
@@ -23,8 +23,13 @@ namespace framework {
void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
                                 Dataset* dataset) {
  pipeline_num_ = trainer_desc.thread_num();
  VLOG(3) << "pipeline num: " << pipeline_num_;
  const auto& section_params = trainer_desc.section_param();
  num_microbatches_ = section_params.num_microbatches();
  VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_;
  section_num_ = section_params.section_config_size();
  VLOG(3) << "Number of program sections: " << section_num_;
  trainer_desc_ = trainer_desc;
  start_cpu_core_id_ = section_params.start_cpu_core_id();

  SetDataset(dataset);
  ParseDumpConfig(trainer_desc);
@@ -32,96 +37,62 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
  const std::vector<paddle::framework::DataFeed*> readers =
      dataset->GetReaders();
  VLOG(3) << "readers num: " << readers.size();
  pipeline_config_ = trainer_desc.section_param();
  scope_queue_size_ = pipeline_config_.queue_size();
  sync_steps_ = pipeline_config_.sync_steps();
  section_num_ = pipeline_config_.section_config_size();
  VLOG(3) << "scope_queue_size: " << scope_queue_size_;
  VLOG(3) << "section num: " << section_num_;
  VLOG(3) << "sync_steps: " << sync_steps_;
  int num_readers = readers.size();
  PADDLE_ENFORCE_EQ(num_readers, 1,
                    platform::errors::InvalidArgument(
                        "Number of dataset readers for pipeline "
                        "must be 1 now, but the value you give is %d.",
                        num_readers));
  auto* reader = readers[0];
  feed_var_names_ = reader->GetUseSlotAlias();

  workers_.resize(section_num_);
  in_var_names_.resize(section_num_);
  out_var_names_.resize(section_num_);
  worker_count_.resize(section_num_);
  worker_count_mutex_.resize(section_num_);
  param_need_sync_.reset(new std::vector<std::string>);

  int reader_index = 0;
  for (int i = 0; i < section_num_; ++i) {
    const auto& section_config = pipeline_config_.section_config(i);
    int concurrency = section_config.concurrency();
    VLOG(3) << "the thread num of each pipeline in section " << i
            << " is: " << concurrency;
    in_var_names_[i].reset(new std::vector<std::string>(
        section_config.section_in_var_names().begin(),
        section_config.section_in_var_names().end()));
    out_var_names_[i].reset(new std::vector<std::string>(
        section_config.section_out_var_names().begin(),
        section_config.section_out_var_names().end()));
    worker_count_[i].resize(pipeline_num_);
    worker_count_mutex_[i].resize(pipeline_num_);
    for (int j = 0; j < pipeline_num_; ++j) {
      worker_count_[i][j] = new int(concurrency);
      worker_count_mutex_[i][j].reset(new std::mutex);
    }
    const auto& section_config = section_params.section_config(i);
    platform::Place place;
    workers_[i].resize(pipeline_num_);
    for (int j = 0; j < pipeline_num_; ++j) {
      workers_[i][j].resize(concurrency);
      int place_id = section_config.place_id();
      switch (section_config.place()) {
        case SectionConfig::CPUPlace:
          place = platform::CPUPlace();
          break;
        case SectionConfig::CUDAPlace:
          // Note that one section has at most one GPU place in one pipeline
          place = platform::CUDAPlace(j);
          PADDLE_ENFORCE_GE(
              place_id, 0,
              platform::errors::InvalidArgument(
                  "The place_id value for CUDAPlace shoud be greater "
                  "than or equal to 0, but the value you give is %d.",
                  place_id));
          place = platform::CUDAPlace(place_id);
          break;
        case SectionConfig::CUDAPinnedPlace:
          place = platform::CUDAPinnedPlace();
          break;
        default:
          PADDLE_ENFORCE(false, "Unkown place type in SectionConfig: %d",
                         section_config.place());
      }
          PADDLE_ENFORCE_NOT_NULL(
              nullptr, platform::errors::InvalidArgument(
                           "Unkown place type in SectionConfig: %d",
                           section_config.place()));
      }
      places_.emplace_back(place);
      VLOG(3) << "Device worker place: " << place
              << ", device id: " << place_id << ", section: " << i;
      for (int k = 0; k < concurrency; ++k) {
        workers_[i][j][k] = DeviceWorkerFactory::CreateDeviceWorker(
    workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
        trainer_desc.device_worker_name());
    auto this_worker =
        std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
            workers_[i][j][k]);
        this_worker->SetSectionIndex(i);
        this_worker->SetDeviceIndex(j);
        this_worker->SetThreadIndex(k);
        this_worker->SetSectionNum(section_num_);
        this_worker->SetPipelineNum(pipeline_num_);
            workers_[i]);
    if (i == 0) {
      this_worker->SetDataFeed(readers[reader_index++]);
      // we only set reader for the first section
      this_worker->SetDataFeed(reader);
      this_worker->SetReaderPlace(place);
    }
    if (i == section_num_ - 1) {
      this_worker->SetNeedDumpField(need_dump_field_);
      this_worker->SetNeedDumpParam(need_dump_param_);
      this_worker->SetDumpFieldVector(dump_fields_);
      this_worker->SetDumpParamVector(dump_param_);
    }
    this_worker->SetThreadIndex(i);
    this_worker->SetSectionIndex(i);
    this_worker->SetPlace(place);
    this_worker->Initialize(trainer_desc);
    this_worker->InitRandomDumpConfig(trainer_desc);
      }
    }
  }
  param_need_sync_.reset(new std::vector<std::string>(
      pipeline_config_.param_need_sync().begin(),
      pipeline_config_.param_need_sync().end()));
  VLOG(3) << "param_need_sync_ have: ";
  for (const std::string& name : *param_need_sync_) {
    VLOG(3) << name;
    this_worker->SetMicrobatchNum(num_microbatches_);
  }
  // set debug here
  SetDebug(trainer_desc.debug());
@@ -140,13 +111,7 @@ std::string PipelineTrainer::GetDumpPath(int tid) {
void PipelineTrainer::InitDumpEnv() {
  queue_ = paddle::framework::MakeChannel<std::string>();
  // Only set dump channel on the last section
  for (int j = 0; j < pipeline_num_; ++j) {
    for (size_t k = 0; k < workers_[section_num_ - 1][j].size(); ++k) {
      workers_[section_num_ - 1][j][k]->SetChannelWriter(queue_.get());
    }
  }
  // TODO(hutuxian): should make it as a config
  // TODO(sandyhouse): should make it as a config
  dump_thread_num_ = 1;
  for (int i = 0; i < dump_thread_num_; i++) {
    dump_thread_.push_back(
@@ -154,150 +119,105 @@ void PipelineTrainer::InitDumpEnv() {
  }
}

void PipelineTrainer::InitFirstScopeQueue(ScopeQueue* scope_queue,
                                          int pipeline_id,
                                          const ProgramDesc& main_program,
                                          const Scope& root_scope) {
  for (int i = 0; i < scope_queue_size_; ++i) {
    Scope* scope = &pipeline_scopes_[pipeline_id]->NewScope();
    for (auto& var : main_program.Block(0).AllVars()) {
      if (!var->Persistable()) {
        auto* ptr = scope->Var(var->Name());
        InitializeVariable(ptr, var->GetType());
void PipelineTrainer::CopyParameters(int section_id, int microbatch_id,
                                     const ProgramDesc& program,
                                     const platform::Place& place) {
  auto& global_block = program.Block(0);
  for (auto& var : global_block.AllVars()) {
    int is_feed_var =
        std::count(feed_var_names_.begin(), feed_var_names_.end(), var->Name());
    if ((var->Persistable() || is_feed_var) && microbatch_id == 0) {
      if (is_feed_var) {
        auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
        VLOG(3) << "data name: " << var->Name() << ", ptr: " << new_ptr;
        InitializeVariable(new_ptr, var->GetType());
      } else {
        if (section_num_ == 1) {
          // Means only one section and it must be
          // CUDAPlace, so copy all persistable vars to
          // pipeline scope
          const LoDTensor& root_tensor =
              root_scope.FindVar(var->Name())->Get<LoDTensor>();
          LoDTensor* gpu_tensor = pipeline_scopes_[pipeline_id]
                                      ->Var(var->Name())
                                      ->GetMutable<LoDTensor>();
          platform::Place place = platform::CUDAPlace(pipeline_id);
        auto* ptr = root_scope_->FindVar(var->Name());
        auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
        VLOG(3) << "Create persistable var " << var->Name()
                << " for minibatch " << section_id << ", which pointer is "
                << new_ptr;
        InitializeVariable(new_ptr, var->GetType());
        const LoDTensor& root_tensor = ptr->Get<LoDTensor>();
        LoDTensor* minibatch_tensor = new_ptr->GetMutable<LoDTensor>();
          TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
                     static_cast<Tensor*>(gpu_tensor));
        }
      }
        TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
                   static_cast<Tensor*>(minibatch_tensor));
      }
    } else if (!var->Persistable() && !is_feed_var) {
      auto* ptr =
          microbatch_scopes_[section_id][microbatch_id]->Var(var->Name());
      VLOG(3) << "Create variable " << var->Name() << " for section "
              << section_id << " microbatch " << microbatch_id
              << ", which pointer is " << ptr;
      InitializeVariable(ptr, var->GetType());
    }
    scope_queue->Send(scope);
  }
}

void PipelineTrainer::CopyParameters(const Scope& root_scope,
                                     int pipeline_id) {
  for (const std::string& name : *param_need_sync_) {
    const LoDTensor& root_tensor = root_scope.FindVar(name)->Get<LoDTensor>();
    // TODO(hutxian): check a new var of the same name is created in
    // pipeline_scope
    LoDTensor* gpu_tensor =
        pipeline_scopes_[pipeline_id]->Var(name)->GetMutable<LoDTensor>();
    platform::Place place = platform::CUDAPlace(pipeline_id);
    TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
               static_cast<Tensor*>(gpu_tensor));
void PipelineTrainer::GetSkipVars(int section_id, const ProgramDesc& program) {
  auto& global_block = program.Block(0);
  for (auto& op : global_block.AllOps()) {
    if (op->Type() != "enqueue") {
      continue;
    }
    auto input_arg_names = op->InputArgumentNames();
    PADDLE_ENFORCE_EQ(input_arg_names.size(), 1,
                      platform::errors::InvalidArgument(
                          "Number of input arguments for enqueue op must be 1, "
                          "but the value is %d.",
                          input_arg_names.size()));
    std::string input_arg_name = input_arg_names[0];
    if (input_arg_name.rfind("@GRAD") != input_arg_name.size() - 5) {
      skip_vars_[section_id].emplace_back(input_arg_name);
      VLOG(3) << "add skip var name: " << input_arg_name;
    }
  }
}

void PipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
                                     const platform::Place& place) {
  PADDLE_ENFORCE(root_scope_, "Null root_scope pointer");
  SectionWorker::cpu_id_.store(pipeline_config_.start_cpu_core_id());
  scope_queues_.resize(section_num_);
  pipeline_scopes_.resize(pipeline_num_);
  for (auto& var : main_program.Block(0).AllVars()) {
    if (var->Persistable()) {
      persistable_vars_.push_back(var->Name());
    }
  }
  PADDLE_ENFORCE_NOT_NULL(root_scope_,
                          platform::errors::InvalidArgument(
                              "root_scope pointer can not be nullptr"));
  auto start_cpu_id = trainer_desc_.section_param().start_cpu_core_id();
  SectionWorker::cpu_id_.store(start_cpu_id);
  minibatch_scopes_.resize(section_num_);
  microbatch_scopes_.resize(section_num_);
  skip_vars_.resize(section_num_);

  VLOG(3) << "Init ScopeQueues and create all scopes";
  for (int i = 0; i < section_num_; ++i) {
    for (int j = 0; j < pipeline_num_; ++j) {
      scope_queues_[i].emplace_back(new ScopeQueue(scope_queue_size_));
      if (i == 0) {
        pipeline_scopes_[j] = &root_scope_->NewScope();
        CopyParameters(*root_scope_, j);
        InitFirstScopeQueue(scope_queues_[0].back().get(), j, main_program,
                            *root_scope_);
      }
    minibatch_scopes_[i] = &root_scope_->NewScope();
    std::shared_ptr<framework::ProgramDesc> program;
    program.reset(new ProgramDesc(
        trainer_desc_.section_param().section_config(i).program_desc()));
    microbatch_scopes_[i].resize(num_microbatches_);
    for (int j = 0; j < num_microbatches_; ++j) {
      microbatch_scopes_[i][j] = &minibatch_scopes_[i]->NewScope();
      CopyParameters(i, j, *program, places_[i]);
    }
    GetSkipVars(i, *program);
  }

  for (int i = 0; i < section_num_; ++i) {
    for (int j = 0; j < pipeline_num_; ++j) {
      for (size_t k = 0; k < workers_[i][j].size(); ++k) {
        auto this_worker =
            std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
                workers_[i][j][k]);
            workers_[i]);
        this_worker->SetRootScope(root_scope_);
        this_worker->SetCountMutex(worker_count_mutex_[i][j].get());
        this_worker->SetWorkerCount(worker_count_[i][j]);
        this_worker->SetScopeQueue(scope_queues_[i][j].get(),
                                   (i == section_num_ - 1)
                                       ? scope_queues_[0][j].get()
                                       : scope_queues_[i + 1][j].get());
        this_worker->SetVarNames(*in_var_names_[i], *out_var_names_[i]);
        if (i != section_num_ - 1) {
          // For data copy in adjacent different place
          this_worker->SetNextSectionPlace(
              std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
                  workers_[i + 1][j][0])
                  ->place());
        }
      }
    }
  }
  if (pipeline_num_ > 1 && sync_steps_ != -1) {
    construct_sync_functor();
  }
}

void PipelineTrainer::construct_sync_functor() {
  std::vector<platform::Place> cuda_places;
  for (int i = 0; i < pipeline_num_; ++i) {
    cuda_places.emplace_back(platform::CUDAPlace(i));
  }
  nccl_ctx_map_.reset(new platform::NCCLContextMap(cuda_places));
  sync_functors_.resize(pipeline_num_);
  SyncFunctor::sync_flag_ = 0;
  SyncFunctor::pipeline_scopes_.resize(0);

  for (int j = 0; j < pipeline_num_; ++j) {
    SyncFunctor* sync_function = new SyncFunctor(j, pipeline_num_, sync_steps_);
    sync_function->SetSyncParam(*param_need_sync_);
    sync_function->SetNcclCtxMap(nccl_ctx_map_.get());
    SyncFunctor::pipeline_scopes_.push_back(this->pipeline_scopes_[j]);
    sync_functors_[j].reset(sync_function);
  }
  for (int i = section_num_ - 1; i >= 0; --i) {
    if (SectionConfig::CUDAPlace ==
        pipeline_config_.section_config(i).place()) {
      for (int j = 0; j < pipeline_num_; ++j) {
        for (size_t k = 0; k < workers_[i][j].size(); ++k) {
          auto this_worker =
              std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
                  workers_[i][j][k]);
          this_worker->SetSyncFunctor(sync_functors_[j].get());
        }
      }
      break;
    }
    this_worker->SetMinibatchScope(minibatch_scopes_[i]);
    this_worker->SetMicrobatchScopes(microbatch_scopes_[i]);
    this_worker->SetSkipVars(skip_vars_[i]);
  }
}

void PipelineTrainer::Run() {
  VLOG(3) << "Going to run";
  for (int i = 0; i < section_num_; ++i) {
    for (int j = 0; j < pipeline_num_; ++j) {
      for (size_t k = 0; k < workers_[i][j].size(); ++k) {
        if (!debug_) {
          section_threads_.push_back(
              std::thread(&DeviceWorker::TrainFiles, workers_[i][j][k].get()));
          std::thread(&DeviceWorker::TrainFiles, workers_[i].get()));
        } else {
          section_threads_.push_back(std::thread(
              &DeviceWorker::TrainFilesWithProfiler, workers_[i][j][k].get()));
        }
      }
          &DeviceWorker::TrainFilesWithProfiler, workers_[i].get()));
    }
  }
}
@@ -309,18 +229,31 @@ void PipelineTrainer::Finalize() {
  if (need_dump_field_) {
    FinalizeDumpEnv();
  }
  for (const auto& var : persistable_vars_) {
    auto* root_tensor = root_scope_->Var(var)->GetMutable<LoDTensor>();
    // TODO(hutuxian): Add a final all-reduce?
    const auto& thread_tensor =
        pipeline_scopes_[0]->FindVar(var)->Get<LoDTensor>();
    TensorCopySync(thread_tensor, platform::CPUPlace(), root_tensor);
  VLOG(3) << "copying back parameters. ";
  for (int i = 0; i < section_num_; ++i) {
    std::shared_ptr<framework::ProgramDesc> program;
    program.reset(new ProgramDesc(
        trainer_desc_.section_param().section_config(i).program_desc()));
    for (int j = 0; j < num_microbatches_; ++j) {
      auto& global_block = program->Block(0);
      for (auto& var : global_block.AllVars()) {
        if (var->Persistable()) {
          auto* ptr = root_scope_->FindVar(var->Name());
          LoDTensor* root_tensor = ptr->GetMutable<LoDTensor>();
          auto* minibatch_ptr = minibatch_scopes_[i]->Var(var->Name());
          const LoDTensor& minibatch_tensor = minibatch_ptr->Get<LoDTensor>();
          TensorCopy(*static_cast<const Tensor*>(&minibatch_tensor),
                     places_[0], static_cast<Tensor*>(root_tensor));
          VLOG(4) << "Copy persitable var " << var->Name() << " to root scope";
        }
      }
    }
  }
  root_scope_->DropKids();
}

Scope* PipelineTrainer::GetWorkerScope(int thread_id) {
  return pipeline_scopes_[thread_id];
  return microbatch_scopes_[thread_id][0];
}

}  // end namespace framework
paddle/fluid/framework/section_worker.cc
This diff is collapsed and not shown here.
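Since the section_worker.cc diff is collapsed, here is a rough, hypothetical Python sketch of the loop structure the new SectionWorker members (ops_, microbatch_scopes_, skip_vars_) suggest: the section's ops run once per microbatch, each microbatch in its own scope. This is only an assumption drawn from the header changes above, not the actual implementation.

# Toy illustration (not Paddle code) of a per-microbatch section loop.
def run_minibatch(ops, microbatch_scopes, skip_vars):
    for microbatch_id, scope in enumerate(microbatch_scopes):
        for op in ops:
            op(scope)  # run this section's ops on the microbatch's scope
        # variables named in skip_vars are assumed to be kept for later use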
paddle/fluid/framework/trainer.h
@@ -137,49 +137,31 @@ class PipelineTrainer : public TrainerBase {
  virtual Scope* GetWorkerScope(int thread_id);
  void InitDumpEnv() override;
  virtual std::string GetDumpPath(int tid);
  void GetSkipVars(int section_id, const ProgramDesc& main_program);

 protected:
  int section_num_;
  int pipeline_num_;
  int scope_queue_size_;
  int sync_steps_;
  int num_microbatches_;
  int start_cpu_core_id_;
  std::vector<std::string> feed_var_names_;
  std::vector<platform::Place> places_;
  std::vector<std::vector<std::string>> skip_vars_;
  TrainerDesc trainer_desc_;

  SectionWorkerParameter pipeline_config_;

  // The in/output var names for each section
  std::vector<std::unique_ptr<std::vector<std::string>>> in_var_names_;
  std::vector<std::unique_ptr<std::vector<std::string>>> out_var_names_;

  // Counter for the running thread
  std::vector<std::vector<int*>> worker_count_;
  std::vector<std::vector<std::unique_ptr<std::mutex>>> worker_count_mutex_;

  // worker: [section_id][pipeline_id][thread_id]
  std::vector<std::vector<
      std::vector<std::shared_ptr<paddle::framework::DeviceWorker>>>>
      workers_;
  std::vector<std::thread> section_threads_;

  // We use scope to maintain context info, and scopes
  // will be deliverd between different sections.
  std::vector<std::vector<std::unique_ptr<ScopeQueue>>> scope_queues_;
  std::vector<Scope*> pipeline_scopes_;

  // The parameters that should be syncronized between different cards using
  // nccl all-reduce
  std::shared_ptr<std::vector<std::string>> param_need_sync_;
  std::vector<std::string> persistable_vars_;
  std::vector<std::unique_ptr<SyncFunctor>> sync_functors_;
  std::shared_ptr<platform::NCCLContextMap> nccl_ctx_map_;

  std::vector<DataFeed*> readers_;

  void InitFirstScopeQueue(ScopeQueue* scope_queue, int pipeline_id,
                           const ProgramDesc& main_program,
                           const Scope& root_scope);
  void CopyParameters(const Scope& root_scope, int pipeline_id);
  void construct_sync_functor();
  // worker: [section_id]
  std::vector<std::shared_ptr<paddle::framework::DeviceWorker>> workers_;
  // minibatch_scopes_: [section_id]
  std::vector<Scope*> minibatch_scopes_;
  // microbatch_scopes_: [section_id][microbatch_id]
  std::vector<std::vector<Scope*>> microbatch_scopes_;

  void CopyParameters(int section_id, int microbatch_id,
                      const ProgramDesc& program,
                      const platform::Place& place);
  bool isPersistableVarGrad(std::string name);
  bool isPersistable(VarDesc* var);
};
#endif

}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/trainer_desc.proto
@@ -83,6 +83,7 @@ message SectionWorkerParameter {
  optional int64 sync_steps = 3 [ default = 1 ];
  optional int32 start_cpu_core_id = 4 [ default = 1 ];
  repeated string param_need_sync = 5;
  optional int32 num_microbatches = 6;
}

message SectionConfig {
@@ -99,6 +100,7 @@ message SectionConfig {
  optional int32 concurrency = 3 [ default = 1 ];
  repeated string section_in_var_names = 4;
  repeated string section_out_var_names = 5;
  optional int32 place_id = 6 [ default = -1 ];
}

message FetchConfig {
python/paddle/fluid/device_worker.py
@@ -403,11 +403,8 @@ class Section(DeviceWorker):
        trainer_desc.device_worker_name = "SectionWorker"
        pipeline_opt = self._program._pipeline_opt
        section_param = trainer_desc.section_param
        section_param.queue_size = pipeline_opt["queue_size"]
        section_param.sync_steps = pipeline_opt["sync_steps"]
        section_param.num_microbatches = pipeline_opt["num_microbatches"]
        section_param.start_cpu_core_id = pipeline_opt["start_cpu_core_id"]
        for e in pipeline_opt["param_need_sync"]:
            section_param.param_need_sync.append(e)
        for i, program in enumerate(pipeline_opt["section_program_list"]):
            cfg = section_param.section_config.add()
            cfg.program_desc.ParseFromString(program["program"]._get_desc()
@@ -415,6 +412,7 @@ class Section(DeviceWorker):
            # TODO: why does not work
            # cfg.program_desc.CopyFrom(program.program._get_desc())
            place = pipeline_opt["place_list"][i]
            place_id = pipeline_opt["place_id_list"][i]
            if isinstance(place, core.CPUPlace):
                cfg.place = cfg.CPUPlace
            elif isinstance(place, core.CUDAPlace):
@@ -425,12 +423,7 @@ class Section(DeviceWorker):
                raise NotImplementedError(
                    "SectionWorker only supports CPUPlace, CUDAPlace and CUDAPinnedPlace now."
                )
            cfg.concurrency = pipeline_opt["concurrency_list"][i]
            for var in program["input_set"]:
                cfg.section_in_var_names.append(var)
            for var in program["output_set"]:
                cfg.section_out_var_names.append(var)
            cfg.place_id = place_id


class DeviceWorkerFactory(object):
python/paddle/fluid/optimizer.py
@@ -4474,7 +4474,7 @@ class PipelineOptimizer(object):
"place_list"
:
place_list
,
"place_id_list"
:
place_id_list
,
"sync_steps"
:
-
1
,
"
queue_size
"
:
self
.
_num_microbatches
,
"
num_microbatches
"
:
self
.
_num_microbatches
,
"start_cpu_core_id"
:
self
.
_start_cpu_core_id
,
}
return
optimize_ops
,
params_grads
,
program_list
...
...
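For reference, a sketch of the `_pipeline_opt` dictionary that PipelineOptimizer attaches to the program and that `Section._gen_worker_desc` (above) consumes. All keys shown appear in the diffs in this commit; the concrete values are illustrative only.

import paddle.fluid as fluid

num_microbatches = 2
# Illustrative shape of the options dict; values here are made up.
pipeline_opt = {
    "section_program_list": [],                    # one sub-program entry per section
    "place_list": [fluid.CPUPlace(), fluid.CUDAPlace(0)],
    "place_id_list": [-1, 0],
    "concurrency_list": [1, 1],
    "param_need_sync": [],
    "sync_steps": -1,
    "queue_size": num_microbatches,                # kept equal to num_microbatches here
    "num_microbatches": num_microbatches,          # field threaded through by this commit
    "start_cpu_core_id": 0,
}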
python/paddle/fluid/tests/unittests/test_pipeline.py
@@ -100,7 +100,7 @@ def build_network(input, layers=50, class_dim=1000):
        pool_type='max')
    if layers >= 50:
        for block in range(len(depth)):
            with fluid.device_guard("cpu"):
            with fluid.device_guard("gpu:0"):
                for i in range(depth[block]):
                    conv = bottleneck_block(
                        input=conv,
@@ -118,7 +118,7 @@ def build_network(input, layers=50, class_dim=1000):
                initializer=fluid.initializer.Uniform(-stdv, stdv)))
    else:
        for block in range(len(depth)):
            with fluid.device_guard("cpu"):
            with fluid.device_guard("gpu:0"):
                for i in range(depth[block]):
                    conv = basic_block(
                        input=conv,
@@ -140,11 +140,15 @@ def build_network(input, layers=50, class_dim=1000):
class TestPipeline(unittest.TestCase):
    """ TestCases for Pipeline Training. """

    def test_pipeline(self):
    def _run(self, debug):
        main_prog = fluid.Program()
        startup_prog = fluid.Program()
        with fluid.program_guard(main_prog, startup_prog):
            with fluid.device_guard("cpu"):
                image = fluid.layers.data(
                    name="image", shape=[3, 224, 224], dtype="float32")
                label = fluid.layers.data(name="label", shape=[1], dtype="int64")
                label = fluid.layers.data(name="label", shape=[1], dtype="int64")
                data_loader = fluid.io.DataLoader.from_generator(
                    feed_list=[image, label],
                    capacity=64,
@@ -165,7 +169,7 @@ class TestPipeline(unittest.TestCase):
            bd = [steps_per_pass * p for p in passes]
            lr = [base_lr * (0.1 ** i) for i in range(len(bd) + 1)]
            lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
            optimizer = fluid.optimizer.Momentum(
            optimizer = fluid.optimizer.MomentumOptimizer(
                lr_val,
                momentum=0.9,
                regularization=fluid.regularizer.L2Decay(1e-4))
@@ -173,6 +177,32 @@ class TestPipeline(unittest.TestCase):
                optimizer, num_microbatches=2)
            optimizer.minimize(loss)

            def train_reader():
                for _ in range(4):
                    img = np.random.random(size=[3, 224, 224]).astype('float32')
                    label = np.random.random(size=[1]).astype('int64')
                    yield img, label

            data_loader.set_sample_generator(train_reader, batch_size=1)

        place = fluid.CPUPlace()
        # The following dataset is only used for the
        # interface 'train_from_dataset'.
        # And it has no actual meaning.
        dataset = fluid.DatasetFactory().create_dataset('FileInstantDataset')
        dataset.set_batch_size(1)
        dataset.set_thread(1)
        dataset.set_filelist(['/tmp/tmp_2.txt'])
        dataset.set_use_var([image, label])

        exe = fluid.Executor(place)
        exe.run(startup_prog)
        data_loader.start()
        exe.train_from_dataset(main_prog, dataset, debug=debug)

    def test_pipeline(self):
        self._run(False)
        self._run(True)

    def test_pipeline_noneoptimizer(self):
        with fluid.device_guard("gpu:0"):
            x = fluid.layers.data(