Commit e39aa70e (unverified)
Authored on Jul 07, 2020 by lilong12; committed via GitHub on Jul 07, 2020.
add the support for pipeline (#24560)

* add device_worker for pipeline, test=develop

Parent: 0dcb8754
Showing 8 changed files with 720 additions and 706 deletions.
Changed files:
    paddle/fluid/framework/device_worker.h                  +19   -65
    paddle/fluid/framework/pipeline_trainer.cc              +153  -220
    paddle/fluid/framework/section_worker.cc                +460  -340
    paddle/fluid/framework/trainer.h                        +19   -37
    paddle/fluid/framework/trainer_desc.proto               +2    -0
    python/paddle/fluid/device_worker.py                    +3    -10
    python/paddle/fluid/optimizer.py                        +1    -1
    python/paddle/fluid/tests/unittests/test_pipeline.py    +63   -33
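For context, the end-to-end usage this change targets, condensed from the updated test_pipeline.py included in this commit. This is only a sketch: the fc layer and its size below are illustrative, while the device_guard / PipelineOptimizer(num_microbatches=...) pattern and the train_from_dataset entry point are taken from that test.

    import paddle.fluid as fluid

    # Build a program whose ops are split into per-device sections.
    with fluid.device_guard("cpu"):            # feed section runs on CPU
        image = fluid.layers.data(name="image", shape=[3, 224, 224], dtype="float32")
        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    with fluid.device_guard("gpu:0"):          # compute section runs on GPU 0
        fc = fluid.layers.fc(input=image, size=10)   # illustrative layer only
        loss = fluid.layers.mean(
            fluid.layers.softmax_with_cross_entropy(logits=fc, label=label))

    opt = fluid.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
    # num_microbatches is the new knob plumbed through trainer_desc.proto below.
    opt = fluid.optimizer.PipelineOptimizer(opt, num_microbatches=2)
    opt.minimize(loss)
    # Training then goes through Executor.train_from_dataset(), which builds the
    # PipelineTrainer / SectionWorker pair this commit rewrites.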
paddle/fluid/framework/device_worker.h

@@ -51,10 +51,6 @@ bool CheckValidOutput(LoDTensor* tensor, size_t batch_size);
 class FleetWrapper;

-#define SEC_LOG                                                               \
-  VLOG(3) << "[s" << section_id_ << "p" << pipeline_id_ << "t" << thread_id_ \
-          << "]: "
-
 class PullDenseWorker {
  public:
   virtual ~PullDenseWorker() {}

@@ -311,40 +307,9 @@ class DownpourWorkerOpt : public DownpourWorker {
 };

 #if defined(PADDLE_WITH_NCCL)
-using ScopeQueue = operators::reader::BlockingQueue<Scope*>;
-
-class SyncFunctor {
- public:
-  SyncFunctor(int rank_id, int rank_num, int sync_steps);
-  virtual ~SyncFunctor() {}
-
-  void SetSyncParam(const std::vector<std::string>& sync_param) {
-    sync_param_ = &sync_param;
-  }
-  void SetNcclCtxMap(platform::NCCLContextMap* nccl_ctx_map) {
-    nccl_ctx_map_ = nccl_ctx_map;
-  }
-
-  int operator()(Scope* scope);
-  static std::vector<Scope*> pipeline_scopes_;
-  static uint64_t sync_flag_;
-
- protected:
-  const int rank_id_;
-  const int rank_num_;
-  const std::vector<std::string>* sync_param_ = nullptr;
-  platform::NCCLContextMap* nccl_ctx_map_ = nullptr;
-
-  uint64_t sync_signal_;
-  const int sync_steps_;
-  int counter_;
-
-  void Synchronize();
-};
-
 class SectionWorker : public DeviceWorker {
  public:
-  SectionWorker() {}
+  SectionWorker() { local_batch_id_ = 0; }
   ~SectionWorker() override {}

   void Initialize(const TrainerDesc& desc) override;

@@ -360,50 +325,39 @@ class SectionWorker : public DeviceWorker {
   const platform::Place& place() const { return place_; }

   void SetSectionIndex(int section_id) { section_id_ = section_id; }
-  void SetDeviceIndex(int tid) override { pipeline_id_ = tid; }
+  void SetDeviceIndex(int tid) override {}
   void SetThreadIndex(int thread_id) { thread_id_ = thread_id; }
-  void SetVarNames(const std::vector<std::string>& in_var_names,
-                   const std::vector<std::string>& out_var_names) {
-    in_var_names_ = &in_var_names;
-    out_var_names_ = &out_var_names;
-  }
-  void SetScopeQueue(ScopeQueue* in_scope_queue, ScopeQueue* out_scope_queue) {
-    in_scope_queue_ = in_scope_queue;
-    out_scope_queue_ = out_scope_queue;
+  void SetMicrobatchNum(int num) { num_microbatches_ = num; }
+  void SetMicrobatchScopes(const std::vector<Scope*>& scope) {
+    microbatch_scopes_ = scope;
   }
-  void SetCountMutex(std::mutex* mutex) { worker_count_mutex_ = mutex; }
-  void SetWorkerCount(int* worker_count) { worker_count_ = worker_count; }
-  void SetSectionNum(int section_num) { section_num_ = section_num; }
-  void SetPipelineNum(int pipeline_num) { pipeline_num_ = pipeline_num; }
-  void SetNextSectionPlace(const paddle::platform::Place& place) {
-    next_section_place_ = place;
+  void SetMinibatchScope(const Scope* scope) { minibatch_scope_ = scope; }
+  void SetSkipVars(const std::vector<std::string>& skip_vars) {
+    skip_vars_ = skip_vars;
   }
-  SyncFunctor* sync_func_ = nullptr;
-  void SetSyncFunctor(SyncFunctor* sync_func) { sync_func_ = sync_func; }

   static std::atomic<int> cpu_id_;

  protected:
   void AutoSetCPUAffinity(bool reuse);
   int section_id_;
-  int pipeline_id_;
-  int section_num_;
-  int pipeline_num_;
   int thread_id_;
-  // This worker will consume scope from in_scope_queue_
-  // and produce scope to out_scope_queue_
-  ScopeQueue* in_scope_queue_ = nullptr;
-  ScopeQueue* out_scope_queue_ = nullptr;
-  const std::vector<std::string>* in_var_names_ = nullptr;
-  const std::vector<std::string>* out_var_names_ = nullptr;
-  std::mutex* worker_count_mutex_ = nullptr;
-  int* worker_count_ = nullptr;
-  paddle::platform::Place next_section_place_;
+  int num_microbatches_;
+  std::vector<Scope*> microbatch_scopes_;
+  std::vector<std::string> skip_vars_;
+  const Scope* minibatch_scope_;

   std::vector<std::unique_ptr<OperatorBase>> ops_;
+  static std::mutex thread_mutex;
+  static std::condition_variable thread_condition;
+  static bool threads_completed;
+  std::shared_ptr<framework::ProgramDesc> program_;
   static uint64_t batch_id_;
   uint64_t local_batch_id_;

   platform::DeviceContext* dev_ctx_ = nullptr;
 };
 #endif

 }  // namespace framework
 }  // namespace paddle
paddle/fluid/framework/pipeline_trainer.cc

@@ -23,8 +23,13 @@ namespace framework {
 void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
                                  Dataset* dataset) {
-  pipeline_num_ = trainer_desc.thread_num();
-  VLOG(3) << "pipeline num: " << pipeline_num_;
+  const auto& section_params = trainer_desc.section_param();
+  num_microbatches_ = section_params.num_microbatches();
+  VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_;
+  section_num_ = section_params.section_config_size();
+  VLOG(3) << "Number of program sections: " << section_num_;
+  trainer_desc_ = trainer_desc;
+  start_cpu_core_id_ = section_params.start_cpu_core_id();

   SetDataset(dataset);
   ParseDumpConfig(trainer_desc);

@@ -32,96 +37,62 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
   const std::vector<paddle::framework::DataFeed*> readers =
       dataset->GetReaders();
   VLOG(3) << "readers num: " << readers.size();
-  pipeline_config_ = trainer_desc.section_param();
-  scope_queue_size_ = pipeline_config_.queue_size();
-  sync_steps_ = pipeline_config_.sync_steps();
-  section_num_ = pipeline_config_.section_config_size();
-  VLOG(3) << "scope_queue_size: " << scope_queue_size_;
-  VLOG(3) << "section num: " << section_num_;
-  VLOG(3) << "sync_steps: " << sync_steps_;
+  int num_readers = readers.size();
+  PADDLE_ENFORCE_EQ(num_readers, 1,
+                    platform::errors::InvalidArgument(
+                        "Number of dataset readers for pipeline "
+                        "must be 1 now, but the value you give is %d.",
+                        num_readers));
+  auto* reader = readers[0];
+  feed_var_names_ = reader->GetUseSlotAlias();

   workers_.resize(section_num_);
-  in_var_names_.resize(section_num_);
-  out_var_names_.resize(section_num_);
-  worker_count_.resize(section_num_);
-  worker_count_mutex_.resize(section_num_);
-  param_need_sync_.reset(new std::vector<std::string>);
-
-  int reader_index = 0;
   for (int i = 0; i < section_num_; ++i) {
-    const auto& section_config = pipeline_config_.section_config(i);
-    int concurrency = section_config.concurrency();
-    VLOG(3) << "the thread num of each pipeline in section " << i
-            << " is: " << concurrency;
-    in_var_names_[i].reset(new std::vector<std::string>(
-        section_config.section_in_var_names().begin(),
-        section_config.section_in_var_names().end()));
-    out_var_names_[i].reset(new std::vector<std::string>(
-        section_config.section_out_var_names().begin(),
-        section_config.section_out_var_names().end()));
-    worker_count_[i].resize(pipeline_num_);
-    worker_count_mutex_[i].resize(pipeline_num_);
-    for (int j = 0; j < pipeline_num_; ++j) {
-      worker_count_[i][j] = new int(concurrency);
-      worker_count_mutex_[i][j].reset(new std::mutex);
-    }
-
+    const auto& section_config = section_params.section_config(i);
     platform::Place place;
-    workers_[i].resize(pipeline_num_);
-    for (int j = 0; j < pipeline_num_; ++j) {
-      workers_[i][j].resize(concurrency);
-
-      switch (section_config.place()) {
-        case SectionConfig::CPUPlace:
-          place = platform::CPUPlace();
-          break;
-        case SectionConfig::CUDAPlace:
-          // Note that one section has at most one GPU place in one pipeline
-          place = platform::CUDAPlace(j);
-          break;
-        case SectionConfig::CUDAPinnedPlace:
-          place = platform::CUDAPinnedPlace();
-          break;
-        default:
-          PADDLE_ENFORCE(false, "Unkown place type in SectionConfig: %d",
-                         section_config.place());
-      }
+    int place_id = section_config.place_id();
+    switch (section_config.place()) {
+      case SectionConfig::CPUPlace:
+        place = platform::CPUPlace();
+        break;
+      case SectionConfig::CUDAPlace:
+        // Note that one section has at most one GPU place in one pipeline
+        PADDLE_ENFORCE_GE(
+            place_id, 0,
+            platform::errors::InvalidArgument(
+                "The place_id value for CUDAPlace shoud be greater "
+                "than or equal to 0, but the value you give is %d.",
+                place_id));
+        place = platform::CUDAPlace(place_id);
+        break;
+      case SectionConfig::CUDAPinnedPlace:
+        place = platform::CUDAPinnedPlace();
+        break;
+      default:
+        PADDLE_ENFORCE_NOT_NULL(nullptr,
+                                platform::errors::InvalidArgument(
+                                    "Unkown place type in SectionConfig: %d",
+                                    section_config.place()));
+    }
+    places_.emplace_back(place);
+    VLOG(3) << "Device worker place: " << place << ", device id: " << place_id
+            << ", section: " << i;

-      for (int k = 0; k < concurrency; ++k) {
-        workers_[i][j][k] = DeviceWorkerFactory::CreateDeviceWorker(
-            trainer_desc.device_worker_name());
-        auto this_worker =
-            std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                workers_[i][j][k]);
-        this_worker->SetSectionIndex(i);
-        this_worker->SetDeviceIndex(j);
-        this_worker->SetThreadIndex(k);
-        this_worker->SetSectionNum(section_num_);
-        this_worker->SetPipelineNum(pipeline_num_);
-        if (i == 0) {
-          this_worker->SetDataFeed(readers[reader_index++]);
-          this_worker->SetReaderPlace(place);
-        }
        if (i == section_num_ - 1) {
          this_worker->SetNeedDumpField(need_dump_field_);
          this_worker->SetNeedDumpParam(need_dump_param_);
          this_worker->SetDumpFieldVector(dump_fields_);
          this_worker->SetDumpParamVector(dump_param_);
        }
-        this_worker->SetPlace(place);
-        this_worker->Initialize(trainer_desc);
-        this_worker->InitRandomDumpConfig(trainer_desc);
-      }
+    workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
+        trainer_desc.device_worker_name());
+    auto this_worker =
+        std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
+            workers_[i]);
+    if (i == 0) {
+      // we only set reader for the first section
+      this_worker->SetDataFeed(reader);
+      this_worker->SetReaderPlace(place);
     }
-  }
-
-  param_need_sync_.reset(
-      new std::vector<std::string>(pipeline_config_.param_need_sync().begin(),
-                                   pipeline_config_.param_need_sync().end()));
-  VLOG(3) << "param_need_sync_ have: ";
-  for (const std::string& name : *param_need_sync_) {
-    VLOG(3) << name;
+    this_worker->SetThreadIndex(i);
+    this_worker->SetSectionIndex(i);
+    this_worker->SetPlace(place);
+    this_worker->Initialize(trainer_desc);
+    this_worker->SetMicrobatchNum(num_microbatches_);
   }
   // set debug here
   SetDebug(trainer_desc.debug());

@@ -140,13 +111,7 @@ std::string PipelineTrainer::GetDumpPath(int tid) {
 void PipelineTrainer::InitDumpEnv() {
   queue_ = paddle::framework::MakeChannel<std::string>();
-  // Only set dump channel on the last section
-  for (int j = 0; j < pipeline_num_; ++j) {
-    for (size_t k = 0; k < workers_[section_num_ - 1][j].size(); ++k) {
-      workers_[section_num_ - 1][j][k]->SetChannelWriter(queue_.get());
-    }
-  }
-  // TODO(hutuxian): should make it as a config
+  // TODO(sandyhouse): should make it as a config
   dump_thread_num_ = 1;
   for (int i = 0; i < dump_thread_num_; i++) {
     dump_thread_.push_back(

@@ -154,150 +119,105 @@ void PipelineTrainer::InitDumpEnv() {
   }
 }

-void PipelineTrainer::InitFirstScopeQueue(ScopeQueue* scope_queue,
-                                          int pipeline_id,
-                                          const ProgramDesc& main_program,
-                                          const Scope& root_scope) {
-  for (int i = 0; i < scope_queue_size_; ++i) {
-    Scope* scope = &pipeline_scopes_[pipeline_id]->NewScope();
-    for (auto& var : main_program.Block(0).AllVars()) {
-      if (!var->Persistable()) {
-        auto* ptr = scope->Var(var->Name());
-        InitializeVariable(ptr, var->GetType());
+void PipelineTrainer::CopyParameters(int section_id, int microbatch_id,
+                                     const ProgramDesc& program,
+                                     const platform::Place& place) {
+  auto& global_block = program.Block(0);
+  for (auto& var : global_block.AllVars()) {
+    int is_feed_var =
+        std::count(feed_var_names_.begin(), feed_var_names_.end(), var->Name());
+    if ((var->Persistable() || is_feed_var) && microbatch_id == 0) {
+      if (is_feed_var) {
+        auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
+        VLOG(3) << "data name: " << var->Name() << ", ptr: " << new_ptr;
+        InitializeVariable(new_ptr, var->GetType());
+      } else {
-        if (section_num_ == 1) {  // Means only one section and it must be
-                                  // CUDAPlace, so copy all persistable vars to
-                                  // pipeline scope
-          const LoDTensor& root_tensor =
-              root_scope.FindVar(var->Name())->Get<LoDTensor>();
-          LoDTensor* gpu_tensor = pipeline_scopes_[pipeline_id]
-                                      ->Var(var->Name())
-                                      ->GetMutable<LoDTensor>();
-          platform::Place place = platform::CUDAPlace(pipeline_id);
-          TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
-                     static_cast<Tensor*>(gpu_tensor));
-        }
+        auto* ptr = root_scope_->FindVar(var->Name());
+        auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
+        VLOG(3) << "Create persistable var " << var->Name() << " for minibatch "
+                << section_id << ", which pointer is " << new_ptr;
+        InitializeVariable(new_ptr, var->GetType());
+        const LoDTensor& root_tensor = ptr->Get<LoDTensor>();
+        LoDTensor* minibatch_tensor = new_ptr->GetMutable<LoDTensor>();
+        TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
+                   static_cast<Tensor*>(minibatch_tensor));
       }
+    } else if (!var->Persistable() && !is_feed_var) {
+      auto* ptr =
+          microbatch_scopes_[section_id][microbatch_id]->Var(var->Name());
+      VLOG(3) << "Create variable " << var->Name() << " for section "
+              << section_id << " microbatch " << microbatch_id
+              << ", which pointer is " << ptr;
+      InitializeVariable(ptr, var->GetType());
     }
-    scope_queue->Send(scope);
   }
 }

-void PipelineTrainer::CopyParameters(const Scope& root_scope, int pipeline_id) {
-  for (const std::string& name : *param_need_sync_) {
-    const LoDTensor& root_tensor = root_scope.FindVar(name)->Get<LoDTensor>();
-    // TODO(hutxian): check a new var of the same name is created in
-    // pipeline_scope
-    LoDTensor* gpu_tensor =
-        pipeline_scopes_[pipeline_id]->Var(name)->GetMutable<LoDTensor>();
-    platform::Place place = platform::CUDAPlace(pipeline_id);
-    TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
-               static_cast<Tensor*>(gpu_tensor));
+void PipelineTrainer::GetSkipVars(int section_id, const ProgramDesc& program) {
+  auto& global_block = program.Block(0);
+  for (auto& op : global_block.AllOps()) {
+    if (op->Type() != "enqueue") {
+      continue;
+    }
+    auto input_arg_names = op->InputArgumentNames();
+    PADDLE_ENFORCE_EQ(input_arg_names.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "Number of input arguments for enqueue op must be 1, "
+                          "but the value is %d.",
+                          input_arg_names.size()));
+    std::string input_arg_name = input_arg_names[0];
+    if (input_arg_name.rfind("@GRAD") != input_arg_name.size() - 5) {
+      skip_vars_[section_id].emplace_back(input_arg_name);
+      VLOG(3) << "add skip var name: " << input_arg_name;
+    }
   }
 }

 void PipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
                                      const platform::Place& place) {
-  PADDLE_ENFORCE(root_scope_, "Null root_scope pointer");
-  SectionWorker::cpu_id_.store(pipeline_config_.start_cpu_core_id());
-  scope_queues_.resize(section_num_);
-  pipeline_scopes_.resize(pipeline_num_);
-
-  for (auto& var : main_program.Block(0).AllVars()) {
-    if (var->Persistable()) {
-      persistable_vars_.push_back(var->Name());
-    }
-  }
+  PADDLE_ENFORCE_NOT_NULL(root_scope_,
+                          platform::errors::InvalidArgument(
+                              "root_scope pointer can not be nullptr"));
+  auto start_cpu_id = trainer_desc_.section_param().start_cpu_core_id();
+  SectionWorker::cpu_id_.store(start_cpu_id);
+  minibatch_scopes_.resize(section_num_);
+  microbatch_scopes_.resize(section_num_);
+  skip_vars_.resize(section_num_);

   VLOG(3) << "Init ScopeQueues and create all scopes";
   for (int i = 0; i < section_num_; ++i) {
-    for (int j = 0; j < pipeline_num_; ++j) {
-      scope_queues_[i].emplace_back(new ScopeQueue(scope_queue_size_));
-      if (i == 0) {
-        pipeline_scopes_[j] = &root_scope_->NewScope();
-        CopyParameters(*root_scope_, j);
-        InitFirstScopeQueue(scope_queues_[0].back().get(), j, main_program,
-                            *root_scope_);
-      }
+    minibatch_scopes_[i] = &root_scope_->NewScope();
+    std::shared_ptr<framework::ProgramDesc> program;
+    program.reset(new ProgramDesc(
+        trainer_desc_.section_param().section_config(i).program_desc()));
+    microbatch_scopes_[i].resize(num_microbatches_);
+    for (int j = 0; j < num_microbatches_; ++j) {
+      microbatch_scopes_[i][j] = &minibatch_scopes_[i]->NewScope();
+      CopyParameters(i, j, *program, places_[i]);
     }
+    GetSkipVars(i, *program);
   }

   for (int i = 0; i < section_num_; ++i) {
-    for (int j = 0; j < pipeline_num_; ++j) {
-      for (size_t k = 0; k < workers_[i][j].size(); ++k) {
-        auto this_worker =
-            std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                workers_[i][j][k]);
-        this_worker->SetRootScope(root_scope_);
-        this_worker->SetCountMutex(worker_count_mutex_[i][j].get());
-        this_worker->SetWorkerCount(worker_count_[i][j]);
-        this_worker->SetScopeQueue(scope_queues_[i][j].get(),
-                                   (i == section_num_ - 1)
-                                       ? scope_queues_[0][j].get()
-                                       : scope_queues_[i + 1][j].get());
-        this_worker->SetVarNames(*in_var_names_[i], *out_var_names_[i]);
-        if (i != section_num_ - 1) {
-          // For data copy in adjacent different place
-          this_worker->SetNextSectionPlace(
-              std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                  workers_[i + 1][j][0])
-                  ->place());
-        }
-      }
-    }
-  }
-
-  if (pipeline_num_ > 1 && sync_steps_ != -1) {
-    construct_sync_functor();
-  }
-}
-
-void PipelineTrainer::construct_sync_functor() {
-  std::vector<platform::Place> cuda_places;
-  for (int i = 0; i < pipeline_num_; ++i) {
-    cuda_places.emplace_back(platform::CUDAPlace(i));
-  }
-  nccl_ctx_map_.reset(new platform::NCCLContextMap(cuda_places));
-  sync_functors_.resize(pipeline_num_);
-  SyncFunctor::sync_flag_ = 0;
-  SyncFunctor::pipeline_scopes_.resize(0);
-
-  for (int j = 0; j < pipeline_num_; ++j) {
-    SyncFunctor* sync_function = new SyncFunctor(j, pipeline_num_, sync_steps_);
-    sync_function->SetSyncParam(*param_need_sync_);
-    sync_function->SetNcclCtxMap(nccl_ctx_map_.get());
-    SyncFunctor::pipeline_scopes_.push_back(this->pipeline_scopes_[j]);
-    sync_functors_[j].reset(sync_function);
-  }
-  for (int i = section_num_ - 1; i >= 0; --i) {
-    if (SectionConfig::CUDAPlace ==
-        pipeline_config_.section_config(i).place()) {
-      for (int j = 0; j < pipeline_num_; ++j) {
-        for (size_t k = 0; k < workers_[i][j].size(); ++k) {
-          auto this_worker =
-              std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                  workers_[i][j][k]);
-          this_worker->SetSyncFunctor(sync_functors_[j].get());
-        }
-      }
-      break;
-    }
+    auto this_worker =
+        std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
+            workers_[i]);
+    this_worker->SetRootScope(root_scope_);
+    this_worker->SetMinibatchScope(minibatch_scopes_[i]);
+    this_worker->SetMicrobatchScopes(microbatch_scopes_[i]);
+    this_worker->SetSkipVars(skip_vars_[i]);
   }
 }

 void PipelineTrainer::Run() {
   VLOG(3) << "Going to run";
   for (int i = 0; i < section_num_; ++i) {
-    for (int j = 0; j < pipeline_num_; ++j) {
-      for (size_t k = 0; k < workers_[i][j].size(); ++k) {
-        if (!debug_) {
-          section_threads_.push_back(
-              std::thread(&DeviceWorker::TrainFiles, workers_[i][j][k].get()));
-        } else {
-          section_threads_.push_back(std::thread(
-              &DeviceWorker::TrainFilesWithProfiler, workers_[i][j][k].get()));
-        }
-      }
-    }
+    if (!debug_) {
+      section_threads_.push_back(
+          std::thread(&DeviceWorker::TrainFiles, workers_[i].get()));
+    } else {
+      section_threads_.push_back(std::thread(
+          &DeviceWorker::TrainFilesWithProfiler, workers_[i].get()));
+    }
   }
 }

@@ -309,18 +229,31 @@ void PipelineTrainer::Finalize() {
   if (need_dump_field_) {
     FinalizeDumpEnv();
   }
-  for (const auto& var : persistable_vars_) {
-    auto* root_tensor = root_scope_->Var(var)->GetMutable<LoDTensor>();
-    // TODO(hutuxian): Add a final all-reduce?
-    const auto& thread_tensor =
-        pipeline_scopes_[0]->FindVar(var)->Get<LoDTensor>();
-    TensorCopySync(thread_tensor, platform::CPUPlace(), root_tensor);
-  }
+  VLOG(3) << "copying back parameters. ";
+  for (int i = 0; i < section_num_; ++i) {
+    std::shared_ptr<framework::ProgramDesc> program;
+    program.reset(new ProgramDesc(
+        trainer_desc_.section_param().section_config(i).program_desc()));
+    for (int j = 0; j < num_microbatches_; ++j) {
+      auto& global_block = program->Block(0);
+      for (auto& var : global_block.AllVars()) {
+        if (var->Persistable()) {
+          auto* ptr = root_scope_->FindVar(var->Name());
+          LoDTensor* root_tensor = ptr->GetMutable<LoDTensor>();
+          auto* minibatch_ptr = minibatch_scopes_[i]->Var(var->Name());
+          const LoDTensor& minibatch_tensor = minibatch_ptr->Get<LoDTensor>();
+          TensorCopy(*static_cast<const Tensor*>(&minibatch_tensor), places_[0],
+                     static_cast<Tensor*>(root_tensor));
+          VLOG(4) << "Copy persitable var " << var->Name() << " to root scope";
+        }
+      }
+    }
+  }
   root_scope_->DropKids();
 }

 Scope* PipelineTrainer::GetWorkerScope(int thread_id) {
-  return pipeline_scopes_[thread_id];
+  return microbatch_scopes_[thread_id][0];
 }

 }  // end namespace framework
paddle/fluid/framework/section_worker.cc

(This diff is collapsed in this capture; +460 -340.)
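Since the section_worker.cc rewrite is collapsed above, its code is not reproduced here. Conceptually, after this commit a SectionWorker runs the ops of one program section once per micro-batch against the per-micro-batch scopes created in pipeline_trainer.cc. The sketch below is a generic illustration of that scheduling idea only, not the contents of this file:

    # Generic illustration; NOT the code in section_worker.cc (diff collapsed above).
    def run_section(ops, microbatch_scopes, place, skip_vars=()):
        for scope in microbatch_scopes:   # one scope per micro-batch
            for op in ops:                # the ops of this program section
                op.run(scope, place)
            # variables listed in skip_vars (e.g. the non-gradient enqueue inputs
            # collected by PipelineTrainer::GetSkipVars) would be kept alive
            # across micro-batches instead of being garbage-collected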
paddle/fluid/framework/trainer.h

@@ -137,49 +137,31 @@ class PipelineTrainer : public TrainerBase {
   virtual Scope* GetWorkerScope(int thread_id);
   void InitDumpEnv() override;
   virtual std::string GetDumpPath(int tid);
+  void GetSkipVars(int section_id, const ProgramDesc& main_program);

  protected:
   int section_num_;
-  int pipeline_num_;
-  int scope_queue_size_;
-  int sync_steps_;
+  int num_microbatches_;
+  int start_cpu_core_id_;
+  std::vector<std::string> feed_var_names_;
+  std::vector<platform::Place> places_;
+  std::vector<std::vector<std::string>> skip_vars_;
+  TrainerDesc trainer_desc_;

-  SectionWorkerParameter pipeline_config_;
-
-  // The in/output var names for each section
-  std::vector<std::unique_ptr<std::vector<std::string>>> in_var_names_;
-  std::vector<std::unique_ptr<std::vector<std::string>>> out_var_names_;
-
-  // Counter for the running thread
-  std::vector<std::vector<int*>> worker_count_;
-  std::vector<std::vector<std::unique_ptr<std::mutex>>> worker_count_mutex_;
-
-  // worker: [section_id][pipeline_id][thread_id]
-  std::vector<std::vector<
-      std::vector<std::shared_ptr<paddle::framework::DeviceWorker>>>>
-      workers_;
   std::vector<std::thread> section_threads_;
-  // We use scope to maintain context info, and scopes
-  // will be deliverd between different sections.
-  std::vector<std::vector<std::unique_ptr<ScopeQueue>>> scope_queues_;
-  std::vector<Scope*> pipeline_scopes_;
-  // The parameters that should be syncronized between different cards using
-  // nccl all-reduce
-  std::shared_ptr<std::vector<std::string>> param_need_sync_;
-  std::vector<std::string> persistable_vars_;
-  std::vector<std::unique_ptr<SyncFunctor>> sync_functors_;
-  std::shared_ptr<platform::NCCLContextMap> nccl_ctx_map_;
-  std::vector<DataFeed*> readers_;
-
-  void InitFirstScopeQueue(ScopeQueue* scope_queue, int pipeline_id,
-                           const ProgramDesc& main_program,
-                           const Scope& root_scope);
-  void CopyParameters(const Scope& root_scope, int pipeline_id);
-  void construct_sync_functor();
+  // worker: [section_id]
+  std::vector<std::shared_ptr<paddle::framework::DeviceWorker>> workers_;
+  // minibatch_scopes_: [section_id]
+  std::vector<Scope*> minibatch_scopes_;
+  // microbatch_scopes_: [section_id][microbatch_id]
+  std::vector<std::vector<Scope*>> microbatch_scopes_;
+
+  void CopyParameters(int section_id, int microbatch_id,
+                      const ProgramDesc& program, const platform::Place& place);
+  bool isPersistableVarGrad(std::string name);
+  bool isPersistable(VarDesc* var);
 };
 #endif

 }  // namespace framework
 }  // namespace paddle
paddle/fluid/framework/trainer_desc.proto

@@ -83,6 +83,7 @@ message SectionWorkerParameter {
   optional int64 sync_steps = 3 [ default = 1 ];
   optional int32 start_cpu_core_id = 4 [ default = 1 ];
   repeated string param_need_sync = 5;
+  optional int32 num_microbatches = 6;
 }

 message SectionConfig {

@@ -99,6 +100,7 @@ message SectionConfig {
   optional int32 concurrency = 3 [ default = 1 ];
   repeated string section_in_var_names = 4;
   repeated string section_out_var_names = 5;
+  optional int32 place_id = 6 [ default = -1 ];
 }

 message FetchConfig {
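Both additions are plain proto2 optional fields, so they can be set directly on the generated Python message. The snippet below is an illustration only; it assumes the generated module is importable as paddle.fluid.proto.trainer_desc_pb2, which is where fluid ships its generated protos.

    # Illustration only; assumes the generated module path below, not part of this commit.
    from paddle.fluid.proto import trainer_desc_pb2

    desc = trainer_desc_pb2.TrainerDesc()
    desc.section_param.num_microbatches = 2   # new SectionWorkerParameter field (tag 6)
    cfg = desc.section_param.section_config.add()
    cfg.place_id = 0                          # new SectionConfig field (tag 6, default -1)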
python/paddle/fluid/device_worker.py

@@ -403,11 +403,8 @@ class Section(DeviceWorker):
         trainer_desc.device_worker_name = "SectionWorker"
         pipeline_opt = self._program._pipeline_opt
         section_param = trainer_desc.section_param
-        section_param.queue_size = pipeline_opt["queue_size"]
-        section_param.sync_steps = pipeline_opt["sync_steps"]
+        section_param.num_microbatches = pipeline_opt["num_microbatches"]
         section_param.start_cpu_core_id = pipeline_opt["start_cpu_core_id"]
-        for e in pipeline_opt["param_need_sync"]:
-            section_param.param_need_sync.append(e)
         for i, program in enumerate(pipeline_opt["section_program_list"]):
             cfg = section_param.section_config.add()
             cfg.program_desc.ParseFromString(program["program"]._get_desc()

@@ -415,6 +412,7 @@ class Section(DeviceWorker):
             # TODO: why does not work
             # cfg.program_desc.CopyFrom(program.program._get_desc())
             place = pipeline_opt["place_list"][i]
+            place_id = pipeline_opt["place_id_list"][i]
             if isinstance(place, core.CPUPlace):
                 cfg.place = cfg.CPUPlace
             elif isinstance(place, core.CUDAPlace):

@@ -425,12 +423,7 @@ class Section(DeviceWorker):
                 raise NotImplementedError(
                     "SectionWorker only supports CPUPlace, CUDAPlace and CUDAPinnedPlace now."
                 )
-            cfg.concurrency = pipeline_opt["concurrency_list"][i]
-            for var in program["input_set"]:
-                cfg.section_in_var_names.append(var)
-            for var in program["output_set"]:
-                cfg.section_out_var_names.append(var)
+            cfg.place_id = place_id


 class DeviceWorkerFactory(object):
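For reference, the shape of the _pipeline_opt dict that Section._gen_worker_desc reads after this change. The keys come from this hunk and from the PipelineOptimizer hunk below; the concrete values are illustrative, and section_programs / main_program are placeholders.

    # Illustrative only; keys mirror what this commit reads and writes.
    pipeline_opt = {
        "section_program_list": section_programs,  # one {"program": Program} entry per section
        "place_list": [fluid.CPUPlace(), fluid.CUDAPlace(0)],
        "place_id_list": [-1, 0],
        "sync_steps": -1,                          # still written, no longer read here
        "num_microbatches": 2,                     # replaces the old "queue_size" key
        "start_cpu_core_id": 0,
    }
    main_program._pipeline_opt = pipeline_opt      # attached to the Program by the optimizer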
python/paddle/fluid/optimizer.py

@@ -4474,7 +4474,7 @@ class PipelineOptimizer(object):
             "place_list": place_list,
             "place_id_list": place_id_list,
             "sync_steps": -1,
-            "queue_size": self._num_microbatches,
+            "num_microbatches": self._num_microbatches,
             "start_cpu_core_id": self._start_cpu_core_id,
         }
         return optimize_ops, params_grads, program_list
python/paddle/fluid/tests/unittests/test_pipeline.py

@@ -100,7 +100,7 @@ def build_network(input, layers=50, class_dim=1000):
         pool_type='max')
     if layers >= 50:
         for block in range(len(depth)):
-            with fluid.device_guard("cpu"):
+            with fluid.device_guard("gpu:0"):
                 for i in range(depth[block]):
                     conv = bottleneck_block(
                         input=conv,

@@ -118,7 +118,7 @@ def build_network(input, layers=50, class_dim=1000):
                     initializer=fluid.initializer.Uniform(-stdv, stdv)))
     else:
         for block in range(len(depth)):
-            with fluid.device_guard("cpu"):
+            with fluid.device_guard("gpu:0"):
                 for i in range(depth[block]):
                     conv = basic_block(
                         input=conv,

@@ -140,38 +140,68 @@
 class TestPipeline(unittest.TestCase):
     """  TestCases for Pipeline Training. """

+    def _run(self, debug):
+        main_prog = fluid.Program()
+        startup_prog = fluid.Program()
+        with fluid.program_guard(main_prog, startup_prog):
+            with fluid.device_guard("cpu"):
+                image = fluid.layers.data(
+                    name="image", shape=[3, 224, 224], dtype="float32")
+                label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+                data_loader = fluid.io.DataLoader.from_generator(
+                    feed_list=[image, label],
+                    capacity=64,
+                    use_double_buffer=True,
+                    iterable=False)
+                fc = build_network(image, layers=50)
+            with fluid.device_guard("gpu:0"):
+                out, prob = fluid.layers.softmax_with_cross_entropy(
+                    logits=fc, label=label, return_softmax=True)
+                loss = fluid.layers.mean(out)
+                acc_top1 = fluid.layers.accuracy(input=prob, label=label, k=1)
+                acc_top5 = fluid.layers.accuracy(input=prob, label=label, k=5)
+
+            base_lr = 0.1
+            passes = [30, 60, 80, 90]
+            total_images = 1281167
+            steps_per_pass = total_images // 128
+            bd = [steps_per_pass * p for p in passes]
+            lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+            lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
+            optimizer = fluid.optimizer.MomentumOptimizer(
+                lr_val,
+                momentum=0.9,
+                regularization=fluid.regularizer.L2Decay(1e-4))
+            optimizer = fluid.optimizer.PipelineOptimizer(
+                optimizer, num_microbatches=2)
+            optimizer.minimize(loss)
+
+            def train_reader():
+                for _ in range(4):
+                    img = np.random.random(size=[3, 224, 224]).astype('float32')
+                    label = np.random.random(size=[1]).astype('int64')
+                    yield img, label
+
+            data_loader.set_sample_generator(train_reader, batch_size=1)
+            place = fluid.CPUPlace()
+            # The following dataset is only used for the
+            # interface 'train_from_dataset'.
+            # And it has no actual meaning.
+            dataset = fluid.DatasetFactory().create_dataset('FileInstantDataset')
+            dataset.set_batch_size(1)
+            dataset.set_thread(1)
+            dataset.set_filelist(['/tmp/tmp_2.txt'])
+            dataset.set_use_var([image, label])
+
+            exe = fluid.Executor(place)
+            exe.run(startup_prog)
+            data_loader.start()
+            exe.train_from_dataset(main_prog, dataset, debug=debug)
+
     def test_pipeline(self):
-        with fluid.device_guard("cpu"):
-            image = fluid.layers.data(
-                name="image", shape=[3, 224, 224], dtype="float32")
-            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-            data_loader = fluid.io.DataLoader.from_generator(
-                feed_list=[image, label],
-                capacity=64,
-                use_double_buffer=True,
-                iterable=False)
-            fc = build_network(image, layers=50)
-        with fluid.device_guard("gpu:0"):
-            out, prob = fluid.layers.softmax_with_cross_entropy(
-                logits=fc, label=label, return_softmax=True)
-            loss = fluid.layers.mean(out)
-            acc_top1 = fluid.layers.accuracy(input=prob, label=label, k=1)
-            acc_top5 = fluid.layers.accuracy(input=prob, label=label, k=5)
-        base_lr = 0.1
-        passes = [30, 60, 80, 90]
-        total_images = 1281167
-        steps_per_pass = total_images // 128
-        bd = [steps_per_pass * p for p in passes]
-        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
-        lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
-        optimizer = fluid.optimizer.Momentum(
-            lr_val,
-            momentum=0.9,
-            regularization=fluid.regularizer.L2Decay(1e-4))
-        optimizer = fluid.optimizer.PipelineOptimizer(
-            optimizer, num_microbatches=2)
-        optimizer.minimize(loss)
+        self._run(False)
+        self._run(True)

     def test_pipeline_noneoptimizer(self):
         with fluid.device_guard("gpu:0"):