Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
e39aa70e
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
e39aa70e
编写于
7月 07, 2020
作者:
L
lilong12
提交者:
GitHub
7月 07, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add the support for pipeline (#24560)
* add device_worker for pipeline, test=develop
上级
0dcb8754
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
720 addition
and
706 deletion
+720
-706
paddle/fluid/framework/device_worker.h
paddle/fluid/framework/device_worker.h
+19
-65
paddle/fluid/framework/pipeline_trainer.cc
paddle/fluid/framework/pipeline_trainer.cc
+153
-220
paddle/fluid/framework/section_worker.cc
paddle/fluid/framework/section_worker.cc
+460
-340
paddle/fluid/framework/trainer.h
paddle/fluid/framework/trainer.h
+19
-37
paddle/fluid/framework/trainer_desc.proto
paddle/fluid/framework/trainer_desc.proto
+2
-0
python/paddle/fluid/device_worker.py
python/paddle/fluid/device_worker.py
+3
-10
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+1
-1
python/paddle/fluid/tests/unittests/test_pipeline.py
python/paddle/fluid/tests/unittests/test_pipeline.py
+63
-33
未找到文件。
paddle/fluid/framework/device_worker.h
浏览文件 @
e39aa70e
...
@@ -51,10 +51,6 @@ bool CheckValidOutput(LoDTensor* tensor, size_t batch_size);
...
@@ -51,10 +51,6 @@ bool CheckValidOutput(LoDTensor* tensor, size_t batch_size);
class
FleetWrapper
;
class
FleetWrapper
;
#define SEC_LOG \
VLOG(3) << "[s" << section_id_ << "p" << pipeline_id_ << "t" << thread_id_ \
<< "]: "
class
PullDenseWorker
{
class
PullDenseWorker
{
public:
public:
virtual
~
PullDenseWorker
()
{}
virtual
~
PullDenseWorker
()
{}
...
@@ -311,40 +307,9 @@ class DownpourWorkerOpt : public DownpourWorker {
...
@@ -311,40 +307,9 @@ class DownpourWorkerOpt : public DownpourWorker {
};
};
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
using
ScopeQueue
=
operators
::
reader
::
BlockingQueue
<
Scope
*>
;
class
SyncFunctor
{
public:
SyncFunctor
(
int
rank_id
,
int
rank_num
,
int
sync_steps
);
virtual
~
SyncFunctor
()
{}
void
SetSyncParam
(
const
std
::
vector
<
std
::
string
>&
sync_param
)
{
sync_param_
=
&
sync_param
;
}
void
SetNcclCtxMap
(
platform
::
NCCLContextMap
*
nccl_ctx_map
)
{
nccl_ctx_map_
=
nccl_ctx_map
;
}
int
operator
()(
Scope
*
scope
);
static
std
::
vector
<
Scope
*>
pipeline_scopes_
;
static
uint64_t
sync_flag_
;
protected:
const
int
rank_id_
;
const
int
rank_num_
;
const
std
::
vector
<
std
::
string
>*
sync_param_
=
nullptr
;
platform
::
NCCLContextMap
*
nccl_ctx_map_
=
nullptr
;
uint64_t
sync_signal_
;
const
int
sync_steps_
;
int
counter_
;
void
Synchronize
();
};
class
SectionWorker
:
public
DeviceWorker
{
class
SectionWorker
:
public
DeviceWorker
{
public:
public:
SectionWorker
()
{}
SectionWorker
()
{
local_batch_id_
=
0
;
}
~
SectionWorker
()
override
{}
~
SectionWorker
()
override
{}
void
Initialize
(
const
TrainerDesc
&
desc
)
override
;
void
Initialize
(
const
TrainerDesc
&
desc
)
override
;
...
@@ -360,50 +325,39 @@ class SectionWorker : public DeviceWorker {
...
@@ -360,50 +325,39 @@ class SectionWorker : public DeviceWorker {
const
platform
::
Place
&
place
()
const
{
return
place_
;
}
const
platform
::
Place
&
place
()
const
{
return
place_
;
}
void
SetSectionIndex
(
int
section_id
)
{
section_id_
=
section_id
;
}
void
SetSectionIndex
(
int
section_id
)
{
section_id_
=
section_id
;
}
void
SetDeviceIndex
(
int
tid
)
override
{
pipeline_id_
=
tid
;
}
void
SetDeviceIndex
(
int
tid
)
override
{}
void
SetThreadIndex
(
int
thread_id
)
{
thread_id_
=
thread_id
;
}
void
SetThreadIndex
(
int
thread_id
)
{
thread_id_
=
thread_id
;
}
void
SetVarNames
(
const
std
::
vector
<
std
::
string
>&
in_var_names
,
void
SetMicrobatchNum
(
int
num
)
{
num_microbatches_
=
num
;
}
const
std
::
vector
<
std
::
string
>&
out_var_names
)
{
void
SetMicrobatchScopes
(
const
std
::
vector
<
Scope
*>&
scope
)
{
in_var_names_
=
&
in_var_names
;
microbatch_scopes_
=
scope
;
out_var_names_
=
&
out_var_names
;
}
void
SetScopeQueue
(
ScopeQueue
*
in_scope_queue
,
ScopeQueue
*
out_scope_queue
)
{
in_scope_queue_
=
in_scope_queue
;
out_scope_queue_
=
out_scope_queue
;
}
}
void
SetCountMutex
(
std
::
mutex
*
mutex
)
{
worker_count_mutex_
=
mutex
;
}
void
SetMinibatchScope
(
const
Scope
*
scope
)
{
minibatch_scope_
=
scope
;
}
void
SetWorkerCount
(
int
*
worker_count
)
{
worker_count_
=
worker_count
;
}
void
SetSkipVars
(
const
std
::
vector
<
std
::
string
>&
skip_vars
)
{
void
SetSectionNum
(
int
section_num
)
{
section_num_
=
section_num
;
}
skip_vars_
=
skip_vars
;
void
SetPipelineNum
(
int
pipeline_num
)
{
pipeline_num_
=
pipeline_num
;
}
void
SetNextSectionPlace
(
const
paddle
::
platform
::
Place
&
place
)
{
next_section_place_
=
place
;
}
}
SyncFunctor
*
sync_func_
=
nullptr
;
void
SetSyncFunctor
(
SyncFunctor
*
sync_func
)
{
sync_func_
=
sync_func
;
}
static
std
::
atomic
<
int
>
cpu_id_
;
static
std
::
atomic
<
int
>
cpu_id_
;
protected:
protected:
void
AutoSetCPUAffinity
(
bool
reuse
);
void
AutoSetCPUAffinity
(
bool
reuse
);
int
section_id_
;
int
section_id_
;
int
pipeline_id_
;
int
section_num_
;
int
pipeline_num_
;
int
thread_id_
;
int
thread_id_
;
// This worker will consume scope from in_scope_queue_
int
num_microbatches_
;
// and produce scope to out_scope_queue_
std
::
vector
<
Scope
*>
microbatch_scopes_
;
ScopeQueue
*
in_scope_queue_
=
nullptr
;
std
::
vector
<
std
::
string
>
skip_vars_
;
ScopeQueue
*
out_scope_queue_
=
nullptr
;
const
Scope
*
minibatch_scope_
;
const
std
::
vector
<
std
::
string
>*
in_var_names_
=
nullptr
;
const
std
::
vector
<
std
::
string
>*
out_var_names_
=
nullptr
;
std
::
mutex
*
worker_count_mutex_
=
nullptr
;
int
*
worker_count_
=
nullptr
;
paddle
::
platform
::
Place
next_section_place_
;
std
::
vector
<
std
::
unique_ptr
<
OperatorBase
>>
ops_
;
std
::
vector
<
std
::
unique_ptr
<
OperatorBase
>>
ops_
;
static
std
::
mutex
thread_mutex
;
static
std
::
condition_variable
thread_condition
;
static
bool
threads_completed
;
std
::
shared_ptr
<
framework
::
ProgramDesc
>
program_
;
static
uint64_t
batch_id_
;
uint64_t
local_batch_id_
;
platform
::
DeviceContext
*
dev_ctx_
=
nullptr
;
platform
::
DeviceContext
*
dev_ctx_
=
nullptr
;
};
};
#endif
#endif
}
// namespace framework
}
// namespace framework
}
// namespace paddle
}
// namespace paddle
paddle/fluid/framework/pipeline_trainer.cc
浏览文件 @
e39aa70e
...
@@ -23,8 +23,13 @@ namespace framework {
...
@@ -23,8 +23,13 @@ namespace framework {
void
PipelineTrainer
::
Initialize
(
const
TrainerDesc
&
trainer_desc
,
void
PipelineTrainer
::
Initialize
(
const
TrainerDesc
&
trainer_desc
,
Dataset
*
dataset
)
{
Dataset
*
dataset
)
{
pipeline_num_
=
trainer_desc
.
thread_num
();
const
auto
&
section_params
=
trainer_desc
.
section_param
();
VLOG
(
3
)
<<
"pipeline num: "
<<
pipeline_num_
;
num_microbatches_
=
section_params
.
num_microbatches
();
VLOG
(
3
)
<<
"Number of microbatches per minibatch: "
<<
num_microbatches_
;
section_num_
=
section_params
.
section_config_size
();
VLOG
(
3
)
<<
"Number of program sections: "
<<
section_num_
;
trainer_desc_
=
trainer_desc
;
start_cpu_core_id_
=
section_params
.
start_cpu_core_id
();
SetDataset
(
dataset
);
SetDataset
(
dataset
);
ParseDumpConfig
(
trainer_desc
);
ParseDumpConfig
(
trainer_desc
);
...
@@ -32,96 +37,62 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
...
@@ -32,96 +37,62 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
const
std
::
vector
<
paddle
::
framework
::
DataFeed
*>
readers
=
const
std
::
vector
<
paddle
::
framework
::
DataFeed
*>
readers
=
dataset
->
GetReaders
();
dataset
->
GetReaders
();
VLOG
(
3
)
<<
"readers num: "
<<
readers
.
size
();
VLOG
(
3
)
<<
"readers num: "
<<
readers
.
size
();
int
num_readers
=
readers
.
size
();
pipeline_config_
=
trainer_desc
.
section_param
();
PADDLE_ENFORCE_EQ
(
num_readers
,
1
,
scope_queue_size_
=
pipeline_config_
.
queue_size
();
platform
::
errors
::
InvalidArgument
(
sync_steps_
=
pipeline_config_
.
sync_steps
();
"Number of dataset readers for pipeline "
section_num_
=
pipeline_config_
.
section_config_size
();
"must be 1 now, but the value you give is %d."
,
num_readers
));
VLOG
(
3
)
<<
"scope_queue_size: "
<<
scope_queue_size_
;
auto
*
reader
=
readers
[
0
];
VLOG
(
3
)
<<
"section num: "
<<
section_num_
;
feed_var_names_
=
reader
->
GetUseSlotAlias
();
VLOG
(
3
)
<<
"sync_steps: "
<<
sync_steps_
;
workers_
.
resize
(
section_num_
);
workers_
.
resize
(
section_num_
);
in_var_names_
.
resize
(
section_num_
);
out_var_names_
.
resize
(
section_num_
);
worker_count_
.
resize
(
section_num_
);
worker_count_mutex_
.
resize
(
section_num_
);
param_need_sync_
.
reset
(
new
std
::
vector
<
std
::
string
>
);
int
reader_index
=
0
;
for
(
int
i
=
0
;
i
<
section_num_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
section_num_
;
++
i
)
{
const
auto
&
section_config
=
pipeline_config_
.
section_config
(
i
);
const
auto
&
section_config
=
section_params
.
section_config
(
i
);
int
concurrency
=
section_config
.
concurrency
();
VLOG
(
3
)
<<
"the thread num of each pipeline in section "
<<
i
<<
" is: "
<<
concurrency
;
in_var_names_
[
i
].
reset
(
new
std
::
vector
<
std
::
string
>
(
section_config
.
section_in_var_names
().
begin
(),
section_config
.
section_in_var_names
().
end
()));
out_var_names_
[
i
].
reset
(
new
std
::
vector
<
std
::
string
>
(
section_config
.
section_out_var_names
().
begin
(),
section_config
.
section_out_var_names
().
end
()));
worker_count_
[
i
].
resize
(
pipeline_num_
);
worker_count_mutex_
[
i
].
resize
(
pipeline_num_
);
for
(
int
j
=
0
;
j
<
pipeline_num_
;
++
j
)
{
worker_count_
[
i
][
j
]
=
new
int
(
concurrency
);
worker_count_mutex_
[
i
][
j
].
reset
(
new
std
::
mutex
);
}
platform
::
Place
place
;
platform
::
Place
place
;
workers_
[
i
].
resize
(
pipeline_num_
);
int
place_id
=
section_config
.
place_id
();
for
(
int
j
=
0
;
j
<
pipeline_num_
;
++
j
)
{
switch
(
section_config
.
place
())
{
workers_
[
i
][
j
].
resize
(
concurrency
);
case
SectionConfig
::
CPUPlace
:
place
=
platform
::
CPUPlace
();
switch
(
section_config
.
place
())
{
break
;
case
SectionConfig
::
CPUPlace
:
case
SectionConfig
::
CUDAPlace
:
place
=
platform
::
CPUPlace
();
// Note that one section has at most one GPU place in one pipeline
break
;
PADDLE_ENFORCE_GE
(
case
SectionConfig
::
CUDAPlace
:
place_id
,
0
,
// Note that one section has at most one GPU place in one pipeline
platform
::
errors
::
InvalidArgument
(
place
=
platform
::
CUDAPlace
(
j
);
"The place_id value for CUDAPlace shoud be greater "
break
;
"than or equal to 0, but the value you give is %d."
,
case
SectionConfig
::
CUDAPinnedPlace
:
place_id
));
place
=
platform
::
CUDAPinnedPlace
();
place
=
platform
::
CUDAPlace
(
place_id
);
break
;
break
;
default:
case
SectionConfig
::
CUDAPinnedPlace
:
PADDLE_ENFORCE
(
false
,
"Unkown place type in SectionConfig: %d"
,
place
=
platform
::
CUDAPinnedPlace
();
section_config
.
place
());
break
;
}
default:
PADDLE_ENFORCE_NOT_NULL
(
nullptr
,
platform
::
errors
::
InvalidArgument
(
"Unkown place type in SectionConfig: %d"
,
section_config
.
place
()));
}
places_
.
emplace_back
(
place
);
VLOG
(
3
)
<<
"Device worker place: "
<<
place
<<
", device id: "
<<
place_id
<<
", section: "
<<
i
;
for
(
int
k
=
0
;
k
<
concurrency
;
++
k
)
{
workers_
[
i
]
=
DeviceWorkerFactory
::
CreateDeviceWorker
(
workers_
[
i
][
j
][
k
]
=
DeviceWorkerFactory
::
CreateDeviceWorker
(
trainer_desc
.
device_worker_name
());
trainer_desc
.
device_worker_name
());
auto
this_worker
=
auto
this_worker
=
std
::
dynamic_pointer_cast
<
paddle
::
framework
::
SectionWorker
>
(
std
::
dynamic_pointer_cast
<
paddle
::
framework
::
SectionWorker
>
(
workers_
[
i
]);
workers_
[
i
][
j
][
k
]);
if
(
i
==
0
)
{
this_worker
->
SetSectionIndex
(
i
);
// we only set reader for the first section
this_worker
->
SetDeviceIndex
(
j
);
this_worker
->
SetDataFeed
(
reader
);
this_worker
->
SetThreadIndex
(
k
);
this_worker
->
SetReaderPlace
(
place
);
this_worker
->
SetSectionNum
(
section_num_
);
this_worker
->
SetPipelineNum
(
pipeline_num_
);
if
(
i
==
0
)
{
this_worker
->
SetDataFeed
(
readers
[
reader_index
++
]);
this_worker
->
SetReaderPlace
(
place
);
}
if
(
i
==
section_num_
-
1
)
{
this_worker
->
SetNeedDumpField
(
need_dump_field_
);
this_worker
->
SetNeedDumpParam
(
need_dump_param_
);
this_worker
->
SetDumpFieldVector
(
dump_fields_
);
this_worker
->
SetDumpParamVector
(
dump_param_
);
}
this_worker
->
SetPlace
(
place
);
this_worker
->
Initialize
(
trainer_desc
);
this_worker
->
InitRandomDumpConfig
(
trainer_desc
);
}
}
}
}
this_worker
->
SetThreadIndex
(
i
);
param_need_sync_
.
reset
(
this_worker
->
SetSectionIndex
(
i
);
new
std
::
vector
<
std
::
string
>
(
pipeline_config_
.
param_need_sync
().
begin
(),
this_worker
->
SetPlace
(
place
);
pipeline_config_
.
param_need_sync
().
end
()));
this_worker
->
Initialize
(
trainer_desc
);
VLOG
(
3
)
<<
"param_need_sync_ have: "
;
this_worker
->
SetMicrobatchNum
(
num_microbatches_
);
for
(
const
std
::
string
&
name
:
*
param_need_sync_
)
{
VLOG
(
3
)
<<
name
;
}
}
// set debug here
// set debug here
SetDebug
(
trainer_desc
.
debug
());
SetDebug
(
trainer_desc
.
debug
());
...
@@ -140,13 +111,7 @@ std::string PipelineTrainer::GetDumpPath(int tid) {
...
@@ -140,13 +111,7 @@ std::string PipelineTrainer::GetDumpPath(int tid) {
void
PipelineTrainer
::
InitDumpEnv
()
{
void
PipelineTrainer
::
InitDumpEnv
()
{
queue_
=
paddle
::
framework
::
MakeChannel
<
std
::
string
>
();
queue_
=
paddle
::
framework
::
MakeChannel
<
std
::
string
>
();
// Only set dump channel on the last section
// TODO(sandyhouse): should make it as a config
for
(
int
j
=
0
;
j
<
pipeline_num_
;
++
j
)
{
for
(
size_t
k
=
0
;
k
<
workers_
[
section_num_
-
1
][
j
].
size
();
++
k
)
{
workers_
[
section_num_
-
1
][
j
][
k
]
->
SetChannelWriter
(
queue_
.
get
());
}
}
// TODO(hutuxian): should make it as a config
dump_thread_num_
=
1
;
dump_thread_num_
=
1
;
for
(
int
i
=
0
;
i
<
dump_thread_num_
;
i
++
)
{
for
(
int
i
=
0
;
i
<
dump_thread_num_
;
i
++
)
{
dump_thread_
.
push_back
(
dump_thread_
.
push_back
(
...
@@ -154,150 +119,105 @@ void PipelineTrainer::InitDumpEnv() {
...
@@ -154,150 +119,105 @@ void PipelineTrainer::InitDumpEnv() {
}
}
}
}
void
PipelineTrainer
::
InitFirstScopeQueue
(
ScopeQueue
*
scope_queue
,
void
PipelineTrainer
::
CopyParameters
(
int
section_id
,
int
microbatch_id
,
int
pipeline_id
,
const
ProgramDesc
&
program
,
const
ProgramDesc
&
main_program
,
const
platform
::
Place
&
place
)
{
const
Scope
&
root_scope
)
{
auto
&
global_block
=
program
.
Block
(
0
);
for
(
int
i
=
0
;
i
<
scope_queue_size_
;
++
i
)
{
for
(
auto
&
var
:
global_block
.
AllVars
())
{
Scope
*
scope
=
&
pipeline_scopes_
[
pipeline_id
]
->
NewScope
();
int
is_feed_var
=
for
(
auto
&
var
:
main_program
.
Block
(
0
).
AllVars
())
{
std
::
count
(
feed_var_names_
.
begin
(),
feed_var_names_
.
end
(),
var
->
Name
());
if
(
!
var
->
Persistable
())
{
if
((
var
->
Persistable
()
||
is_feed_var
)
&&
microbatch_id
==
0
)
{
auto
*
ptr
=
scope
->
Var
(
var
->
Name
());
if
(
is_feed_var
)
{
InitializeVariable
(
ptr
,
var
->
GetType
());
auto
*
new_ptr
=
minibatch_scopes_
[
section_id
]
->
Var
(
var
->
Name
());
VLOG
(
3
)
<<
"data name: "
<<
var
->
Name
()
<<
", ptr: "
<<
new_ptr
;
InitializeVariable
(
new_ptr
,
var
->
GetType
());
}
else
{
}
else
{
if
(
section_num_
==
1
)
{
// Means only one section and it must be
auto
*
ptr
=
root_scope_
->
FindVar
(
var
->
Name
());
// CUDAPlace, so copy all persistable vars to
auto
*
new_ptr
=
minibatch_scopes_
[
section_id
]
->
Var
(
var
->
Name
());
// pipeline scope
VLOG
(
3
)
<<
"Create persistable var "
<<
var
->
Name
()
<<
" for minibatch "
const
LoDTensor
&
root_tensor
=
<<
section_id
<<
", which pointer is "
<<
new_ptr
;
root_scope
.
FindVar
(
var
->
Name
())
->
Get
<
LoDTensor
>
();
InitializeVariable
(
new_ptr
,
var
->
GetType
());
LoDTensor
*
gpu_tensor
=
pipeline_scopes_
[
pipeline_id
]
const
LoDTensor
&
root_tensor
=
ptr
->
Get
<
LoDTensor
>
();
->
Var
(
var
->
Name
())
LoDTensor
*
minibatch_tensor
=
new_ptr
->
GetMutable
<
LoDTensor
>
();
->
GetMutable
<
LoDTensor
>
();
TensorCopy
(
*
static_cast
<
const
Tensor
*>
(
&
root_tensor
),
place
,
platform
::
Place
place
=
platform
::
CUDAPlace
(
pipeline_id
);
static_cast
<
Tensor
*>
(
minibatch_tensor
));
TensorCopy
(
*
static_cast
<
const
Tensor
*>
(
&
root_tensor
),
place
,
static_cast
<
Tensor
*>
(
gpu_tensor
));
}
}
}
}
else
if
(
!
var
->
Persistable
()
&&
!
is_feed_var
)
{
auto
*
ptr
=
microbatch_scopes_
[
section_id
][
microbatch_id
]
->
Var
(
var
->
Name
());
VLOG
(
3
)
<<
"Create variable "
<<
var
->
Name
()
<<
" for section "
<<
section_id
<<
" microbatch "
<<
microbatch_id
<<
", which pointer is "
<<
ptr
;
InitializeVariable
(
ptr
,
var
->
GetType
());
}
}
scope_queue
->
Send
(
scope
);
}
}
}
}
void
PipelineTrainer
::
CopyParameters
(
const
Scope
&
root_scope
,
int
pipeline_id
)
{
void
PipelineTrainer
::
GetSkipVars
(
int
section_id
,
const
ProgramDesc
&
program
)
{
for
(
const
std
::
string
&
name
:
*
param_need_sync_
)
{
auto
&
global_block
=
program
.
Block
(
0
);
const
LoDTensor
&
root_tensor
=
root_scope
.
FindVar
(
name
)
->
Get
<
LoDTensor
>
();
for
(
auto
&
op
:
global_block
.
AllOps
())
{
if
(
op
->
Type
()
!=
"enqueue"
)
{
// TODO(hutxian): check a new var of the same name is created in
continue
;
// pipeline_scope
}
LoDTensor
*
gpu_tensor
=
auto
input_arg_names
=
op
->
InputArgumentNames
();
pipeline_scopes_
[
pipeline_id
]
->
Var
(
name
)
->
GetMutable
<
LoDTensor
>
();
PADDLE_ENFORCE_EQ
(
input_arg_names
.
size
(),
1
,
platform
::
Place
place
=
platform
::
CUDAPlace
(
pipeline_id
);
platform
::
errors
::
InvalidArgument
(
TensorCopy
(
*
static_cast
<
const
Tensor
*>
(
&
root_tensor
),
place
,
"Number of input arguments for enqueue op must be 1, "
static_cast
<
Tensor
*>
(
gpu_tensor
));
"but the value is %d."
,
input_arg_names
.
size
()));
std
::
string
input_arg_name
=
input_arg_names
[
0
];
if
(
input_arg_name
.
rfind
(
"@GRAD"
)
!=
input_arg_name
.
size
()
-
5
)
{
skip_vars_
[
section_id
].
emplace_back
(
input_arg_name
);
VLOG
(
3
)
<<
"add skip var name: "
<<
input_arg_name
;
}
}
}
}
}
void
PipelineTrainer
::
InitTrainerEnv
(
const
ProgramDesc
&
main_program
,
void
PipelineTrainer
::
InitTrainerEnv
(
const
ProgramDesc
&
main_program
,
const
platform
::
Place
&
place
)
{
const
platform
::
Place
&
place
)
{
PADDLE_ENFORCE
(
root_scope_
,
"Null root_scope pointer"
);
PADDLE_ENFORCE_NOT_NULL
(
root_scope_
,
SectionWorker
::
cpu_id_
.
store
(
pipeline_config_
.
start_cpu_core_id
());
platform
::
errors
::
InvalidArgument
(
scope_queues_
.
resize
(
section_num_
);
"root_scope pointer can not be nullptr"
));
pipeline_scopes_
.
resize
(
pipeline_num_
);
auto
start_cpu_id
=
trainer_desc_
.
section_param
().
start_cpu_core_id
();
for
(
auto
&
var
:
main_program
.
Block
(
0
).
AllVars
())
{
SectionWorker
::
cpu_id_
.
store
(
start_cpu_id
);
if
(
var
->
Persistable
())
{
minibatch_scopes_
.
resize
(
section_num_
);
persistable_vars_
.
push_back
(
var
->
Name
());
microbatch_scopes_
.
resize
(
section_num_
);
}
skip_vars_
.
resize
(
section_num_
);
}
VLOG
(
3
)
<<
"Init ScopeQueues and create all scopes"
;
VLOG
(
3
)
<<
"Init ScopeQueues and create all scopes"
;
for
(
int
i
=
0
;
i
<
section_num_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
section_num_
;
++
i
)
{
for
(
int
j
=
0
;
j
<
pipeline_num_
;
++
j
)
{
minibatch_scopes_
[
i
]
=
&
root_scope_
->
NewScope
();
scope_queues_
[
i
].
emplace_back
(
new
ScopeQueue
(
scope_queue_size_
))
;
std
::
shared_ptr
<
framework
::
ProgramDesc
>
program
;
if
(
i
==
0
)
{
program
.
reset
(
new
ProgramDesc
(
pipeline_scopes_
[
j
]
=
&
root_scope_
->
NewScope
(
);
trainer_desc_
.
section_param
().
section_config
(
i
).
program_desc
())
);
CopyParameters
(
*
root_scope_
,
j
);
microbatch_scopes_
[
i
].
resize
(
num_microbatches_
);
InitFirstScopeQueue
(
scope_queues_
[
0
].
back
().
get
(),
j
,
main_program
,
for
(
int
j
=
0
;
j
<
num_microbatches_
;
++
j
)
{
*
root_scope_
);
microbatch_scopes_
[
i
][
j
]
=
&
minibatch_scopes_
[
i
]
->
NewScope
(
);
}
CopyParameters
(
i
,
j
,
*
program
,
places_
[
i
]);
}
}
GetSkipVars
(
i
,
*
program
);
}
}
for
(
int
i
=
0
;
i
<
section_num_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
section_num_
;
++
i
)
{
for
(
int
j
=
0
;
j
<
pipeline_num_
;
++
j
)
{
auto
this_worker
=
for
(
size_t
k
=
0
;
k
<
workers_
[
i
][
j
].
size
();
++
k
)
{
std
::
dynamic_pointer_cast
<
paddle
::
framework
::
SectionWorker
>
(
auto
this_worker
=
workers_
[
i
]);
std
::
dynamic_pointer_cast
<
paddle
::
framework
::
SectionWorker
>
(
this_worker
->
SetRootScope
(
root_scope_
);
workers_
[
i
][
j
][
k
]);
this_worker
->
SetMinibatchScope
(
minibatch_scopes_
[
i
]);
this_worker
->
SetRootScope
(
root_scope_
);
this_worker
->
SetMicrobatchScopes
(
microbatch_scopes_
[
i
]);
this_worker
->
SetCountMutex
(
worker_count_mutex_
[
i
][
j
].
get
());
this_worker
->
SetSkipVars
(
skip_vars_
[
i
]);
this_worker
->
SetWorkerCount
(
worker_count_
[
i
][
j
]);
this_worker
->
SetScopeQueue
(
scope_queues_
[
i
][
j
].
get
(),
(
i
==
section_num_
-
1
)
?
scope_queues_
[
0
][
j
].
get
()
:
scope_queues_
[
i
+
1
][
j
].
get
());
this_worker
->
SetVarNames
(
*
in_var_names_
[
i
],
*
out_var_names_
[
i
]);
if
(
i
!=
section_num_
-
1
)
{
// For data copy in adjacent different place
this_worker
->
SetNextSectionPlace
(
std
::
dynamic_pointer_cast
<
paddle
::
framework
::
SectionWorker
>
(
workers_
[
i
+
1
][
j
][
0
])
->
place
());
}
}
}
}
if
(
pipeline_num_
>
1
&&
sync_steps_
!=
-
1
)
{
construct_sync_functor
();
}
}
void
PipelineTrainer
::
construct_sync_functor
()
{
std
::
vector
<
platform
::
Place
>
cuda_places
;
for
(
int
i
=
0
;
i
<
pipeline_num_
;
++
i
)
{
cuda_places
.
emplace_back
(
platform
::
CUDAPlace
(
i
));
}
nccl_ctx_map_
.
reset
(
new
platform
::
NCCLContextMap
(
cuda_places
));
sync_functors_
.
resize
(
pipeline_num_
);
SyncFunctor
::
sync_flag_
=
0
;
SyncFunctor
::
pipeline_scopes_
.
resize
(
0
);
for
(
int
j
=
0
;
j
<
pipeline_num_
;
++
j
)
{
SyncFunctor
*
sync_function
=
new
SyncFunctor
(
j
,
pipeline_num_
,
sync_steps_
);
sync_function
->
SetSyncParam
(
*
param_need_sync_
);
sync_function
->
SetNcclCtxMap
(
nccl_ctx_map_
.
get
());
SyncFunctor
::
pipeline_scopes_
.
push_back
(
this
->
pipeline_scopes_
[
j
]);
sync_functors_
[
j
].
reset
(
sync_function
);
}
for
(
int
i
=
section_num_
-
1
;
i
>=
0
;
--
i
)
{
if
(
SectionConfig
::
CUDAPlace
==
pipeline_config_
.
section_config
(
i
).
place
())
{
for
(
int
j
=
0
;
j
<
pipeline_num_
;
++
j
)
{
for
(
size_t
k
=
0
;
k
<
workers_
[
i
][
j
].
size
();
++
k
)
{
auto
this_worker
=
std
::
dynamic_pointer_cast
<
paddle
::
framework
::
SectionWorker
>
(
workers_
[
i
][
j
][
k
]);
this_worker
->
SetSyncFunctor
(
sync_functors_
[
j
].
get
());
}
}
break
;
}
}
}
}
}
void
PipelineTrainer
::
Run
()
{
void
PipelineTrainer
::
Run
()
{
VLOG
(
3
)
<<
"Going to run"
;
VLOG
(
3
)
<<
"Going to run"
;
for
(
int
i
=
0
;
i
<
section_num_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
section_num_
;
++
i
)
{
for
(
int
j
=
0
;
j
<
pipeline_num_
;
++
j
)
{
if
(
!
debug_
)
{
for
(
size_t
k
=
0
;
k
<
workers_
[
i
][
j
].
size
();
++
k
)
{
section_threads_
.
push_back
(
if
(
!
debug_
)
{
std
::
thread
(
&
DeviceWorker
::
TrainFiles
,
workers_
[
i
].
get
()));
section_threads_
.
push_back
(
}
else
{
std
::
thread
(
&
DeviceWorker
::
TrainFiles
,
workers_
[
i
][
j
][
k
].
get
()));
section_threads_
.
push_back
(
std
::
thread
(
}
else
{
&
DeviceWorker
::
TrainFilesWithProfiler
,
workers_
[
i
].
get
()));
section_threads_
.
push_back
(
std
::
thread
(
&
DeviceWorker
::
TrainFilesWithProfiler
,
workers_
[
i
][
j
][
k
].
get
()));
}
}
}
}
}
}
}
}
...
@@ -309,18 +229,31 @@ void PipelineTrainer::Finalize() {
...
@@ -309,18 +229,31 @@ void PipelineTrainer::Finalize() {
if
(
need_dump_field_
)
{
if
(
need_dump_field_
)
{
FinalizeDumpEnv
();
FinalizeDumpEnv
();
}
}
for
(
const
auto
&
var
:
persistable_vars_
)
{
VLOG
(
3
)
<<
"copying back parameters. "
;
auto
*
root_tensor
=
root_scope_
->
Var
(
var
)
->
GetMutable
<
LoDTensor
>
();
for
(
int
i
=
0
;
i
<
section_num_
;
++
i
)
{
// TODO(hutuxian): Add a final all-reduce?
std
::
shared_ptr
<
framework
::
ProgramDesc
>
program
;
const
auto
&
thread_tensor
=
program
.
reset
(
new
ProgramDesc
(
pipeline_scopes_
[
0
]
->
FindVar
(
var
)
->
Get
<
LoDTensor
>
();
trainer_desc_
.
section_param
().
section_config
(
i
).
program_desc
()));
TensorCopySync
(
thread_tensor
,
platform
::
CPUPlace
(),
root_tensor
);
for
(
int
j
=
0
;
j
<
num_microbatches_
;
++
j
)
{
auto
&
global_block
=
program
->
Block
(
0
);
for
(
auto
&
var
:
global_block
.
AllVars
())
{
if
(
var
->
Persistable
())
{
auto
*
ptr
=
root_scope_
->
FindVar
(
var
->
Name
());
LoDTensor
*
root_tensor
=
ptr
->
GetMutable
<
LoDTensor
>
();
auto
*
minibatch_ptr
=
minibatch_scopes_
[
i
]
->
Var
(
var
->
Name
());
const
LoDTensor
&
minibatch_tensor
=
minibatch_ptr
->
Get
<
LoDTensor
>
();
TensorCopy
(
*
static_cast
<
const
Tensor
*>
(
&
minibatch_tensor
),
places_
[
0
],
static_cast
<
Tensor
*>
(
root_tensor
));
VLOG
(
4
)
<<
"Copy persitable var "
<<
var
->
Name
()
<<
" to root scope"
;
}
}
}
}
}
root_scope_
->
DropKids
();
root_scope_
->
DropKids
();
}
}
Scope
*
PipelineTrainer
::
GetWorkerScope
(
int
thread_id
)
{
Scope
*
PipelineTrainer
::
GetWorkerScope
(
int
thread_id
)
{
return
pipeline_scopes_
[
thread_id
];
return
microbatch_scopes_
[
thread_id
][
0
];
}
}
}
// end namespace framework
}
// end namespace framework
...
...
paddle/fluid/framework/section_worker.cc
浏览文件 @
e39aa70e
...
@@ -10,6 +10,11 @@ See the License for the specific language governing permissions and
...
@@ -10,6 +10,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL)
#include <float.h>
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/program_desc.h"
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
#include "google/protobuf/message.h"
#include "google/protobuf/text_format.h"
#include "google/protobuf/text_format.h"
...
@@ -25,82 +30,17 @@ limitations under the License. */
...
@@ -25,82 +30,17 @@ limitations under the License. */
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
uint64_t
SyncFunctor
::
sync_flag_
=
0
;
std
::
vector
<
Scope
*>
SyncFunctor
::
pipeline_scopes_
;
SyncFunctor
::
SyncFunctor
(
int
rank_id
,
int
rank_num
,
int
sync_steps
)
:
rank_id_
(
rank_id
),
rank_num_
(
rank_num
),
sync_steps_
(
sync_steps
)
{
PADDLE_ENFORCE
(
rank_num
>
1
,
"rank_num should larger than 1"
);
counter_
=
0
;
sync_signal_
=
0
;
uint8_t
*
ptr
=
reinterpret_cast
<
uint8_t
*>
(
&
sync_signal_
);
for
(
int
i
=
0
;
i
<
rank_num_
;
++
i
)
{
ptr
[
i
]
=
0xFF
;
}
}
int
SyncFunctor
::
operator
()(
Scope
*
scope
)
{
++
counter_
;
if
(
counter_
<
sync_steps_
)
{
return
0
;
}
if
(
counter_
==
sync_steps_
)
{
reinterpret_cast
<
uint8_t
*>
(
&
sync_flag_
)[
rank_id_
]
=
0xFF
;
}
if
(
sync_flag_
==
sync_signal_
)
{
static
std
::
mutex
mutex
;
if
(
mutex
.
try_lock
())
{
if
(
sync_flag_
==
sync_signal_
)
{
Synchronize
();
sync_flag_
=
0
;
}
mutex
.
unlock
();
}
}
if
(
sync_flag_
==
0
)
{
counter_
=
0
;
}
return
0
;
}
void
SyncFunctor
::
Synchronize
()
{
for
(
const
std
::
string
&
name
:
*
sync_param_
)
{
platform
::
NCCLGroupGuard
guard
;
for
(
int
i
=
0
;
i
<
rank_num_
;
++
i
)
{
const
platform
::
NCCLContext
&
nccl_ctx
=
nccl_ctx_map_
->
at
(
i
);
LoDTensor
*
tensor
=
pipeline_scopes_
[
i
]
->
Var
(
name
)
->
GetMutable
<
LoDTensor
>
();
// TODO(hutuxian): do not depend on data type explicitly
float
*
data
=
tensor
->
mutable_data
<
float
>
(
nccl_ctx_map_
->
DevCtx
(
i
)
->
GetPlace
());
const
int
numel
=
tensor
->
numel
();
paddle
::
framework
::
AttributeMap
attrs
;
attrs
.
insert
({
"scale"
,
static_cast
<
float
>
(
1.
/
rank_num_
)});
auto
scale_op
=
framework
::
OpRegistry
::
CreateOp
(
"scale"
,
{{
"X"
,
{
name
}}},
{{
"Out"
,
{
name
}}},
attrs
);
scale_op
->
Run
(
*
(
pipeline_scopes_
[
i
]),
nccl_ctx_map_
->
DevCtx
(
i
)
->
GetPlace
());
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclAllReduce
(
data
,
data
,
numel
,
ncclFloat
,
ncclSum
,
nccl_ctx
.
comm
(),
dynamic_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
platform
::
CUDAPlace
(
i
)))
->
stream
()));
}
}
nccl_ctx_map_
->
WaitAll
();
}
std
::
atomic
<
int
>
SectionWorker
::
cpu_id_
(
0
);
std
::
atomic
<
int
>
SectionWorker
::
cpu_id_
(
0
);
std
::
mutex
SectionWorker
::
thread_mutex
;
std
::
condition_variable
SectionWorker
::
thread_condition
;
bool
SectionWorker
::
threads_completed
=
false
;
uint64_t
SectionWorker
::
batch_id_
(
0
);
void
SectionWorker
::
Initialize
(
const
TrainerDesc
&
desc
)
{
void
SectionWorker
::
Initialize
(
const
TrainerDesc
&
desc
)
{
dev_ctx_
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
place_
);
dev_ctx_
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
place_
);
std
::
shared_ptr
<
framework
::
ProgramDesc
>
program
;
program_
.
reset
(
new
ProgramDesc
(
program
.
reset
(
new
ProgramDesc
(
desc
.
section_param
().
section_config
(
section_id_
).
program_desc
()));
desc
.
section_param
().
section_config
(
section_id_
).
program_desc
()));
for
(
auto
&
op_desc
:
program
->
Block
(
0
).
AllOps
())
{
for
(
auto
&
op_desc
:
program
_
->
Block
(
0
).
AllOps
())
{
ops_
.
push_back
(
OpRegistry
::
CreateOp
(
*
op_desc
));
ops_
.
push_back
(
OpRegistry
::
CreateOp
(
*
op_desc
));
}
}
}
}
...
@@ -136,314 +76,494 @@ void SectionWorker::AutoSetCPUAffinity(bool reuse) {
...
@@ -136,314 +76,494 @@ void SectionWorker::AutoSetCPUAffinity(bool reuse) {
(
0
==
CPU_ISSET
(
proc
,
&
mask
)))
{
(
0
==
CPU_ISSET
(
proc
,
&
mask
)))
{
LOG
(
WARNING
)
<<
"Fail to set thread affinity to CPU "
<<
proc
;
LOG
(
WARNING
)
<<
"Fail to set thread affinity to CPU "
<<
proc
;
}
}
SEC_LOG
<<
"Set "
<<
thread_cpu_id
<<
"th thread affinity to CPU "
<<
proc
;
VLOG
(
3
)
<<
"Set "
<<
thread_cpu_id
<<
"th thread affinity to CPU "
<<
proc
;
}
}
void
SectionWorker
::
TrainFiles
()
{
void
SectionWorker
::
TrainFiles
()
{
SEC_LOG
<<
"begin section_worker TrainFiles"
;
VLOG
(
3
)
<<
"begin section_worker TrainFiles"
;
AutoSetCPUAffinity
(
true
);
AutoSetCPUAffinity
(
true
);
int64_t
step_cnt
=
0
;
int64_t
max_memory_size
=
0
;
int64_t
accum_num
=
0
;
std
::
unique_ptr
<
GarbageCollector
>
gc
;
int
batch_size
=
0
;
auto
unused_vars_
=
GetUnusedVars
(
program_
->
Block
(
0
),
ops_
,
skip_vars_
);
Scope
*
scope
=
nullptr
;
#ifdef PADDLE_WITH_CUDA
if
(
device_reader_
!=
nullptr
)
{
if
(
platform
::
is_gpu_place
(
place_
))
{
device_reader_
->
Start
();
if
(
IsFastEagerDeletionModeEnabled
())
{
}
gc
.
reset
(
new
UnsafeFastGPUGarbageCollector
(
while
(
in_scope_queue_
->
Receive
(
&
scope
))
{
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place_
),
max_memory_size
));
if
(
device_reader_
!=
nullptr
)
{
device_reader_
->
AssignFeedVar
(
*
scope
);
batch_size
=
device_reader_
->
Next
();
if
(
batch_size
<=
0
)
{
break
;
}
SEC_LOG
<<
"read batch size: "
<<
batch_size
;
}
else
{
}
else
{
// TODO(hutuxian): Keep batch_size in scope? Or is there a better way to
gc
.
reset
(
new
DefaultStreamGarbageCollector
(
// fetch batch_size? Some variables may not have batch_size.
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place_
),
max_memory_size
));
PADDLE_ENFORCE
(
in_var_names_
->
size
(),
"Section without a reader or in variable is not supported by now"
);
const
LoDTensor
&
tensor
=
scope
->
FindVar
(
in_var_names_
->
at
(
0
))
->
Get
<
LoDTensor
>
();
batch_size
=
tensor
.
lod
().
size
()
?
tensor
.
lod
()[
0
].
size
()
-
1
:
tensor
.
dims
()[
0
];
SEC_LOG
<<
"input batch size: "
<<
batch_size
;
}
}
}
else
if
(
platform
::
is_cpu_place
(
place_
))
{
#endif
gc
.
reset
(
new
CPUGarbageCollector
(
BOOST_GET_CONST
(
platform
::
CPUPlace
,
place_
),
max_memory_size
));
#ifdef PADDLE_WITH_CUDA
}
#endif
Scope
*
exe_scope
=
scope
;
if
(
thread_id_
==
0
)
{
if
(
section_id_
>
0
&&
platform
::
is_gpu_place
(
place_
))
{
while
(
true
)
{
SEC_LOG
<<
"CPU2GPU memory copy"
;
// Start a minibatch.
for
(
int
i
=
0
;
i
<
num_microbatches_
;
++
i
)
{
if
(
scope
->
kids
().
empty
())
{
try
{
exe_scope
=
&
scope
->
NewScope
();
for
(
auto
&
op
:
ops_
)
{
}
else
{
int
op_role
=
op
->
Attr
<
int
>
(
std
::
string
(
"op_role"
));
exe_scope
=
scope
->
kids
().
front
();
// We run op with op_role = kLRSched only for the first microbatch
PADDLE_ENFORCE
(
scope
->
kids
().
size
()
==
1
,
"scope->kids().size(): %zu"
,
// to avoid increasing the @LR_DECAY_STEP@ multiple times.
scope
->
kids
().
size
());
bool
run_first_mbatch
=
op_role
==
static_cast
<
int
>
(
OpRole
::
kForward
)
||
op_role
==
(
static_cast
<
int
>
(
OpRole
::
kForward
)
|
static_cast
<
int
>
(
OpRole
::
kLoss
))
||
op_role
==
static_cast
<
int
>
(
OpRole
::
kLRSched
);
bool
run_others
=
op_role
==
static_cast
<
int
>
(
OpRole
::
kForward
)
||
op_role
==
(
static_cast
<
int
>
(
OpRole
::
kForward
)
|
static_cast
<
int
>
(
OpRole
::
kLoss
));
if
((
i
==
0
&&
run_first_mbatch
)
||
(
i
!=
0
&&
run_others
))
{
VLOG
(
3
)
<<
"running an op "
<<
op
->
Type
()
<<
" for "
<<
thread_id_
<<
" for scope "
<<
i
;
op
->
Run
(
*
microbatch_scopes_
[
i
],
place_
);
if
(
gc
)
{
DeleteUnusedTensors
(
*
microbatch_scopes_
[
i
],
op
.
get
(),
unused_vars_
,
gc
.
get
());
}
}
}
}
catch
(
platform
::
EOFException
&
)
{
std
::
unique_lock
<
std
::
mutex
>
lk
(
thread_mutex
);
threads_completed
=
true
;
VLOG
(
3
)
<<
"thread "
<<
thread_id_
<<
" completed."
;
VLOG
(
3
)
<<
"called notify all"
;
thread_condition
.
notify_all
();
VLOG
(
0
)
<<
"EOF encountered"
;
return
;
}
if
(
i
==
0
)
{
VLOG
(
3
)
<<
"called notify all"
;
std
::
unique_lock
<
std
::
mutex
>
lk
(
thread_mutex
);
batch_id_
+=
1
;
thread_condition
.
notify_all
();
}
}
}
// backward pass
for
(
const
std
::
string
&
name
:
*
in_var_names_
)
{
for
(
int
i
=
0
;
i
<
num_microbatches_
;
++
i
)
{
const
LoDTensor
&
src_tensor
=
scope
->
FindVar
(
name
)
->
Get
<
LoDTensor
>
();
for
(
auto
&
op
:
ops_
)
{
if
(
platform
::
is_gpu_place
(
src_tensor
.
place
()))
{
int
op_role
=
op
->
Attr
<
int
>
(
std
::
string
(
"op_role"
));
continue
;
if
(
op_role
==
static_cast
<
int
>
(
OpRole
::
kBackward
)
||
op_role
==
(
static_cast
<
int
>
(
OpRole
::
kBackward
)
|
static_cast
<
int
>
(
OpRole
::
kLoss
)))
{
VLOG
(
3
)
<<
"running an op "
<<
op
->
Type
()
<<
" for "
<<
thread_id_
<<
" for scope "
<<
i
;
op
->
Run
(
*
microbatch_scopes_
[
i
],
place_
);
if
(
gc
)
{
DeleteUnusedTensors
(
*
microbatch_scopes_
[
i
],
op
.
get
(),
unused_vars_
,
gc
.
get
());
}
}
}
}
LoDTensor
*
gpu_tensor
=
exe_scope
->
Var
(
name
)
->
GetMutable
<
LoDTensor
>
();
gpu_tensor
->
set_lod
(
src_tensor
.
lod
());
TensorCopy
(
*
static_cast
<
const
Tensor
*>
(
&
src_tensor
),
place_
,
*
dev_ctx_
,
static_cast
<
Tensor
*>
(
gpu_tensor
));
}
}
}
// update pass
for
(
auto
&
op
:
ops_
)
{
SEC_LOG
<<
"begin running ops"
;
int
op_role
=
op
->
Attr
<
int
>
(
std
::
string
(
"op_role"
));
if
(
op_role
==
static_cast
<
int
>
(
OpRole
::
kOptimize
))
{
for
(
auto
&
op
:
ops_
)
{
VLOG
(
3
)
<<
"running an op "
<<
op
->
Type
()
<<
" for "
<<
thread_id_
op
->
Run
(
*
exe_scope
,
place_
);
<<
" for minibatch scope"
;
}
op
->
Run
(
*
microbatch_scopes_
[
0
],
place_
);
exe_scope
->
DropKids
();
if
(
gc
)
{
// Wait for GPU calc finising, as the cudaMemcpy and GPU calc may be in
DeleteUnusedTensors
(
*
microbatch_scopes_
[
num_microbatches_
-
1
],
// different streams
op
.
get
(),
unused_vars_
,
gc
.
get
());
// No effect when it is a CPUDeviceContext
}
dev_ctx_
->
Wait
();
}
#ifdef PADDLE_WITH_BOX_PS
auto
box_ptr
=
BoxWrapper
::
GetInstance
();
auto
&
metric_list
=
box_ptr
->
GetMetricList
();
for
(
auto
iter
=
metric_list
.
begin
();
iter
!=
metric_list
.
end
();
iter
++
)
{
auto
*
metric_msg
=
iter
->
second
;
if
(
box_ptr
->
Phase
()
!=
metric_msg
->
MetricPhase
())
{
continue
;
}
}
metric_msg
->
add_data
(
exe_scope
);
dev_ctx_
->
Wait
(
);
}
}
#endif
}
else
{
if
(
section_id_
!=
section_num_
-
1
&&
platform
::
is_gpu_place
(
place_
))
{
while
(
true
)
{
// FIXME: Temporarily we assume two adjacent sections are in different
{
// places,
PADDLE_ENFORCE_LE
(
// and we do data transformation only in sections in GPU place, so the
local_batch_id_
,
batch_id_
,
// data is
platform
::
errors
::
InvalidArgument
(
// transform from GPU to CPU
"local_batch_id_ (%d) must be less than or equal to "
// A better way to handle such a data transformation is to record each
"batch_id_ (%d)"
,
// place of
local_batch_id_
,
batch_id_
));
// joint-out variables, and do transform as required
std
::
unique_lock
<
std
::
mutex
>
lk
(
thread_mutex
);
if
(
local_batch_id_
==
batch_id_
&&
!
threads_completed
)
{
SEC_LOG
<<
"GPU2CPU memory copy"
;
thread_condition
.
wait
(
lk
);
}
for
(
const
std
::
string
&
name
:
*
out_var_names_
)
{
VLOG
(
3
)
<<
"thread "
<<
thread_id_
<<
" local_batch_id_ "
const
LoDTensor
&
src_tensor
=
<<
local_batch_id_
<<
" batch_id_ "
<<
batch_id_
;
exe_scope
->
FindVar
(
name
)
->
Get
<
LoDTensor
>
();
if
(
threads_completed
)
{
LoDTensor
*
dst_tensor
=
scope
->
Var
(
name
)
->
GetMutable
<
LoDTensor
>
();
VLOG
(
3
)
<<
"thread "
<<
thread_id_
<<
" completed."
;
dst_tensor
->
set_lod
(
src_tensor
.
lod
());
lk
.
unlock
();
TensorCopy
(
*
static_cast
<
const
Tensor
*>
(
&
src_tensor
),
threads_completed
=
false
;
next_section_place_
,
*
dev_ctx_
,
return
;
static_cast
<
Tensor
*>
(
dst_tensor
));
}
lk
.
unlock
();
local_batch_id_
+=
1
;
}
}
// forward pass:
for
(
int
i
=
0
;
i
<
num_microbatches_
;
++
i
)
{
for
(
auto
&
op
:
ops_
)
{
int
op_role
=
op
->
Attr
<
int
>
(
std
::
string
(
"op_role"
));
// We run op with op_role = kLRSched only for the first microbatch
// to avoid increasing the @LR_DECAY_STEP@ multiple times.
bool
run_first_mbatch
=
op_role
==
static_cast
<
int
>
(
OpRole
::
kForward
)
||
op_role
==
(
static_cast
<
int
>
(
OpRole
::
kForward
)
|
static_cast
<
int
>
(
OpRole
::
kLoss
))
||
op_role
==
static_cast
<
int
>
(
OpRole
::
kLRSched
);
bool
run_others
=
op_role
==
static_cast
<
int
>
(
OpRole
::
kForward
)
||
op_role
==
(
static_cast
<
int
>
(
OpRole
::
kForward
)
|
static_cast
<
int
>
(
OpRole
::
kLoss
));
if
((
i
==
0
&&
run_first_mbatch
)
||
(
i
!=
0
&&
run_others
))
{
VLOG
(
3
)
<<
"running an op "
<<
op
->
Type
()
<<
" for "
<<
thread_id_
<<
" for scope "
<<
i
;
op
->
Run
(
*
microbatch_scopes_
[
i
],
place_
);
if
(
gc
)
{
DeleteUnusedTensors
(
*
microbatch_scopes_
[
i
],
op
.
get
(),
unused_vars_
,
gc
.
get
());
}
}
}
}
// backward pass
for
(
int
i
=
0
;
i
<
num_microbatches_
;
++
i
)
{
for
(
auto
&
op
:
ops_
)
{
int
op_role
=
op
->
Attr
<
int
>
(
std
::
string
(
"op_role"
));
if
(
op_role
==
static_cast
<
int
>
(
OpRole
::
kBackward
)
||
op_role
==
(
static_cast
<
int
>
(
OpRole
::
kBackward
)
|
static_cast
<
int
>
(
OpRole
::
kLoss
)))
{
VLOG
(
3
)
<<
"running an op "
<<
op
->
Type
()
<<
" for "
<<
thread_id_
<<
" for scope "
<<
i
;
op
->
Run
(
*
microbatch_scopes_
[
i
],
place_
);
if
(
gc
)
{
DeleteUnusedTensors
(
*
microbatch_scopes_
[
i
],
op
.
get
(),
unused_vars_
,
gc
.
get
());
}
}
}
}
// update pass
for
(
auto
&
op
:
ops_
)
{
int
op_role
=
op
->
Attr
<
int
>
(
std
::
string
(
"op_role"
));
if
(
op_role
==
static_cast
<
int
>
(
OpRole
::
kOptimize
))
{
VLOG
(
3
)
<<
"running an op "
<<
op
->
Type
()
<<
" for "
<<
thread_id_
<<
" for minibatch scope"
;
op
->
Run
(
*
microbatch_scopes_
[
0
],
place_
);
if
(
gc
)
{
DeleteUnusedTensors
(
*
microbatch_scopes_
[
num_microbatches_
-
1
],
op
.
get
(),
unused_vars_
,
gc
.
get
());
}
}
}
dev_ctx_
->
Wait
();
}
}
out_scope_queue_
->
Send
(
scope
);
if
(
sync_func_
)
{
(
*
sync_func_
)(
scope
);
}
++
step_cnt
;
accum_num
+=
batch_size
;
}
worker_count_mutex_
->
lock
();
--
(
*
worker_count_
);
worker_count_mutex_
->
unlock
();
if
(
*
worker_count_
<=
0
)
{
while
(
section_id_
<
section_num_
-
1
&&
out_scope_queue_
->
Size
())
{
sleep
(
1
);
}
out_scope_queue_
->
Close
();
}
}
}
}
void
SectionWorker
::
TrainFilesWithProfiler
()
{
void
SectionWorker
::
TrainFilesWithProfiler
()
{
SEC_LOG
<<
"begin section_worker TrainFiles with profiler"
;
VLOG
(
3
)
<<
"begin section_worker TrainFiles with profiler"
;
AutoSetCPUAffinity
(
true
);
AutoSetCPUAffinity
(
true
);
int64_t
step_cnt
=
0
;
platform
::
Timer
batch_timer
;
int64_t
accum_num
=
0
;
platform
::
Timer
timeline
;
int
batch_size
=
0
;
Scope
*
scope
=
nullptr
;
platform
::
Timer
reader_timer
;
platform
::
Timer
cal_timer
;
platform
::
Timer
trans_timer
;
platform
::
Timer
sync_timer
;
platform
::
Timer
main_timer
;
platform
::
Timer
outer_timer
;
std
::
vector
<
double
>
op_total_time
;
std
::
vector
<
double
>
op_total_time
;
std
::
vector
<
std
::
string
>
op_name
;
std
::
vector
<
std
::
string
>
op_name
;
std
::
vector
<
double
>
op_max_time
;
std
::
vector
<
double
>
op_min_time
;
std
::
vector
<
uint64_t
>
op_count
;
for
(
auto
&
op
:
ops_
)
{
for
(
auto
&
op
:
ops_
)
{
op_name
.
push_back
(
op
->
Type
());
op_name
.
push_back
(
op
->
Type
());
}
}
op_total_time
.
resize
(
ops_
.
size
());
op_total_time
.
resize
(
ops_
.
size
());
for
(
size_t
i
=
0
;
i
<
op_total_time
.
size
();
++
i
)
{
op_max_time
.
resize
(
ops_
.
size
());
op_total_time
[
i
]
=
0.0
;
op_min_time
.
resize
(
ops_
.
size
());
}
for
(
size_t
i
=
0
;
i
<
op_min_time
.
size
();
++
i
)
{
platform
::
Timer
timeline
;
op_min_time
[
i
]
=
DBL_MAX
;
if
(
device_reader_
!=
nullptr
)
{
device_reader_
->
Start
();
}
}
op_count
.
resize
(
ops_
.
size
());
bool
started
=
false
;
while
(
in_scope_queue_
->
Receive
(
&
scope
))
{
int64_t
max_memory_size
=
0
;
if
(
UNLIKELY
(
!
started
))
{
std
::
unique_ptr
<
GarbageCollector
>
gc
;
outer_timer
.
Start
();
// const std::vector<std::string> keep_vars;
started
=
true
;
auto
unused_vars_
=
GetUnusedVars
(
program_
->
Block
(
0
),
ops_
,
skip_vars_
);
}
#ifdef PADDLE_WITH_CUDA
main_timer
.
Resume
();
if
(
platform
::
is_gpu_place
(
place_
))
{
if
(
IsFastEagerDeletionModeEnabled
())
{
if
(
device_reader_
!=
nullptr
)
{
gc
.
reset
(
new
UnsafeFastGPUGarbageCollector
(
reader_timer
.
Resume
();
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place_
),
max_memory_size
));
device_reader_
->
AssignFeedVar
(
*
scope
);
batch_size
=
device_reader_
->
Next
();
reader_timer
.
Pause
();
if
(
batch_size
<=
0
)
{
break
;
}
SEC_LOG
<<
"read batch size: "
<<
batch_size
;
}
else
{
}
else
{
PADDLE_ENFORCE
(
gc
.
reset
(
new
DefaultStreamGarbageCollector
(
in_var_names_
->
size
(),
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
place_
),
max_memory_size
));
"Section without a reader or in variable is not supported by now"
);
const
LoDTensor
&
tensor
=
scope
->
FindVar
(
in_var_names_
->
at
(
0
))
->
Get
<
LoDTensor
>
();
batch_size
=
tensor
.
lod
().
size
()
?
tensor
.
lod
()[
0
].
size
()
-
1
:
tensor
.
dims
()[
0
];
SEC_LOG
<<
"input batch size: "
<<
batch_size
;
}
}
}
else
if
(
platform
::
is_cpu_place
(
place_
))
{
#endif
gc
.
reset
(
new
CPUGarbageCollector
(
BOOST_GET_CONST
(
platform
::
CPUPlace
,
place_
),
max_memory_size
));
#ifdef PADDLE_WITH_CUDA
}
#endif
Scope
*
exe_scope
=
scope
;
if
(
thread_id_
==
0
)
{
if
(
section_id_
>
0
&&
platform
::
is_gpu_place
(
place_
))
{
while
(
true
)
{
SEC_LOG
<<
"CPU2GPU memory copy"
;
// Start a minibatch.
trans_timer
.
Resume
();
// int batch_size = 0;
if
(
scope
->
kids
().
empty
())
{
batch_timer
.
Start
();
exe_scope
=
&
scope
->
NewScope
();
for
(
int
i
=
0
;
i
<
num_microbatches_
;
++
i
)
{
}
else
{
try
{
exe_scope
=
scope
->
kids
().
front
();
int
op_idx
=
0
;
PADDLE_ENFORCE
(
scope
->
kids
().
size
()
==
1
,
"scope->kids().size(): %zu"
,
for
(
auto
&
op
:
ops_
)
{
scope
->
kids
().
size
());
int
op_role
=
op
->
Attr
<
int
>
(
std
::
string
(
"op_role"
));
// We run op with op_role = kLRSched only for the first microbatch
// to avoid increasing the @LR_DECAY_STEP@ multiple times.
bool
run_first_mbatch
=
op_role
==
static_cast
<
int
>
(
OpRole
::
kForward
)
||
op_role
==
(
static_cast
<
int
>
(
OpRole
::
kForward
)
|
static_cast
<
int
>
(
OpRole
::
kLoss
))
||
op_role
==
static_cast
<
int
>
(
OpRole
::
kLRSched
);
bool
run_others
=
op_role
==
static_cast
<
int
>
(
OpRole
::
kForward
)
||
op_role
==
(
static_cast
<
int
>
(
OpRole
::
kForward
)
|
static_cast
<
int
>
(
OpRole
::
kLoss
));
if
((
i
==
0
&&
run_first_mbatch
)
||
(
i
!=
0
&&
run_others
))
{
VLOG
(
3
)
<<
"running an op "
<<
op
->
Type
()
<<
" for "
<<
thread_id_
<<
" for scope "
<<
i
;
timeline
.
Start
();
op
->
Run
(
*
microbatch_scopes_
[
i
],
place_
);
if
(
gc
)
{
DeleteUnusedTensors
(
*
microbatch_scopes_
[
i
],
op
.
get
(),
unused_vars_
,
gc
.
get
());
}
timeline
.
Pause
();
auto
time
=
timeline
.
ElapsedUS
();
op_total_time
[
op_idx
]
+=
time
;
if
(
time
>
op_max_time
[
op_idx
])
{
op_max_time
[
op_idx
]
=
time
;
}
if
(
time
<
op_min_time
[
op_idx
])
{
op_min_time
[
op_idx
]
=
time
;
}
op_count
[
op_idx
]
+=
1
;
op_total_time
[
op_idx
]
+=
time
;
}
op_idx
++
;
}
}
catch
(
platform
::
EOFException
&
)
{
std
::
unique_lock
<
std
::
mutex
>
lk
(
thread_mutex
);
threads_completed
=
true
;
VLOG
(
3
)
<<
"thread "
<<
thread_id_
<<
" completed."
;
VLOG
(
3
)
<<
"called notify all"
;
thread_condition
.
notify_all
();
VLOG
(
0
)
<<
"EOF encountered"
;
VLOG
(
0
)
<<
"============timeline============"
;
for
(
size_t
i
=
0
;
i
<
ops_
.
size
();
++
i
)
{
VLOG
(
0
)
<<
"op: "
<<
op_name
[
i
]
<<
", max_time: "
<<
op_max_time
[
i
]
<<
", min_time: "
<<
op_min_time
[
i
]
<<
", mean_time: "
<<
op_total_time
[
i
]
/
op_count
[
i
];
}
VLOG
(
0
)
<<
"================================"
;
return
;
}
if
(
i
==
0
)
{
VLOG
(
3
)
<<
"called notify all"
;
std
::
unique_lock
<
std
::
mutex
>
lk
(
thread_mutex
);
batch_id_
+=
1
;
thread_condition
.
notify_all
();
}
}
}
// backward pass
for
(
const
std
::
string
&
name
:
*
in_var_names_
)
{
for
(
int
i
=
0
;
i
<
num_microbatches_
;
++
i
)
{
const
LoDTensor
&
src_tensor
=
scope
->
FindVar
(
name
)
->
Get
<
LoDTensor
>
();
int
op_idx
=
0
;
if
(
platform
::
is_gpu_place
(
src_tensor
.
place
()))
{
for
(
auto
&
op
:
ops_
)
{
continue
;
int
op_role
=
op
->
Attr
<
int
>
(
std
::
string
(
"op_role"
));
if
(
op_role
==
static_cast
<
int
>
(
OpRole
::
kBackward
)
||
op_role
==
(
static_cast
<
int
>
(
OpRole
::
kBackward
)
|
static_cast
<
int
>
(
OpRole
::
kLoss
)))
{
VLOG
(
3
)
<<
"running an op "
<<
op
->
Type
()
<<
" for "
<<
thread_id_
<<
" for scope "
<<
i
;
timeline
.
Start
();
op
->
Run
(
*
microbatch_scopes_
[
i
],
place_
);
if
(
gc
)
{
DeleteUnusedTensors
(
*
microbatch_scopes_
[
i
],
op
.
get
(),
unused_vars_
,
gc
.
get
());
}
timeline
.
Pause
();
auto
time
=
timeline
.
ElapsedUS
();
op_total_time
[
op_idx
]
+=
time
;
if
(
time
>
op_max_time
[
op_idx
])
{
op_max_time
[
op_idx
]
=
time
;
}
if
(
time
<
op_min_time
[
op_idx
])
{
op_min_time
[
op_idx
]
=
time
;
}
op_count
[
op_idx
]
+=
1
;
op_total_time
[
op_idx
]
+=
time
;
}
op_idx
++
;
}
}
LoDTensor
*
gpu_tensor
=
exe_scope
->
Var
(
name
)
->
GetMutable
<
LoDTensor
>
();
gpu_tensor
->
set_lod
(
src_tensor
.
lod
());
TensorCopy
(
*
static_cast
<
const
Tensor
*>
(
&
src_tensor
),
place_
,
*
dev_ctx_
,
static_cast
<
Tensor
*>
(
gpu_tensor
));
}
}
trans_timer
.
Pause
();
// update pass
}
int
op_idx
=
0
;
for
(
auto
&
op
:
ops_
)
{
SEC_LOG
<<
"begin running ops"
;
int
op_role
=
op
->
Attr
<
int
>
(
std
::
string
(
"op_role"
));
cal_timer
.
Resume
();
if
(
op_role
==
static_cast
<
int
>
(
OpRole
::
kOptimize
))
{
int
op_id
=
0
;
VLOG
(
3
)
<<
"running an op "
<<
op
->
Type
()
<<
" for "
<<
thread_id_
dev_ctx_
->
Wait
();
<<
" for minibatch scope"
;
for
(
auto
&
op
:
ops_
)
{
timeline
.
Start
();
timeline
.
Start
();
op
->
Run
(
*
microbatch_scopes_
[
0
],
place_
);
op
->
Run
(
*
exe_scope
,
place_
);
if
(
gc
)
{
DeleteUnusedTensors
(
*
microbatch_scopes_
[
num_microbatches_
-
1
],
op
.
get
(),
unused_vars_
,
gc
.
get
());
}
timeline
.
Pause
();
auto
time
=
timeline
.
ElapsedUS
();
op_total_time
[
op_idx
]
+=
time
;
if
(
time
>
op_max_time
[
op_idx
])
{
op_max_time
[
op_idx
]
=
time
;
}
if
(
time
<
op_min_time
[
op_idx
])
{
op_min_time
[
op_idx
]
=
time
;
}
op_count
[
op_idx
]
+=
1
;
op_total_time
[
op_idx
]
+=
time
;
}
op_idx
++
;
}
dev_ctx_
->
Wait
();
dev_ctx_
->
Wait
();
timeline
.
Pause
();
batch_timer
.
Pause
();
op_total_time
[
op_id
++
]
+=
timeline
.
ElapsedUS
();
VLOG
(
0
)
<<
"batch time: "
<<
batch_timer
.
ElapsedUS
();
}
}
exe_scope
->
DropKids
();
}
else
{
// Wait for GPU calc finising, as the cudaMemcpy and GPU calc may be in
while
(
true
)
{
// different streams
{
// No effect when it is a CPUDeviceContext
PADDLE_ENFORCE_LE
(
dev_ctx_
->
Wait
();
local_batch_id_
,
batch_id_
,
cal_timer
.
Pause
();
platform
::
errors
::
InvalidArgument
(
#ifdef PADDLE_WITH_BOX_PS
"local_batch_id_ (%d) must be less than or equal to "
auto
box_ptr
=
BoxWrapper
::
GetInstance
();
"batch_id_ (%d)"
,
auto
&
metric_list
=
box_ptr
->
GetMetricList
();
local_batch_id_
,
batch_id_
));
for
(
auto
iter
=
metric_list
.
begin
();
iter
!=
metric_list
.
end
();
iter
++
)
{
std
::
unique_lock
<
std
::
mutex
>
lk
(
thread_mutex
);
auto
*
metric_msg
=
iter
->
second
;
if
(
local_batch_id_
==
batch_id_
&&
!
threads_completed
)
{
if
(
box_ptr
->
Phase
()
!=
metric_msg
->
MetricPhase
())
{
thread_condition
.
wait
(
lk
);
continue
;
}
VLOG
(
3
)
<<
"thread "
<<
thread_id_
<<
" local_batch_id_ "
<<
local_batch_id_
<<
" batch_id_ "
<<
batch_id_
;
if
(
threads_completed
)
{
VLOG
(
3
)
<<
"thread "
<<
thread_id_
<<
" completed."
;
lk
.
unlock
();
VLOG
(
0
)
<<
"============timeline============"
;
for
(
size_t
i
=
0
;
i
<
ops_
.
size
();
++
i
)
{
VLOG
(
0
)
<<
"op: "
<<
op_name
[
i
]
<<
", max_time: "
<<
op_max_time
[
i
]
<<
", min_time: "
<<
op_min_time
[
i
]
<<
", mean_time: "
<<
op_total_time
[
i
]
/
op_count
[
i
];
}
VLOG
(
0
)
<<
"================================"
;
threads_completed
=
false
;
return
;
}
lk
.
unlock
();
local_batch_id_
+=
1
;
}
}
metric_msg
->
add_data
(
exe_scope
);
// forward pass:
}
for
(
int
i
=
0
;
i
<
num_microbatches_
;
++
i
)
{
#endif
int
op_idx
=
0
;
if
(
need_dump_field_
)
{
for
(
auto
&
op
:
ops_
)
{
DumpField
(
*
scope
,
dump_mode_
,
dump_interval_
);
int
op_role
=
op
->
Attr
<
int
>
(
std
::
string
(
"op_role"
));
}
// We run op with op_role = kLRSched only for the first microbatch
if
(
need_dump_param_
&&
pipeline_id_
==
0
)
{
// to avoid increasing the @LR_DECAY_STEP@ multiple times.
DumpParam
(
*
scope
,
step_cnt
);
bool
run_first_mbatch
=
}
op_role
==
static_cast
<
int
>
(
OpRole
::
kForward
)
||
op_role
==
(
static_cast
<
int
>
(
OpRole
::
kForward
)
|
if
(
section_id_
!=
section_num_
-
1
&&
platform
::
is_gpu_place
(
place_
))
{
static_cast
<
int
>
(
OpRole
::
kLoss
))
||
// FIXME: Temporarily we assume two adjacent sections are in different
op_role
==
static_cast
<
int
>
(
OpRole
::
kLRSched
);
// places,
bool
run_others
=
op_role
==
static_cast
<
int
>
(
OpRole
::
kForward
)
||
// and we do data transformation only in sections in GPU place, so the
op_role
==
(
static_cast
<
int
>
(
OpRole
::
kForward
)
|
// data is
static_cast
<
int
>
(
OpRole
::
kLoss
));
// transform from GPU to CPU
if
((
i
==
0
&&
run_first_mbatch
)
||
(
i
!=
0
&&
run_others
))
{
// A better way to handle such a data transformation is to record each
VLOG
(
3
)
<<
"running an op "
<<
op
->
Type
()
<<
" for "
<<
thread_id_
// place of
<<
" for scope "
<<
i
;
// joint-out variables, and do transform as required
timeline
.
Start
();
op
->
Run
(
*
microbatch_scopes_
[
i
],
place_
);
SEC_LOG
<<
"GPU2CPU memory copy"
;
if
(
gc
)
{
trans_timer
.
Resume
();
DeleteUnusedTensors
(
*
microbatch_scopes_
[
i
],
op
.
get
(),
for
(
const
std
::
string
&
name
:
*
out_var_names_
)
{
unused_vars_
,
gc
.
get
());
const
LoDTensor
&
src_tensor
=
}
exe_scope
->
FindVar
(
name
)
->
Get
<
LoDTensor
>
();
timeline
.
Pause
();
LoDTensor
*
dst_tensor
=
scope
->
Var
(
name
)
->
GetMutable
<
LoDTensor
>
();
auto
time
=
timeline
.
ElapsedUS
();
dst_tensor
->
set_lod
(
src_tensor
.
lod
());
op_total_time
[
op_idx
]
+=
time
;
TensorCopy
(
*
static_cast
<
const
Tensor
*>
(
&
src_tensor
),
if
(
time
>
op_max_time
[
op_idx
])
{
next_section_place_
,
*
dev_ctx_
,
op_max_time
[
op_idx
]
=
time
;
static_cast
<
Tensor
*>
(
dst_tensor
));
}
if
(
time
<
op_min_time
[
op_idx
])
{
op_min_time
[
op_idx
]
=
time
;
}
op_count
[
op_idx
]
+=
1
;
op_total_time
[
op_idx
]
+=
time
;
}
op_idx
++
;
}
}
}
trans_timer
.
Pause
();
// backward pass
}
for
(
int
i
=
0
;
i
<
num_microbatches_
;
++
i
)
{
int
op_idx
=
0
;
out_scope_queue_
->
Send
(
scope
);
for
(
auto
&
op
:
ops_
)
{
int
op_role
=
op
->
Attr
<
int
>
(
std
::
string
(
"op_role"
));
if
(
sync_func_
)
{
if
(
op_role
==
static_cast
<
int
>
(
OpRole
::
kBackward
)
||
sync_timer
.
Resume
();
op_role
==
(
static_cast
<
int
>
(
OpRole
::
kBackward
)
|
(
*
sync_func_
)(
scope
);
static_cast
<
int
>
(
OpRole
::
kLoss
)))
{
sync_timer
.
Pause
();
VLOG
(
3
)
<<
"running an op "
<<
op
->
Type
()
<<
" for "
<<
thread_id_
}
<<
" for scope "
<<
i
;
timeline
.
Start
();
++
step_cnt
;
op
->
Run
(
*
microbatch_scopes_
[
i
],
place_
);
accum_num
+=
batch_size
;
if
(
gc
)
{
main_timer
.
Pause
();
DeleteUnusedTensors
(
*
microbatch_scopes_
[
i
],
op
.
get
(),
}
unused_vars_
,
gc
.
get
());
if
(
need_dump_field_
||
need_dump_param_
)
{
}
writer_
.
Flush
();
timeline
.
Pause
();
}
auto
time
=
timeline
.
ElapsedUS
();
outer_timer
.
Pause
();
op_total_time
[
op_idx
]
+=
time
;
if
(
time
>
op_max_time
[
op_idx
])
{
worker_count_mutex_
->
lock
();
op_max_time
[
op_idx
]
=
time
;
--
(
*
worker_count_
);
}
worker_count_mutex_
->
unlock
();
if
(
time
<
op_min_time
[
op_idx
])
{
op_min_time
[
op_idx
]
=
time
;
if
(
*
worker_count_
<=
0
)
{
}
while
(
section_id_
<
section_num_
-
1
&&
out_scope_queue_
->
Size
())
{
op_count
[
op_idx
]
+=
1
;
sleep
(
1
);
op_total_time
[
op_idx
]
+=
time
;
}
op_idx
++
;
}
}
// update pass
int
op_idx
=
0
;
for
(
auto
&
op
:
ops_
)
{
int
op_role
=
op
->
Attr
<
int
>
(
std
::
string
(
"op_role"
));
if
(
op_role
==
static_cast
<
int
>
(
OpRole
::
kOptimize
))
{
VLOG
(
3
)
<<
"running an op "
<<
op
->
Type
()
<<
" for "
<<
thread_id_
<<
" for minibatch scope"
;
timeline
.
Start
();
op
->
Run
(
*
microbatch_scopes_
[
0
],
place_
);
if
(
gc
)
{
DeleteUnusedTensors
(
*
microbatch_scopes_
[
num_microbatches_
-
1
],
op
.
get
(),
unused_vars_
,
gc
.
get
());
}
timeline
.
Pause
();
auto
time
=
timeline
.
ElapsedUS
();
op_total_time
[
op_idx
]
+=
time
;
if
(
time
>
op_max_time
[
op_idx
])
{
op_max_time
[
op_idx
]
=
time
;
}
if
(
time
<
op_min_time
[
op_idx
])
{
op_min_time
[
op_idx
]
=
time
;
}
op_count
[
op_idx
]
+=
1
;
op_total_time
[
op_idx
]
+=
time
;
}
op_idx
++
;
}
dev_ctx_
->
Wait
();
}
}
out_scope_queue_
->
Close
();
}
LOG
(
ERROR
)
<<
"log_for_profile"
<<
" card:"
<<
pipeline_id_
<<
" thread:"
<<
thread_id_
<<
" section:"
<<
section_id_
<<
" step_count:"
<<
step_cnt
<<
" batch_count:"
<<
accum_num
<<
" read_time:"
<<
reader_timer
.
ElapsedUS
()
<<
" trans_time:"
<<
trans_timer
.
ElapsedUS
()
<<
" cal_time:"
<<
cal_timer
.
ElapsedUS
()
<<
" sync_time:"
<<
sync_timer
.
ElapsedUS
()
<<
" main_time:"
<<
main_timer
.
ElapsedUS
()
<<
" outer_time:"
<<
outer_timer
.
ElapsedUS
();
for
(
size_t
i
=
0
;
i
<
ops_
.
size
();
++
i
)
{
LOG
(
ERROR
)
<<
"op: "
<<
op_name
[
i
]
<<
", mean time: "
<<
op_total_time
[
i
]
/
accum_num
;
}
}
}
}
}
// namespace framework
}
// namespace framework
...
...
paddle/fluid/framework/trainer.h
浏览文件 @
e39aa70e
...
@@ -137,49 +137,31 @@ class PipelineTrainer : public TrainerBase {
...
@@ -137,49 +137,31 @@ class PipelineTrainer : public TrainerBase {
virtual
Scope
*
GetWorkerScope
(
int
thread_id
);
virtual
Scope
*
GetWorkerScope
(
int
thread_id
);
void
InitDumpEnv
()
override
;
void
InitDumpEnv
()
override
;
virtual
std
::
string
GetDumpPath
(
int
tid
);
virtual
std
::
string
GetDumpPath
(
int
tid
);
void
GetSkipVars
(
int
section_id
,
const
ProgramDesc
&
main_program
);
protected:
protected:
int
section_num_
;
int
section_num_
;
int
pipeline_num_
;
int
num_microbatches_
;
int
scope_queue_size_
;
int
start_cpu_core_id_
;
int
sync_steps_
;
std
::
vector
<
std
::
string
>
feed_var_names_
;
std
::
vector
<
platform
::
Place
>
places_
;
std
::
vector
<
std
::
vector
<
std
::
string
>>
skip_vars_
;
TrainerDesc
trainer_desc_
;
SectionWorkerParameter
pipeline_config_
;
// The in/output var names for each section
std
::
vector
<
std
::
unique_ptr
<
std
::
vector
<
std
::
string
>>>
in_var_names_
;
std
::
vector
<
std
::
unique_ptr
<
std
::
vector
<
std
::
string
>>>
out_var_names_
;
// Counter for the running thread
std
::
vector
<
std
::
vector
<
int
*>>
worker_count_
;
std
::
vector
<
std
::
vector
<
std
::
unique_ptr
<
std
::
mutex
>>>
worker_count_mutex_
;
// worker: [section_id][pipeline_id][thread_id]
std
::
vector
<
std
::
vector
<
std
::
vector
<
std
::
shared_ptr
<
paddle
::
framework
::
DeviceWorker
>>>>
workers_
;
std
::
vector
<
std
::
thread
>
section_threads_
;
std
::
vector
<
std
::
thread
>
section_threads_
;
// worker: [section_id]
// We use scope to maintain context info, and scopes
std
::
vector
<
std
::
shared_ptr
<
paddle
::
framework
::
DeviceWorker
>>
workers_
;
// will be deliverd between different sections.
// minibatch_scopes_: [section_id]
std
::
vector
<
std
::
vector
<
std
::
unique_ptr
<
ScopeQueue
>>>
scope_queues_
;
std
::
vector
<
Scope
*>
minibatch_scopes_
;
std
::
vector
<
Scope
*>
pipeline_scopes_
;
// microbatch_scopes_: [section_id][microbatch_id]
std
::
vector
<
std
::
vector
<
Scope
*>>
microbatch_scopes_
;
// The parameters that should be syncronized between different cards using
// nccl all-reduce
void
CopyParameters
(
int
section_id
,
int
microbatch_id
,
std
::
shared_ptr
<
std
::
vector
<
std
::
string
>>
param_need_sync_
;
const
ProgramDesc
&
program
,
const
platform
::
Place
&
place
);
std
::
vector
<
std
::
string
>
persistable_vars_
;
bool
isPersistableVarGrad
(
std
::
string
name
);
std
::
vector
<
std
::
unique_ptr
<
SyncFunctor
>>
sync_functors_
;
bool
isPersistable
(
VarDesc
*
var
);
std
::
shared_ptr
<
platform
::
NCCLContextMap
>
nccl_ctx_map_
;
std
::
vector
<
DataFeed
*>
readers_
;
void
InitFirstScopeQueue
(
ScopeQueue
*
scope_queue
,
int
pipeline_id
,
const
ProgramDesc
&
main_program
,
const
Scope
&
root_scope
);
void
CopyParameters
(
const
Scope
&
root_scope
,
int
pipeline_id
);
void
construct_sync_functor
();
};
};
#endif
#endif
}
// namespace framework
}
// namespace framework
}
// namespace paddle
}
// namespace paddle
paddle/fluid/framework/trainer_desc.proto
浏览文件 @
e39aa70e
...
@@ -83,6 +83,7 @@ message SectionWorkerParameter {
...
@@ -83,6 +83,7 @@ message SectionWorkerParameter {
optional
int64
sync_steps
=
3
[
default
=
1
];
optional
int64
sync_steps
=
3
[
default
=
1
];
optional
int32
start_cpu_core_id
=
4
[
default
=
1
];
optional
int32
start_cpu_core_id
=
4
[
default
=
1
];
repeated
string
param_need_sync
=
5
;
repeated
string
param_need_sync
=
5
;
optional
int32
num_microbatches
=
6
;
}
}
message
SectionConfig
{
message
SectionConfig
{
...
@@ -99,6 +100,7 @@ message SectionConfig {
...
@@ -99,6 +100,7 @@ message SectionConfig {
optional
int32
concurrency
=
3
[
default
=
1
];
optional
int32
concurrency
=
3
[
default
=
1
];
repeated
string
section_in_var_names
=
4
;
repeated
string
section_in_var_names
=
4
;
repeated
string
section_out_var_names
=
5
;
repeated
string
section_out_var_names
=
5
;
optional
int32
place_id
=
6
[
default
=
-
1
];
}
}
message
FetchConfig
{
message
FetchConfig
{
...
...
python/paddle/fluid/device_worker.py
浏览文件 @
e39aa70e
...
@@ -403,11 +403,8 @@ class Section(DeviceWorker):
...
@@ -403,11 +403,8 @@ class Section(DeviceWorker):
trainer_desc
.
device_worker_name
=
"SectionWorker"
trainer_desc
.
device_worker_name
=
"SectionWorker"
pipeline_opt
=
self
.
_program
.
_pipeline_opt
pipeline_opt
=
self
.
_program
.
_pipeline_opt
section_param
=
trainer_desc
.
section_param
section_param
=
trainer_desc
.
section_param
section_param
.
queue_size
=
pipeline_opt
[
"queue_size"
]
section_param
.
num_microbatches
=
pipeline_opt
[
"num_microbatches"
]
section_param
.
sync_steps
=
pipeline_opt
[
"sync_steps"
]
section_param
.
start_cpu_core_id
=
pipeline_opt
[
"start_cpu_core_id"
]
section_param
.
start_cpu_core_id
=
pipeline_opt
[
"start_cpu_core_id"
]
for
e
in
pipeline_opt
[
"param_need_sync"
]:
section_param
.
param_need_sync
.
append
(
e
)
for
i
,
program
in
enumerate
(
pipeline_opt
[
"section_program_list"
]):
for
i
,
program
in
enumerate
(
pipeline_opt
[
"section_program_list"
]):
cfg
=
section_param
.
section_config
.
add
()
cfg
=
section_param
.
section_config
.
add
()
cfg
.
program_desc
.
ParseFromString
(
program
[
"program"
].
_get_desc
()
cfg
.
program_desc
.
ParseFromString
(
program
[
"program"
].
_get_desc
()
...
@@ -415,6 +412,7 @@ class Section(DeviceWorker):
...
@@ -415,6 +412,7 @@ class Section(DeviceWorker):
# TODO: why does not work
# TODO: why does not work
# cfg.program_desc.CopyFrom(program.program._get_desc())
# cfg.program_desc.CopyFrom(program.program._get_desc())
place
=
pipeline_opt
[
"place_list"
][
i
]
place
=
pipeline_opt
[
"place_list"
][
i
]
place_id
=
pipeline_opt
[
"place_id_list"
][
i
]
if
isinstance
(
place
,
core
.
CPUPlace
):
if
isinstance
(
place
,
core
.
CPUPlace
):
cfg
.
place
=
cfg
.
CPUPlace
cfg
.
place
=
cfg
.
CPUPlace
elif
isinstance
(
place
,
core
.
CUDAPlace
):
elif
isinstance
(
place
,
core
.
CUDAPlace
):
...
@@ -425,12 +423,7 @@ class Section(DeviceWorker):
...
@@ -425,12 +423,7 @@ class Section(DeviceWorker):
raise
NotImplementedError
(
raise
NotImplementedError
(
"SectionWorker only supports CPUPlace, CUDAPlace and CUDAPinnedPlace now."
"SectionWorker only supports CPUPlace, CUDAPlace and CUDAPinnedPlace now."
)
)
cfg
.
place_id
=
place_id
cfg
.
concurrency
=
pipeline_opt
[
"concurrency_list"
][
i
]
for
var
in
program
[
"input_set"
]:
cfg
.
section_in_var_names
.
append
(
var
)
for
var
in
program
[
"output_set"
]:
cfg
.
section_out_var_names
.
append
(
var
)
class
DeviceWorkerFactory
(
object
):
class
DeviceWorkerFactory
(
object
):
...
...
python/paddle/fluid/optimizer.py
浏览文件 @
e39aa70e
...
@@ -4474,7 +4474,7 @@ class PipelineOptimizer(object):
...
@@ -4474,7 +4474,7 @@ class PipelineOptimizer(object):
"place_list"
:
place_list
,
"place_list"
:
place_list
,
"place_id_list"
:
place_id_list
,
"place_id_list"
:
place_id_list
,
"sync_steps"
:
-
1
,
"sync_steps"
:
-
1
,
"
queue_size
"
:
self
.
_num_microbatches
,
"
num_microbatches
"
:
self
.
_num_microbatches
,
"start_cpu_core_id"
:
self
.
_start_cpu_core_id
,
"start_cpu_core_id"
:
self
.
_start_cpu_core_id
,
}
}
return
optimize_ops
,
params_grads
,
program_list
return
optimize_ops
,
params_grads
,
program_list
...
...
python/paddle/fluid/tests/unittests/test_pipeline.py
浏览文件 @
e39aa70e
...
@@ -100,7 +100,7 @@ def build_network(input, layers=50, class_dim=1000):
...
@@ -100,7 +100,7 @@ def build_network(input, layers=50, class_dim=1000):
pool_type
=
'max'
)
pool_type
=
'max'
)
if
layers
>=
50
:
if
layers
>=
50
:
for
block
in
range
(
len
(
depth
)):
for
block
in
range
(
len
(
depth
)):
with
fluid
.
device_guard
(
"
cpu
"
):
with
fluid
.
device_guard
(
"
gpu:0
"
):
for
i
in
range
(
depth
[
block
]):
for
i
in
range
(
depth
[
block
]):
conv
=
bottleneck_block
(
conv
=
bottleneck_block
(
input
=
conv
,
input
=
conv
,
...
@@ -118,7 +118,7 @@ def build_network(input, layers=50, class_dim=1000):
...
@@ -118,7 +118,7 @@ def build_network(input, layers=50, class_dim=1000):
initializer
=
fluid
.
initializer
.
Uniform
(
-
stdv
,
stdv
)))
initializer
=
fluid
.
initializer
.
Uniform
(
-
stdv
,
stdv
)))
else
:
else
:
for
block
in
range
(
len
(
depth
)):
for
block
in
range
(
len
(
depth
)):
with
fluid
.
device_guard
(
"
cpu
"
):
with
fluid
.
device_guard
(
"
gpu:0
"
):
for
i
in
range
(
depth
[
block
]):
for
i
in
range
(
depth
[
block
]):
conv
=
basic_block
(
conv
=
basic_block
(
input
=
conv
,
input
=
conv
,
...
@@ -140,38 +140,68 @@ def build_network(input, layers=50, class_dim=1000):
...
@@ -140,38 +140,68 @@ def build_network(input, layers=50, class_dim=1000):
class
TestPipeline
(
unittest
.
TestCase
):
class
TestPipeline
(
unittest
.
TestCase
):
""" TestCases for Pipeline Training. """
""" TestCases for Pipeline Training. """
def
_run
(
self
,
debug
):
main_prog
=
fluid
.
Program
()
startup_prog
=
fluid
.
Program
()
with
fluid
.
program_guard
(
main_prog
,
startup_prog
):
with
fluid
.
device_guard
(
"cpu"
):
image
=
fluid
.
layers
.
data
(
name
=
"image"
,
shape
=
[
3
,
224
,
224
],
dtype
=
"float32"
)
label
=
fluid
.
layers
.
data
(
name
=
"label"
,
shape
=
[
1
],
dtype
=
"int64"
)
data_loader
=
fluid
.
io
.
DataLoader
.
from_generator
(
feed_list
=
[
image
,
label
],
capacity
=
64
,
use_double_buffer
=
True
,
iterable
=
False
)
fc
=
build_network
(
image
,
layers
=
50
)
with
fluid
.
device_guard
(
"gpu:0"
):
out
,
prob
=
fluid
.
layers
.
softmax_with_cross_entropy
(
logits
=
fc
,
label
=
label
,
return_softmax
=
True
)
loss
=
fluid
.
layers
.
mean
(
out
)
acc_top1
=
fluid
.
layers
.
accuracy
(
input
=
prob
,
label
=
label
,
k
=
1
)
acc_top5
=
fluid
.
layers
.
accuracy
(
input
=
prob
,
label
=
label
,
k
=
5
)
base_lr
=
0.1
passes
=
[
30
,
60
,
80
,
90
]
total_images
=
1281167
steps_per_pass
=
total_images
//
128
bd
=
[
steps_per_pass
*
p
for
p
in
passes
]
lr
=
[
base_lr
*
(
0.1
**
i
)
for
i
in
range
(
len
(
bd
)
+
1
)]
lr_val
=
fluid
.
layers
.
piecewise_decay
(
boundaries
=
bd
,
values
=
lr
)
optimizer
=
fluid
.
optimizer
.
MomentumOptimizer
(
lr_val
,
momentum
=
0.9
,
regularization
=
fluid
.
regularizer
.
L2Decay
(
1e-4
))
optimizer
=
fluid
.
optimizer
.
PipelineOptimizer
(
optimizer
,
num_microbatches
=
2
)
optimizer
.
minimize
(
loss
)
def
train_reader
():
for
_
in
range
(
4
):
img
=
np
.
random
.
random
(
size
=
[
3
,
224
,
224
]).
astype
(
'float32'
)
label
=
np
.
random
.
random
(
size
=
[
1
]).
astype
(
'int64'
)
yield
img
,
label
data_loader
.
set_sample_generator
(
train_reader
,
batch_size
=
1
)
place
=
fluid
.
CPUPlace
()
# The following dataset is only used for the
# interface 'train_from_dataset'.
# And it has no actual meaning.
dataset
=
fluid
.
DatasetFactory
().
create_dataset
(
'FileInstantDataset'
)
dataset
.
set_batch_size
(
1
)
dataset
.
set_thread
(
1
)
dataset
.
set_filelist
([
'/tmp/tmp_2.txt'
])
dataset
.
set_use_var
([
image
,
label
])
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
startup_prog
)
data_loader
.
start
()
exe
.
train_from_dataset
(
main_prog
,
dataset
,
debug
=
debug
)
def
test_pipeline
(
self
):
def
test_pipeline
(
self
):
with
fluid
.
device_guard
(
"cpu"
):
self
.
_run
(
False
)
image
=
fluid
.
layers
.
data
(
self
.
_run
(
True
)
name
=
"image"
,
shape
=
[
3
,
224
,
224
],
dtype
=
"float32"
)
label
=
fluid
.
layers
.
data
(
name
=
"label"
,
shape
=
[
1
],
dtype
=
"int64"
)
data_loader
=
fluid
.
io
.
DataLoader
.
from_generator
(
feed_list
=
[
image
,
label
],
capacity
=
64
,
use_double_buffer
=
True
,
iterable
=
False
)
fc
=
build_network
(
image
,
layers
=
50
)
with
fluid
.
device_guard
(
"gpu:0"
):
out
,
prob
=
fluid
.
layers
.
softmax_with_cross_entropy
(
logits
=
fc
,
label
=
label
,
return_softmax
=
True
)
loss
=
fluid
.
layers
.
mean
(
out
)
acc_top1
=
fluid
.
layers
.
accuracy
(
input
=
prob
,
label
=
label
,
k
=
1
)
acc_top5
=
fluid
.
layers
.
accuracy
(
input
=
prob
,
label
=
label
,
k
=
5
)
base_lr
=
0.1
passes
=
[
30
,
60
,
80
,
90
]
total_images
=
1281167
steps_per_pass
=
total_images
//
128
bd
=
[
steps_per_pass
*
p
for
p
in
passes
]
lr
=
[
base_lr
*
(
0.1
**
i
)
for
i
in
range
(
len
(
bd
)
+
1
)]
lr_val
=
fluid
.
layers
.
piecewise_decay
(
boundaries
=
bd
,
values
=
lr
)
optimizer
=
fluid
.
optimizer
.
Momentum
(
lr_val
,
momentum
=
0.9
,
regularization
=
fluid
.
regularizer
.
L2Decay
(
1e-4
))
optimizer
=
fluid
.
optimizer
.
PipelineOptimizer
(
optimizer
,
num_microbatches
=
2
)
optimizer
.
minimize
(
loss
)
def
test_pipeline_noneoptimizer
(
self
):
def
test_pipeline_noneoptimizer
(
self
):
with
fluid
.
device_guard
(
"gpu:0"
):
with
fluid
.
device_guard
(
"gpu:0"
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录