PaddlePaddle / Paddle

Commit f77a78cd (unverified signature)
Authored Nov 23, 2020 by lilong12; committed via GitHub on Nov 23, 2020.
Parent: 9f642ed8

enable pipeline to run with Executor.run() (#28373)

* update, test=develop
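What the commit enables, in user terms: a pipeline-parallel program can now be driven through the ordinary Executor.run() entry point instead of an explicit train_from_dataset() call. The sketch below is illustrative only, modeled on the new pipeline_mnist.py and run_pipeline_trainer tests further down; the two-layer network, reader choice, and batch size are assumptions, and an actual run needs two visible GPUs plus the fleet collective launch environment.

import os

import paddle
import paddle.fluid as fluid
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=True)

# Split the network into two pipeline stages with device_guard,
# as pipeline_mnist.py does below.
with fluid.device_guard("gpu:0"):
    image = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    data_loader = fluid.io.DataLoader.from_generator(
        feed_list=[image, label], capacity=64,
        use_double_buffer=False, iterable=False)
    hidden = fluid.layers.fc(input=image, size=64, act='tanh')
with fluid.device_guard("gpu:1"):
    predict = fluid.layers.fc(input=hidden, size=10, act='softmax')
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(cost)

strategy = fleet.DistributedStrategy()
strategy.pipeline = True
opt = fleet.distributed_optimizer(
    fluid.optimizer.SGD(learning_rate=0.1), strategy=strategy)
opt.minimize(avg_cost)

place = fluid.CUDAPlace(int(os.getenv("FLAGS_selected_gpus", "0")))
exe = fluid.Executor(place)

# Both calls now go through Executor.run(): the first is redirected to the
# pipeline-specific startup program, and the second falls through to
# train_from_dataset() internally (see the executor.py diff below).
exe.run(fluid.default_startup_program())
data_loader.set_sample_list_generator(
    paddle.batch(paddle.dataset.mnist.test(), batch_size=4), place)
data_loader.start()
loss = exe.run(fluid.default_main_program(), fetch_list=[avg_cost])

Nothing about pipeline scheduling itself changed at this entry point: run() simply recognizes the _pipeline_opt attribute on the program and reroutes.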
14 changed files with 780 additions and 1257 deletions (+780 −1257).
paddle/fluid/framework/device_worker.h  (+2 −12)
paddle/fluid/framework/pipeline_trainer.cc  (+62 −175)
paddle/fluid/framework/section_worker.cc  (+56 −502)
paddle/fluid/framework/trainer.h  (+10 −17)
paddle/fluid/framework/trainer_desc.proto  (+1 −1)
python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py  (+98 −36)
python/paddle/fluid/device_worker.py  (+11 −19)
python/paddle/fluid/executor.py  (+42 −12)
python/paddle/fluid/optimizer.py  (+214 −270)
python/paddle/fluid/tests/unittests/CMakeLists.txt  (+2 −2)
python/paddle/fluid/tests/unittests/pipeline_mnist.py  (+136 −0, new file)
python/paddle/fluid/tests/unittests/test_dist_base.py  (+119 −1)
python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py  (+4 −7)
python/paddle/fluid/tests/unittests/test_pipeline.py  (+23 −203)
paddle/fluid/framework/device_worker.h  (view file @ f77a78cd)

@@ -540,7 +540,7 @@ class HeterBoxWorker : public HogwildWorker {
 #if defined(PADDLE_WITH_NCCL)
 class SectionWorker : public DeviceWorker {
  public:
-  SectionWorker() { local_batch_id_ = 0; }
+  SectionWorker() {}
   ~SectionWorker() override {}

   void Initialize(const TrainerDesc& desc) override;
@@ -549,13 +549,12 @@ class SectionWorker : public DeviceWorker {
   void CreateDeviceResource(const ProgramDesc& main_prog) override{};

   void TrainFiles() override;
-  void TrainFilesWithProfiler() override;
+  void TrainFilesWithProfiler() override{};

   void PrintFetchVars() override {}

   const platform::Place& place() const { return place_; }

-  void SetSectionIndex(int section_id) { section_id_ = section_id; }
   void SetDeviceIndex(int tid) override {}
   void SetThreadIndex(int thread_id) { thread_id_ = thread_id; }
   void SetMicrobatchNum(int num) { num_microbatches_ = num; }
@@ -566,13 +565,8 @@ class SectionWorker : public DeviceWorker {
   void SetSkipVars(const std::vector<std::string>& skip_vars) {
     skip_vars_ = skip_vars;
   }
-  static void ResetBatchId() { batch_id_ = 0; }
-  static void ResetThreadCompletedFlag() { threads_completed = false; }

-  static std::atomic<int> cpu_id_;

  protected:
-  void AutoSetCPUAffinity(bool reuse);
-  int section_id_;
   int thread_id_;
   int num_microbatches_;
@@ -581,12 +575,8 @@ class SectionWorker : public DeviceWorker {
   const Scope* minibatch_scope_;

   std::vector<std::unique_ptr<OperatorBase>> ops_;
-  static std::mutex thread_mutex;
-  static std::condition_variable thread_condition;
-  static bool threads_completed;
   std::shared_ptr<framework::ProgramDesc> program_;
   static uint64_t batch_id_;
-  uint64_t local_batch_id_;

   platform::DeviceContext* dev_ctx_ = nullptr;
 };
paddle/fluid/framework/pipeline_trainer.cc  (view file @ f77a78cd)

@@ -13,6 +13,7 @@
 // limitations under the License.

 #if defined(PADDLE_WITH_NCCL)
+#include <map>
 #include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/framework/trainer.h"
@@ -26,83 +27,25 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
   const auto& section_params = trainer_desc.section_param();
   num_microbatches_ = section_params.num_microbatches();
   VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_;
-  section_num_ = section_params.section_config_size();
-  VLOG(3) << "Number of program sections: " << section_num_;
   trainer_desc_ = trainer_desc;
   start_cpu_core_id_ = section_params.start_cpu_core_id();

-  SetDataset(dataset);
   ParseDumpConfig(trainer_desc);
-  // get filelist from trainer_desc here
-  const std::vector<paddle::framework::DataFeed*> readers =
-      dataset->GetReaders();
-  VLOG(3) << "readers num: " << readers.size();
-  int num_readers = readers.size();
-  PADDLE_ENFORCE_EQ(num_readers, 1,
-                    platform::errors::InvalidArgument(
-                        "Number of dataset readers for pipeline "
-                        "must be 1 now, but the value you give is %d.",
-                        num_readers));
-  auto* reader = readers[0];
-  feed_var_names_ = reader->GetUseSlotAlias();
-
-  workers_.resize(section_num_);
-  for (int i = 0; i < section_num_; ++i) {
-    const auto& section_config = section_params.section_config(i);
-    platform::Place place;
-    int place_id = section_config.place_id();
-    switch (section_config.place()) {
-      case SectionConfig::CPUPlace:
-        place = platform::CPUPlace();
-        break;
-      case SectionConfig::CUDAPlace:
-        // Note that one section has at most one GPU place in one pipeline
-        PADDLE_ENFORCE_GE(
-            place_id, 0,
-            platform::errors::InvalidArgument(
-                "The place_id value for CUDAPlace shoud be greater "
-                "than or equal to 0, but the value you give is %d.",
-                place_id));
-        place = platform::CUDAPlace(place_id);
-        break;
-      case SectionConfig::CUDAPinnedPlace:
-        place = platform::CUDAPinnedPlace();
-        break;
-      default:
-        PADDLE_ENFORCE_NOT_NULL(nullptr,
-                                platform::errors::InvalidArgument(
-                                    "Unkown place type in SectionConfig: %d",
-                                    section_config.place()));
-    }
-    places_.emplace_back(place);
-    VLOG(3) << "Device worker place: " << place << ", device id: " << place_id
-            << ", section: " << i;
-
-    workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
-        trainer_desc.device_worker_name());
-    auto this_worker =
-        std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-            workers_[i]);
-    if (i == 0) {
-      // we only set reader for the first section
-      this_worker->SetDataFeed(reader);
-      this_worker->SetReaderPlace(place);
-    }
-    this_worker->SetThreadIndex(i);
-    this_worker->SetSectionIndex(i);
-    this_worker->SetPlace(place);
-    this_worker->Initialize(trainer_desc);
-    this_worker->SetMicrobatchNum(num_microbatches_);
-  }
-  // set debug here
+  const auto& section_config = section_params.section_config();
+  int place_id = section_config.place_id();
+  place_ = platform::CUDAPlace(place_id);
+  worker_ = DeviceWorkerFactory::CreateDeviceWorker(
+      trainer_desc.device_worker_name());
+  auto this_worker =
+      std::dynamic_pointer_cast<paddle::framework::SectionWorker>(worker_);
+  this_worker->SetPlace(place_);
+  this_worker->Initialize(trainer_desc);
+  this_worker->SetMicrobatchNum(num_microbatches_);
+
   SetDebug(trainer_desc.debug());
 }

 void PipelineTrainer::InitOtherEnv(const ProgramDesc& main_program) {
   if (need_dump_field_) {
     InitDumpEnv();
   }
   VLOG(3) << "init other env done.";
 }

 std::string PipelineTrainer::GetDumpPath(int tid) {
@@ -119,143 +62,87 @@ void PipelineTrainer::InitDumpEnv() {
   }
 }

-void PipelineTrainer::CopyParameters(int section_id, int microbatch_id,
+void PipelineTrainer::CopyParameters(int microbatch_id,
                                      const ProgramDesc& program,
                                      const platform::Place& place) {
   auto& global_block = program.Block(0);
+  std::map<std::string, int> param_map;
   for (auto& var : global_block.AllVars()) {
-    int is_feed_var =
-        std::count(feed_var_names_.begin(), feed_var_names_.end(), var->Name());
-    if ((var->Persistable() || is_feed_var) && microbatch_id == 0) {
-      if (is_feed_var) {
-        auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
-        VLOG(3) << "data name: " << var->Name() << ", ptr: " << new_ptr;
-        InitializeVariable(new_ptr, var->GetType());
-      } else {
-        auto* ptr = root_scope_->FindVar(var->Name());
-        auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
-        VLOG(3) << "Create persistable var " << var->Name()
-                << " for minibatch " << section_id << ", which pointer is "
-                << new_ptr;
-        InitializeVariable(new_ptr, var->GetType());
-        const LoDTensor& root_tensor = ptr->Get<LoDTensor>();
-        LoDTensor* minibatch_tensor = new_ptr->GetMutable<LoDTensor>();
-        TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
-                   static_cast<Tensor*>(minibatch_tensor));
-      }
-    } else if (!var->Persistable() && !is_feed_var) {
-      auto* ptr =
-          microbatch_scopes_[section_id][microbatch_id]->Var(var->Name());
-      VLOG(3) << "Create variable " << var->Name() << " for section "
-              << section_id << " microbatch " << microbatch_id
-              << ", which pointer is " << ptr;
-      InitializeVariable(ptr, var->GetType());
+    if (var->Persistable()) {
+      param_map[var->Name()] = 1;
     }
   }
-}
-
-void PipelineTrainer::GetSkipVars(int section_id, const ProgramDesc& program) {
-  auto& global_block = program.Block(0);
-  for (auto& op : global_block.AllOps()) {
-    if (op->Type() != "enqueue") {
-      continue;
+  for (auto& var : global_block.AllVars()) {
+    bool is_param_grad = false;
+    size_t pos = 0;
+    if ((pos = var->Name().find(kGradVarSuffix)) != std::string::npos) {
+      auto prefix_name = var->Name().substr(0, pos);
+      if (param_map.find(prefix_name) != param_map.end()) {
+        is_param_grad = true;
+      }
     }
-    auto input_arg_names = op->InputArgumentNames();
-    PADDLE_ENFORCE_EQ(input_arg_names.size(), 1,
-                      platform::errors::InvalidArgument(
-                          "Number of input arguments for enqueue op must be 1, "
-                          "but the value is %d.",
-                          input_arg_names.size()));
-    std::string input_arg_name = input_arg_names[0];
-    if (input_arg_name.rfind("@GRAD") != input_arg_name.size() - 5) {
-      skip_vars_[section_id].emplace_back(input_arg_name);
-      VLOG(3) << "add skip var name: " << input_arg_name;
+    if (var->Persistable() && microbatch_id == 0) {
+      auto* ptr = root_scope_->Var(var->Name());
+      InitializeVariable(ptr, var->GetType());
+      VLOG(3) << "Create persistable var: " << var->Name()
+              << ", which pointer is " << ptr;
+    } else if (is_param_grad && microbatch_id == 0) {
+      auto* ptr = minibatch_scope_->Var(var->Name());
+      InitializeVariable(ptr, var->GetType());
+      VLOG(3) << "Create grad for persistable var: " << var->Name()
+              << ", which pointer is " << ptr;
+    } else if (!var->Persistable() && !is_param_grad) {
+      auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name());
+      VLOG(3) << "Create variable " << var->Name() << " for microbatch "
+              << microbatch_id << ", which pointer is " << ptr;
+      InitializeVariable(ptr, var->GetType());
     }
   }
 }

 void PipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
                                      const platform::Place& place) {
-  PADDLE_ENFORCE_NOT_NULL(root_scope_,
-                          platform::errors::InvalidArgument(
-                              "root_scope pointer can not be nullptr"));
-  auto start_cpu_id = trainer_desc_.section_param().start_cpu_core_id();
-  SectionWorker::cpu_id_.store(start_cpu_id);
-  minibatch_scopes_.resize(section_num_);
-  microbatch_scopes_.resize(section_num_);
-  skip_vars_.resize(section_num_);
+  PADDLE_ENFORCE_NOT_NULL(root_scope_,
+                          platform::errors::InvalidArgument(
+                              "root_scope_ can not be nullptr"));
+  microbatch_scopes_.resize(num_microbatches_);

-  VLOG(3) << "Init ScopeQueues and create all scopes";
-  for (int i = 0; i < section_num_; ++i) {
-    minibatch_scopes_[i] = &root_scope_->NewScope();
-    std::shared_ptr<framework::ProgramDesc> program;
-    program.reset(new ProgramDesc(
-        trainer_desc_.section_param().section_config(i).program_desc()));
-    microbatch_scopes_[i].resize(num_microbatches_);
-    for (int j = 0; j < num_microbatches_; ++j) {
-      microbatch_scopes_[i][j] = &minibatch_scopes_[i]->NewScope();
-      CopyParameters(i, j, *program, places_[i]);
-    }
-    GetSkipVars(i, *program);
+  VLOG(3) << "Create minibatch and microbatch scopes...";
+  minibatch_scope_ = &root_scope_->NewScope();
+  std::shared_ptr<framework::ProgramDesc> program;
+  program.reset(new ProgramDesc(
+      trainer_desc_.section_param().section_config().program_desc()));
+  for (int j = 0; j < num_microbatches_; ++j) {
+    microbatch_scopes_[j] = &minibatch_scope_->NewScope();
+    CopyParameters(j, *program, place_);
   }

-  for (int i = 0; i < section_num_; ++i) {
-    auto this_worker =
-        std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-            workers_[i]);
-    this_worker->SetRootScope(root_scope_);
-    this_worker->SetMinibatchScope(minibatch_scopes_[i]);
-    this_worker->SetMicrobatchScopes(microbatch_scopes_[i]);
-    this_worker->SetSkipVars(skip_vars_[i]);
-  }
+  auto this_worker =
+      std::dynamic_pointer_cast<paddle::framework::SectionWorker>(worker_);
+  this_worker->SetRootScope(root_scope_);
+  this_worker->SetMinibatchScope(minibatch_scope_);
+  this_worker->SetMicrobatchScopes(microbatch_scopes_);
 }

 void PipelineTrainer::Run() {
-  VLOG(3) << "Going to run";
-  for (int i = 0; i < section_num_; ++i) {
-    if (!debug_) {
-      section_threads_.push_back(
-          std::thread(&DeviceWorker::TrainFiles, workers_[i].get()));
-    } else {
-      section_threads_.push_back(std::thread(
-          &DeviceWorker::TrainFilesWithProfiler, workers_[i].get()));
-    }
-  }
+  VLOG(5) << "Going to run PipelineTrainer::Run()";
+  section_thread_ = std::async(&DeviceWorker::TrainFiles, worker_.get());
 }

 void PipelineTrainer::Finalize() {
-  for (auto& th : section_threads_) {
-    th.join();
+  try {
+    section_thread_.get();
+  } catch (platform::EOFException& e) {
+    std::rethrow_exception(std::current_exception());
   }
   if (need_dump_field_) {
     FinalizeDumpEnv();
   }
-  VLOG(3) << "copying back parameters. ";
-  for (int i = 0; i < section_num_; ++i) {
-    std::shared_ptr<framework::ProgramDesc> program;
-    program.reset(new ProgramDesc(
-        trainer_desc_.section_param().section_config(i).program_desc()));
-    for (int j = 0; j < num_microbatches_; ++j) {
-      auto& global_block = program->Block(0);
-      for (auto& var : global_block.AllVars()) {
-        if (var->Persistable()) {
-          auto* ptr = root_scope_->FindVar(var->Name());
-          LoDTensor* root_tensor = ptr->GetMutable<LoDTensor>();
-          auto* minibatch_ptr = minibatch_scopes_[i]->Var(var->Name());
-          const LoDTensor& minibatch_tensor = minibatch_ptr->Get<LoDTensor>();
-          TensorCopy(*static_cast<const Tensor*>(&minibatch_tensor),
-                     places_[0], static_cast<Tensor*>(root_tensor));
-          VLOG(4) << "Copy persitable var " << var->Name() << " to root scope";
-        }
-      }
-    }
-  }
   root_scope_->DropKids();
-  SectionWorker::ResetBatchId();
-  SectionWorker::ResetThreadCompletedFlag();
 }

 Scope* PipelineTrainer::GetWorkerScope(int thread_id) {
-  return microbatch_scopes_[thread_id][0];
+  return microbatch_scopes_[0];
 }

 }  // end namespace framework
paddle/fluid/framework/section_worker.cc  (view file @ f77a78cd)

(This diff is collapsed on the original page.)
paddle/fluid/framework/trainer.h  (view file @ f77a78cd)

@@ -290,29 +290,22 @@ class PipelineTrainer : public TrainerBase {
   virtual Scope* GetWorkerScope(int thread_id);
   void InitDumpEnv() override;
   virtual std::string GetDumpPath(int tid);
-  void GetSkipVars(int section_id, const ProgramDesc& main_program);
+  void GetSkipVars(const ProgramDesc& main_program);

  protected:
-  int section_num_;
   int num_microbatches_;
   int start_cpu_core_id_;
-  std::vector<std::string> feed_var_names_;
-  std::vector<platform::Place> places_;
-  std::vector<std::vector<std::string>> skip_vars_;
+  platform::Place place_;
+  std::vector<std::string> skip_vars_;
   TrainerDesc trainer_desc_;

-  std::vector<std::thread> section_threads_;
-  // worker: [section_id]
-  std::vector<std::shared_ptr<paddle::framework::DeviceWorker>> workers_;
-  // minibatch_scopes_: [section_id]
-  std::vector<Scope*> minibatch_scopes_;
-  // microbatch_scopes_: [section_id][microbatch_id]
-  std::vector<std::vector<Scope*>> microbatch_scopes_;
+  std::future<void> section_thread_;
+  std::shared_ptr<paddle::framework::DeviceWorker> worker_;
+  Scope* minibatch_scope_;
+  // microbatch_scopes_: [microbatch_id]
+  std::vector<Scope*> microbatch_scopes_;

-  void CopyParameters(int section_id, int microbatch_id,
-                      const ProgramDesc& program, const platform::Place& place);
-  bool isPersistableVarGrad(std::string name);
-  bool isPersistable(VarDesc* var);
+  void CopyParameters(int microbatch_id, const ProgramDesc& program,
+                      const platform::Place& place);
 };
 #endif
paddle/fluid/framework/trainer_desc.proto  (view file @ f77a78cd)

@@ -86,7 +86,7 @@ message DownpourWorkerParameter {
 }

 message SectionWorkerParameter {
-  repeated SectionConfig section_config = 1;
+  optional SectionConfig section_config = 1;
   optional int32 queue_size = 2 [ default = 1 ];
   optional int64 sync_steps = 3 [ default = 1 ];
   optional int32 start_cpu_core_id = 4 [ default = 1 ];
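A note on the proto change: with section_config now optional rather than repeated, each trainer process carries exactly one program section. A small sketch of touching the field through the generated Python bindings follows; it assumes the generated module is importable as paddle.fluid.proto.trainer_desc_pb2 (the path the rest of fluid uses for its generated protos), and is illustration rather than code from the commit.

from paddle.fluid.proto import trainer_desc_pb2

desc = trainer_desc_pb2.TrainerDesc()
section_param = desc.section_param
# Singular (optional) message field: address the one section directly
# instead of calling section_config.add() on a repeated field.
cfg = section_param.section_config
cfg.place = cfg.CUDAPlace
cfg.place_id = 0
print(section_param)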
python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py  (view file @ f77a78cd)

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and

 from __future__ import print_function
+from __future__ import division

 import paddle.fluid as fluid
 from paddle.fluid import core, unique_name
@@ -21,9 +22,55 @@ from .meta_optimizer_base import MetaOptimizerBase
 from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, \
     is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op


-class PipelineHelper(CollectiveHelper):
-    def __init__(self, role_maker, nrings=1, wait_port='6174'):
-        super(PipelineHelper, self).__init__(role_maker, nrings, wait_port)
+def _get_node_num(endpoints):
+    ss = set()
+    for ep in endpoints:
+        ip = ep.split(":")[0].strip()
+        if ip not in ss:
+            ss.add(ip)
+    return len(ss)
+
+
+class PipelineHelper(object):
+    def __init__(self, role_maker, wait_port='6174'):
+        self.wait_port = wait_port
+        self.role_maker = role_maker
+
+    def update_startup_program(self,
+                               startup_program=None,
+                               inner_parallelism=None):
+        self.startup_program = startup_program
+
+        endpoints = self.role_maker._get_trainer_endpoints()
+        current_endpoint = endpoints[self.role_maker._worker_index()]
+        node_num = _get_node_num(endpoints)
+        assert len(endpoints) % node_num == 0
+        nranks = self.role_maker._worker_num()
+        rank = self.role_maker._worker_index()
+
+        # Create ring 0 for all gpus in a pipeline
+        pipeline_endpoints = []
+        pipeline_rank = rank % inner_parallelism
+        pipeline_id = rank // inner_parallelism
+        for idx, ep in enumerate(endpoints):
+            if idx // inner_parallelism == pipeline_id:
+                pipeline_endpoints.append(ep)
+        self._init_communicator(self.startup_program, current_endpoint,
+                                pipeline_endpoints, pipeline_rank, 0,
+                                self.wait_port)
+
+        pipeline_num = len(endpoints) // inner_parallelism
+        if pipeline_num == 1: return
+        # Create rings for gpus with the same gpu id
+        eps = []
+        local_rank = self.role_maker._worker_index() % inner_parallelism
+        ring_id = local_rank + 1
+        for i in range(pipeline_num):
+            eps.append(endpoints[i * inner_parallelism + local_rank])
+        temp_rank = self.role_maker._worker_index() // inner_parallelism
+        self._init_communicator(self.startup_program, current_endpoint, eps,
+                                temp_rank, ring_id, self.wait_port)
+        self._broadcast_params(ring_id)

     def _init_communicator(self, program, current_endpoint, endpoints, rank,
                            ring_id, wait_port):
@@ -46,9 +93,8 @@ class PipelineHelper(CollectiveHelper):
                 'rank': rank,
                 'endpoint': current_endpoint,
                 'other_endpoints': other_endpoints,
-                OP_ROLE_KEY: OpRole.Forward
+                OP_ROLE_KEY: OpRole.Forward,
             })
         block.append_op(
             type='c_comm_init',
             inputs={'X': nccl_id_var},
@@ -58,12 +104,10 @@ class PipelineHelper(CollectiveHelper):
                 'rank': rank,
                 'ring_id': ring_id,
                 OP_ROLE_KEY: OpRole.Forward,
-                'device_id': OpRole.Forward
             })

-    def _broadcast_params(self):
+    def _broadcast_params(self, ring_id):
         block = self.startup_program.global_block()
-        ring_id = 0
         for param in block.iter_parameters():
             if param.is_distributed:
                 continue
@@ -78,7 +122,6 @@ class PipelineHelper(CollectiveHelper):
                     OP_ROLE_KEY: OpRole.Forward
                 })

-        for ring_id in range(self.nrings):
         block.append_op(
             type='c_sync_comm_stream',
             inputs={'X': param},
@@ -99,8 +142,8 @@ class PipelineOptimizer(MetaOptimizerBase):
                         user_defined_strategy):
         super(PipelineOptimizer, self)._set_basic_info(
             loss, role_maker, user_defined_optimizer, user_defined_strategy)
-        num_microbatches = user_defined_strategy.pipeline_configs['micro_batch']
-        self.wrapped_opt = PO(self.inner_opt, num_microbatches=num_microbatches)
+        self.num_microbatches = user_defined_strategy.pipeline_configs[
+            'micro_batch']

     def _can_apply(self):
         if not self.role_maker._is_collective:
@@ -115,29 +158,46 @@ class PipelineOptimizer(MetaOptimizerBase):
         dist_strategy.pipeline_configs = {}

     def _enable_strategy(self, dist_strategy, context):
-        # we do not support enable pipeline automatically right now
-        return
+        dist_strategy.pipeline = True
+        dist_strategy.pipeline_configs = {"micro_batch": 1, }
+
+    def _get_local_rank(self, current_endpoint, endpoints):
+        cur_node_endpoints = []
+        cur_ip = current_endpoint.split(':')[0].strip()
+        for ep in endpoints:
+            if cur_ip == ep.split(':')[0].strip():
+                cur_node_endpoints.append(ep)
+        return cur_node_endpoints.index(current_endpoint)

     def minimize_impl(self,
                       loss,
                       startup_program=None,
                       parameter_list=None,
                       no_grad_set=None):
-        optimize_ops, params_grads, prog_list = \
-            self.wrapped_opt.minimize(loss, startup_program, parameter_list,
-                                      no_grad_set)
         if self.role_maker._worker_num() == 1:
             return optimize_ops, params_grads

         endpoints = self.role_maker._get_trainer_endpoints()
         current_endpoint = endpoints[self.role_maker._worker_index()]
+        self.local_rank = self._get_local_rank(current_endpoint, endpoints)
+        self.wrapped_opt = PO(self.inner_opt,
+                              num_microbatches=self.num_microbatches,
+                              start_cpu_core_id=self.local_rank)
+        node_num = _get_node_num(endpoints)
+        gpus_per_node = len(endpoints) // node_num
         self.startup_program = startup_program
-        self.local_rank = self._get_local_rank(current_endpoint, endpoints)
         if startup_program is None:
             self.startup_program = fluid.default_startup_program()

+        loss.block.program._pipeline_opt = dict()
+        loss.block.program._pipeline_opt['local_rank'] = self.local_rank
+        optimize_ops, params_grads, prog_list = \
+            self.wrapped_opt.minimize(loss, startup_program, parameter_list,
+                                      no_grad_set)
         assert prog_list
         self.main_program_list = prog_list
         self.main_program = loss.block.program
+        self.inner_parallelism = loss.block.program._pipeline_opt[
+            'inner_parallelism']
         nranks = len(endpoints)
         self.nranks = nranks
-        self.nrings = len(self.main_program_list)
@@ -146,24 +206,26 @@ class PipelineOptimizer(MetaOptimizerBase):
         self.endpoints = endpoints
         self.current_endpoint = current_endpoint

-        pipeline_helper = PipelineHelper(self.role_maker, nrings=self.nrings)
-        pipeline_helper.update_startup_program(self.startup_program)
+        pipeline_helper = PipelineHelper(self.role_maker)
+        pipeline_helper.update_startup_program(
+            self.startup_program._pipeline_opt["startup_program"],
+            self.inner_parallelism)

-        self._transpile_main_program()
+        self._transpile_main_program(loss, node_num, gpus_per_node)
         return optimize_ops, params_grads

-    def _transpile_main_program(self):
-        self._insert_loss_grad_ops()
-        for ring_id in range(self.nrings):
+    def _transpile_main_program(self, loss, node_num, gpus_per_node):
+        self._insert_loss_grad_ops(loss, gpus_per_node, node_num)
+        for ring_id in range(1, gpus_per_node + 1):
             self._insert_allreduce_ops(ring_id)

-    def _insert_loss_grad_ops(self):
+    def _insert_loss_grad_ops(self, loss, gpus_per_node, node_num):
         """
         In order to keep the learning rate consistent in different numbers of
         training workers, we scale the loss grad by the number of workers
         """
-        block = self.main_program_list[self.nrings - 1][
-            'program'].global_block()
+        block = self.main_program_list[gpus_per_node - 1][
+            'program'].global_block()
         for idx, op in reversed(list(enumerate(block.ops))):
             if is_loss_grad_op(op):
                 loss_grad_var = block.vars[op.output_arg_names[0]]
@@ -173,12 +235,12 @@ class PipelineOptimizer(MetaOptimizerBase):
                     inputs={'X': loss_grad_var},
                     outputs={'Out': loss_grad_var},
                     attrs={
-                        'scale': 1.0 / self.nranks,
+                        'scale': 1.0 / node_num,
                         OP_ROLE_KEY: OpRole.Backward
                     })

     def _insert_allreduce_ops(self, ring_id):
-        block = self.main_program_list[ring_id]['program'].global_block()
+        block = self.main_program_list[ring_id - 1]['program'].global_block()
         origin_block = self.main_program.global_block()
         grad = None
         for idx, op in reversed(list(enumerate(block.ops))):
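To make the ring layout above concrete, here is a worked example of the index arithmetic in update_startup_program() for two nodes with two GPUs each; the endpoint strings and counts are made up for illustration. Ring 0 links the stages of one pipeline, while rings 1..inner_parallelism link the same stage across pipelines and carry the gradient allreduce; per the hunk above, _insert_loss_grad_ops then scales the loss gradient by 1.0 / node_num to keep the effective learning rate constant as pipelines are added.

# Illustration only: 2 nodes x 2 GPUs, one pipeline per node.
inner_parallelism = 2
endpoints = ["node0:6170", "node0:6171", "node1:6170", "node1:6171"]

for rank, ep in enumerate(endpoints):
    pipeline_rank = rank % inner_parallelism  # stage index inside a pipeline
    pipeline_id = rank // inner_parallelism   # which pipeline this rank joins
    ring_id = pipeline_rank + 1               # cross-pipeline ring for this stage
    print(rank, ep, "stage", pipeline_rank, "pipeline", pipeline_id,
          "cross-pipeline ring", ring_id)

# Output: ranks 0-1 form pipeline 0, ranks 2-3 form pipeline 1 (each pair
# is connected by its own ring 0); ring 1 links {0, 2} (stage 0 of both
# pipelines) and ring 2 links {1, 3}, which is where gradients are averaged.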
python/paddle/fluid/device_worker.py  (view file @ f77a78cd)

@@ -413,24 +413,16 @@ class Section(DeviceWorker):
         section_param = trainer_desc.section_param
         section_param.num_microbatches = pipeline_opt["num_microbatches"]
         section_param.start_cpu_core_id = pipeline_opt["start_cpu_core_id"]
-        for i, program in enumerate(pipeline_opt["section_program_list"]):
-            cfg = section_param.section_config.add()
-            cfg.program_desc.ParseFromString(program["program"]._get_desc()
-                                             .serialize_to_string())
-            # TODO: why does not work
-            # cfg.program_desc.CopyFrom(program.program._get_desc())
-            place = pipeline_opt["place_list"][i]
-            place_id = pipeline_opt["place_id_list"][i]
-            if isinstance(place, core.CPUPlace):
-                cfg.place = cfg.CPUPlace
-            elif isinstance(place, core.CUDAPlace):
-                cfg.place = cfg.CUDAPlace
-            elif isinstance(place, core.CUDAPinnedPlace):
-                cfg.place = cfg.CUDAPinnedPlace
-            else:
-                raise NotImplementedError(
-                    "SectionWorker only supports CPUPlace, CUDAPlace and CUDAPinnedPlace now."
-                )
-            cfg.place_id = place_id
+        cfg = section_param.section_config
+        program = pipeline_opt["section_program"]
+        cfg.program_desc.ParseFromString(program["program"]._get_desc()
+                                         .serialize_to_string())
+        # TODO: why does not work
+        # cfg.program_desc.CopyFrom(program.program._get_desc())
+        place = pipeline_opt["place"]
+        place_id = pipeline_opt["place_id"]
+        assert isinstance(place, core.CUDAPlace)
+        cfg.place = cfg.CUDAPlace
+        cfg.place_id = place_id
python/paddle/fluid/executor.py  (view file @ f77a78cd)

@@ -561,6 +561,7 @@ class Executor(object):
         self._default_executor = core.Executor(p)
         self._closed = False
         self.pruned_program_scope_caches = dict()
+        self._prepare_to_run_called = False

         self._auto_checkpoint_name = unique_name.generate(
             "__auto_checkpoint_executor__")
@@ -1115,6 +1116,24 @@ class Executor(object):
         use_default_main_program = program is None
         if program is None:
             program = default_main_program()

+        if fetch_list is not None:
+            if isinstance(fetch_list, Variable) or isinstance(
+                    fetch_list, str) or isinstance(fetch_list,
+                                                   six.string_types):
+                fetch_list = [fetch_list]
+            assert isinstance(fetch_list, tuple) or isinstance(fetch_list, list), \
+                "Currently , The fetch_list type only should be list or tuple, \n" \
+                "but the input type is {}. For more information please refer to \n" \
+                "the executor.run(...).".format(type(fetch_list))
+        else:
+            fetch_list = []
+
+        if isinstance(program, Program) and program._pipeline_opt:
+            if "startup_program" in program._pipeline_opt:
+                program = program._pipeline_opt["startup_program"]
+            else:
+                return self.train_from_dataset(program, fetch_list=fetch_list)
+
         if isinstance(program, Program) and \
                         len(program.global_block().ops) == 0:
             if use_default_main_program:
@@ -1131,18 +1150,6 @@ class Executor(object):
         if scope is None:
             scope = global_scope()

-        if fetch_list is not None:
-            if isinstance(fetch_list, Variable) or isinstance(
-                    fetch_list, str) or isinstance(fetch_list,
-                                                   six.string_types):
-                fetch_list = [fetch_list]
-            assert isinstance(fetch_list, tuple) or isinstance(fetch_list, list), \
-                "Currently , The fetch_list type only should be list or tuple, \n" \
-                "but the input type is {}. For more information please refer to \n" \
-                "the executor.run(...).".format(type(fetch_list))
-        else:
-            fetch_list = []

         # use_prune can be overrided by putting optimize_ops in fetch_list
         _origin_fetch_list = fetch_list
         _origin_program = program
@@ -1449,6 +1456,25 @@ class Executor(object):
             raise RuntimeError("dataset is need and should be initialized")

         dataset._prepare_to_run()
+        real_fetch_list = []
+        if program._pipeline_opt:
+            real_program = program._pipeline_opt["section_program"]['program']
+            for fetch_var in fetch_list:
+                if isinstance(fetch_var, Variable):
+                    fetch_var_name = fetch_var.name
+                else:
+                    fetch_var_name = fetch_var
+                if fetch_var_name in real_program.global_block().vars:
+                    real_fetch_list.append(fetch_var)
+
+            program._pipeline_opt["section_program"][
+                'program'] = self._add_feed_fetch_ops(
+                    program=program._pipeline_opt["section_program"]['program'],
+                    feed=[],
+                    fetch_list=real_fetch_list,
+                    feed_var_name='feed',
+                    fetch_var_name='fetch')
+            fetch_list = None

         scope, trainer = self._prepare_trainer(
             program=program,
@@ -1483,6 +1509,10 @@ class Executor(object):
             dataset._dynamic_adjust_after_train()
             dataset._finish_to_run()
+        if real_fetch_list:
+            arr = scope.find_var('fetch').get_fetch_list()
+            tensors = arr._move_to_list()
+            return as_numpy(tensors)

         return None
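The fetch_list check that this diff moves ahead of the pipeline dispatch can be exercised on its own. Below is a distilled, standalone version; the Variable branch is dropped because it needs a live Paddle program, so treat this as a sketch of the logic rather than the executor's exact code.

import six

def normalize_fetch_list(fetch_list):
    """Distilled version of the check executor.run() performs before
    dispatching pipeline programs to train_from_dataset()."""
    if fetch_list is not None:
        # A single string (or six string type) becomes a one-element list.
        if isinstance(fetch_list, (str, six.string_types)):
            fetch_list = [fetch_list]
        assert isinstance(fetch_list, (tuple, list)), \
            "The fetch_list type should be list or tuple, " \
            "but the input type is {}.".format(type(fetch_list))
    else:
        fetch_list = []
    return fetch_list

print(normalize_fetch_list("avg_cost"))  # ['avg_cost']
print(normalize_fetch_list(None))        # []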
python/paddle/fluid/optimizer.py  (file mode changed 100755 → 100644; view file @ f77a78cd)

(This diff is collapsed on the original page.)
python/paddle/fluid/tests/unittests/CMakeLists.txt  (view file @ f77a78cd)

@@ -10,10 +10,12 @@ if(NOT WITH_NCCL)
 endif()
 string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}")
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist)
+list(APPEND DIST_TEST_OPS test_pipeline)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer)
+list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer)
 list(APPEND DIST_TEST_OPS test_listen_and_serv_op)
 list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer)
 set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
@@ -146,7 +148,6 @@ if (WITH_NCCL)
 endif()

 if(NOT WITH_GPU OR WIN32)
-    LIST(REMOVE_ITEM TEST_OPS test_pipeline)
     LIST(REMOVE_ITEM TEST_OPS test_boxps)
 endif()
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op)
 # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
@@ -469,7 +470,6 @@ if(WITH_DISTRIBUTE)
         py_test_modules(test_fleet_sharding_meta_optimizer MODULES test_fleet_sharding_meta_optimizer ENVS ${dist_ENVS})
         py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS})
         py_test_modules(test_fleet_fp16_allreduce_meta_optimizer MODULES test_fleet_fp16_allreduce_meta_optimizer ENVS ${dist_ENVS})
-        py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS})
         py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS})
         py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS})
         py_test_modules(test_fleet_distributed_strategy MODULES test_fleet_distributed_strategy)
python/paddle/fluid/tests/unittests/pipeline_mnist.py  (new file, 0 → 100644; view file @ f77a78cd)

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import numpy as np
import argparse
import time
import math

import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import signal
from functools import reduce
from test_dist_base import TestDistRunnerBase, runtime_main
import paddle.distributed.fleet as fleet

paddle.enable_static()

DTYPE = "float32"
paddle.dataset.mnist.fetch()

# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1


def cnn_model(data):
    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=data,
        filter_size=5,
        num_filters=20,
        pool_size=2,
        pool_stride=2,
        act="relu",
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
            value=0.01)))
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
        num_filters=50,
        pool_size=2,
        pool_stride=2,
        act="relu",
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
            value=0.01)))

    SIZE = 10
    input_shape = conv_pool_2.shape
    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5

    predict = fluid.layers.fc(
        input=conv_pool_2,
        size=SIZE,
        act="softmax",
        param_attr=fluid.param_attr.ParamAttr(
            initializer=fluid.initializer.Constant(value=0.01)))
    return predict


class TestDistMnist2x2(TestDistRunnerBase):
    def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
        # Input data
        with fluid.device_guard("gpu:0"):
            images = fluid.layers.data(
                name='pixel', shape=[1, 28, 28], dtype=DTYPE)
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')

            if dist_strategy:
                data_loader = fluid.io.DataLoader.from_generator(
                    feed_list=[images, label],
                    capacity=64,
                    use_double_buffer=False,
                    iterable=False)
            # Train program
            predict = cnn_model(images)
        with fluid.device_guard("gpu:1"):
            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = fluid.layers.mean(x=cost)

        # Evaluator
        with fluid.device_guard("gpu:1"):
            batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
            batch_acc = fluid.layers.accuracy(
                input=predict, label=label, total=batch_size_tensor)

        inference_program = fluid.default_main_program().clone()
        base_lr = self.lr
        passes = [30, 60, 80, 90]
        steps_per_pass = 10
        bd = [steps_per_pass * p for p in passes]
        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
        lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
        opt = fluid.optimizer.Momentum(learning_rate=lr_val, momentum=0.9)

        # Reader
        train_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)
        test_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)

        if dist_strategy:
            fleet.init(is_collective=True)
            strategy = fleet.DistributedStrategy()
            strategy.pipeline = True
            dist_opt = fleet.distributed_optimizer(
                optimizer=opt, strategy=strategy)
            dist_opt.minimize(avg_cost)
        else:
            opt.minimize(avg_cost)

        if dist_strategy:
            return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader
        else:
            return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict


if __name__ == "__main__":
    runtime_main(TestDistMnist2x2)
python/paddle/fluid/tests/unittests/test_dist_base.py  (view file @ f77a78cd)

@@ -124,6 +124,67 @@ class TestDistRunnerBase(object):
         exe.run(pserver_prog)
         print_to_err(type(self).__name__, "run pserver main program done.")

+    def run_pipeline_trainer(self, args):
+        self.lr = args.lr
+
+        dist_strategy = DistributedStrategy()
+        test_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader = \
+            self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)
+
+        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+        eprint(type(self).__name__, "device_id: %d." % device_id)
+        place = fluid.CUDAPlace(device_id)
+
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        eprint(type(self).__name__, "run worker startup program done.")
+
+        data_loader.set_sample_list_generator(train_reader, place)
+        data_loader.start()
+        print_to_err(type(self).__name__, "begin to train on trainer")
+        out_losses = []
+        for i in six.moves.xrange(RUN_STEP):
+            loss = exe.run(fluid.default_main_program(),
+                           fetch_list=[avg_cost])
+            loss = loss[0] if loss else None
+            out_losses.append(loss)
+            print_to_err(type(self).__name__, "run step %d finished" % i)
+        print_to_err(type(self).__name__, "trainer run finished")
+
+        if six.PY2:
+            print(pickle.dumps(out_losses))
+        else:
+            sys.stdout.buffer.write(pickle.dumps(out_losses))
+
+        if args.save_model:
+            model_save_dir = "/tmp"
+            if fleet.worker_index() == 0:
+                model_save_dir_fluid = os.path.join(model_save_dir,
+                                                    "fluid_persistables")
+                model_save_dir_fleet = os.path.join(model_save_dir,
+                                                    "fleet_persistables")
+                infer_save_dir_fluid = os.path.join(model_save_dir,
+                                                    "fluid_infer")
+                infer_save_dir_fleet = os.path.join(model_save_dir,
+                                                    "fleet_infer")
+            else:
+                model_save_dir_fluid = os.path.join(model_save_dir,
+                                                    "fluid_persistables_2")
+                model_save_dir_fleet = os.path.join(model_save_dir,
                                                    "fleet_persistables_2")
+                infer_save_dir_fluid = os.path.join(model_save_dir,
+                                                    "fluid_infer_2")
+                infer_save_dir_fleet = os.path.join(model_save_dir,
+                                                    "fleet_infer_2")
+            fluid.io.save_persistables(exe, model_save_dir_fluid,
+                                       fleet._origin_program)
+            fleet.save_persistables(executor=exe,
+                                    dirname=model_save_dir_fleet)
+            feeded_var_names = [var.name for var in feed_var_list]
+            fluid.io.save_inference_model(infer_save_dir_fluid,
+                                          feeded_var_names, [avg_cost], exe,
+                                          fleet._origin_program)
+            fleet.save_inference_model(exe, infer_save_dir_fleet,
+                                       feeded_var_names, [avg_cost])
+
     def run_gpu_fleet_api_trainer(self, args):
         assert args.update_method == "nccl2"
@@ -532,6 +593,7 @@ def runtime_main(test_class):
     parser.add_argument('--nccl_comm_num', type=int, required=False, default=1)
     parser.add_argument('--enable_backward_deps', action='store_true')
     parser.add_argument('--use_hallreduce', action='store_true')
+    parser.add_argument('--use_pipeline', action='store_true')
     parser.add_argument('--gpu_fleet_api', action='store_true')
     parser.add_argument('--use_local_sgd', action='store_true')
     parser.add_argument('--ut4grad_allreduce', action='store_true')
@@ -566,6 +628,8 @@ def runtime_main(test_class):
         model.run_pserver(args)
     elif args.gpu_fleet_api:
         model.run_gpu_fleet_api_trainer(args)
+    elif args.use_pipeline:
+        model.run_pipeline_trainer(args)
     else:
         model.run_trainer(args)
@@ -607,6 +671,7 @@ class TestDistBase(unittest.TestCase):
         self._dc_asgd = False  # must use with async mode
         self._use_reader_alloc = True
         self._nccl2_mode = False
+        self._pipeline_mode = False
         self._mp_mode = False
         # FIXME(typhoonzero): I added this stupid argument to enable
         # testing allreduce layers, which users can call layers.allreduce
@@ -892,6 +957,8 @@ class TestDistBase(unittest.TestCase):
         if self._use_dgc:
             tr_cmd += " --use_dgc"
+        if self._pipeline_mode:
+            tr_cmd += " --use_pipeline"
         if self._mp_mode:
             env = {"FLAGS_selected_gpus": "{}".format(trainer_id % 2)}
@@ -978,6 +1045,51 @@ class TestDistBase(unittest.TestCase):
             print("outs[1]:", outs[1])
         return pickle.loads(outs[0]), pickle.loads(outs[1])

+    def _run_pipeline(self, model, envs, check_error_log, log_name):
+        # NOTE: we reuse ps_endpoints as nccl2 worker endpoints
+        worker_endpoints = self._ps_endpoints.split(",")
+        update_method = "nccl2"
+
+        trainer_num = len(worker_endpoints)
+
+        procs = []
+        pipes = []
+        for i in range(0, trainer_num):
+            tr_cmd, tr_env = self._get_nccl2_trainer_cmd(
+                model, worker_endpoints[i], update_method, i, trainer_num)
+            tr_env.update(envs)
+            tr_env['CUDA_VISIBLE_DEVICES'] = "0,1"
+            tr_env['NCCL_SHM_DISABLE'] = '1'
+            tr_env['FLAGS_selected_gpus'] = str(i)
+            tr_env['FLAGS_cudnn_deterministic'] = '0'
+            print("tr_cmd:{}, env: {}".format(tr_cmd, tr_env))
+
+            tr_pipe = open("/tmp/" + "tr{}_err.log".format(i), "wb")
+
+            print_to_err(
+                type(self).__name__,
+                "going to start process {} with nccl2".format(i))
+            tr_proc = subprocess.Popen(
+                tr_cmd.strip().split(" "),
+                stdout=subprocess.PIPE,
+                stderr=tr_pipe,
+                env=tr_env)
+
+            procs.append(tr_proc)
+            pipes.append(tr_pipe)
+
+        outs = []
+        for i in range(0, trainer_num):
+            tr_out, tr_err = procs[i].communicate()
+            outs.append(tr_out)
+            pipes[i].close()
+            sys.stderr.write('trainer {} stderr: {}\n'.format(i, tr_err))
+
+        if check_error_log:
+            print("outs[0]:", outs[0])
+            print("outs[1]:", outs[1])
+        return pickle.loads(outs[0]), pickle.loads(outs[1])
+
     def _get_required_envs(self, check_error_log=False, need_envs={}):
         # TODO(typhoonzero): should auto adapt GPU count on the machine.
         required_envs = {
@@ -1032,6 +1144,9 @@ class TestDistBase(unittest.TestCase):
                 False,
                 check_error_log,
                 log_name=log_name)
+        elif self._pipeline_mode:
+            tr0_losses, tr1_losses = self._run_pipeline(
+                model_file, required_envs, check_error_log, log_name=log_name)
         else:
             tr0_losses, tr1_losses = self._run_cluster(
                 model_file, required_envs, check_error_log, log_name=log_name)
@@ -1040,6 +1155,9 @@ class TestDistBase(unittest.TestCase):
             local_loss = local_losses[step_id]
             tr0_loss = tr0_losses[step_id]
             tr1_loss = tr1_losses[step_id]
+            if self._pipeline_mode:
+                dist_loss = np.array([tr1_loss])
+            else:
                 dist_loss = (np.array([tr0_loss]) + np.array([tr1_loss])) / 2
             print("=======", local_loss, ":", dist_loss[0], "=======")
             self.assertAlmostEqual(local_loss, dist_loss[0], delta=delta)
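How the new test plumbing reports results: run_pipeline_trainer pickles its per-step losses to stdout, and _run_pipeline unpickles them from the subprocess pipe. A minimal, self-contained sketch of that parent/child protocol follows (the toy loss values are illustrative, and this is a distillation, not the harness itself).

import pickle
import subprocess
import sys

# Child side (what run_pipeline_trainer does at the end): write the loss
# list as a pickle to the raw stdout buffer so it survives as bytes.
child_code = (
    "import pickle, sys;"
    "sys.stdout.buffer.write(pickle.dumps([0.9, 0.7, 0.5]))"
)

# Parent side (what _run_pipeline does): capture stdout and unpickle.
proc = subprocess.Popen(
    [sys.executable, "-c", child_code], stdout=subprocess.PIPE)
out, _ = proc.communicate()
losses = pickle.loads(out)
print(losses)  # [0.9, 0.7, 0.5]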
python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py  (view file @ f77a78cd)

@@ -16,6 +16,8 @@ import unittest
 import paddle
 import os

+paddle.enable_static()
+

 class TestFleetMetaOptimizer(unittest.TestCase):
     def setUp(self):
@@ -28,19 +30,14 @@ class TestFleetMetaOptimizer(unittest.TestCase):
         import paddle.distributed.fleet.base.role_maker as role_maker
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
-        with paddle.fluid.device_guard("cpu"):
+        with paddle.fluid.device_guard("gpu:0"):
             input_x = paddle.fluid.layers.data(
                 name="x", shape=[32], dtype='float32')
             input_y = paddle.fluid.layers.data(
                 name="y", shape=[1], dtype='int64')
-            data_loader = paddle.fluid.io.DataLoader.from_generator(
-                feed_list=[input_x, input_y],
-                capacity=64,
-                use_double_buffer=True,
-                iterable=False)
             fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')

-        with paddle.fluid.device_guard("gpu:0"):
+        with paddle.fluid.device_guard("gpu:1"):
             fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
             prediction = paddle.fluid.layers.fc(input=[fc_2], size=2,
python/paddle/fluid/tests/unittests/test_pipeline.py  (view file @ f77a78cd)

@@ -13,212 +13,32 @@
 # limitations under the License.

 from __future__ import print_function
+import os
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import numpy as np
-import os
-import shutil
 import unittest
-import math
-
-
-def conv_bn_layer(input,
-                  num_filters,
-                  filter_size,
-                  stride=1,
-                  groups=1,
-                  act=None):
-    conv = fluid.layers.conv2d(
-        input=input,
-        num_filters=num_filters,
-        filter_size=filter_size,
-        stride=stride,
-        padding=(filter_size - 1) // 2,
-        groups=groups,
-        act=None,
-        bias_attr=False)
-    return fluid.layers.batch_norm(input=conv, act=act, )
-
-
-def shortcut(input, ch_out, stride, is_first):
-    ch_in = input.shape[1]
-    if ch_in != ch_out or stride != 1 or is_first == True:
-        return conv_bn_layer(input, ch_out, 1, stride)
-    else:
-        return input
-
-
-def bottleneck_block(input, num_filters, stride):
-    conv0 = conv_bn_layer(
-        input=input, num_filters=num_filters, filter_size=1, act='relu')
-    conv1 = conv_bn_layer(
-        input=conv0,
-        num_filters=num_filters,
-        filter_size=3,
-        stride=stride,
-        act='relu')
-    conv2 = conv_bn_layer(
-        input=conv1, num_filters=num_filters * 4, filter_size=1, act=None)
-    short = shortcut(input, num_filters * 4, stride, is_first=False)
-    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
-
-
-def basic_block(input, num_filters, stride, is_first):
-    conv0 = conv_bn_layer(
-        input=input,
-        num_filters=num_filters,
-        filter_size=3,
-        act='relu',
-        stride=stride)
-    conv1 = conv_bn_layer(
-        input=conv0, num_filters=num_filters, filter_size=3, act=None)
-    short = shortcut(input, num_filters, stride, is_first)
-    return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
-
-
-def build_network(input, layers=50, class_dim=1000):
-    supported_layers = [18, 34, 50, 101, 152]
-    assert layers in supported_layers
-    depth = None
-    if layers == 18:
-        depth = [2, 2, 2, 2]
-    elif layers == 34 or layers == 50:
-        depth = [3, 4, 6, 3]
-    elif layers == 101:
-        depth = [3, 4, 23, 3]
-    elif layers == 152:
-        depth = [3, 8, 36, 3]
-    num_filters = [64, 128, 256, 512]
-    with fluid.device_guard("cpu"):
-        conv = conv_bn_layer(
-            input=input, num_filters=64, filter_size=7, stride=2, act='relu')
-        conv = fluid.layers.pool2d(
-            input=conv,
-            pool_size=3,
-            pool_stride=2,
-            pool_padding=1,
-            pool_type='max')
-    if layers >= 50:
-        for block in range(len(depth)):
-            with fluid.device_guard("gpu:0"):
-                for i in range(depth[block]):
-                    conv = bottleneck_block(
-                        input=conv,
-                        num_filters=num_filters[block],
-                        stride=2 if i == 0 and block != 0 else 1)
-        with fluid.device_guard("gpu:0"):
-            pool = fluid.layers.pool2d(
-                input=conv, pool_size=7, pool_type='avg', global_pooling=True)
-            stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
-            out = fluid.layers.fc(
-                input=pool,
-                size=class_dim,
-                param_attr=fluid.param_attr.ParamAttr(
-                    initializer=fluid.initializer.Uniform(-stdv, stdv)))
-    else:
-        for block in range(len(depth)):
-            with fluid.device_guard("gpu:0"):
-                for i in range(depth[block]):
-                    conv = basic_block(
-                        input=conv,
-                        num_filters=num_filters[block],
-                        stride=2 if i == 0 and block != 0 else 1,
-                        is_first=block == i == 0)
-        with fluid.device_guard("gpu:0"):
-            pool = fluid.layers.pool2d(
-                input=conv, pool_size=7, pool_type='avg', global_pooling=True)
-            stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
-            out = fluid.layers.fc(
-                input=pool,
-                size=class_dim,
-                param_attr=fluid.param_attr.ParamAttr(
-                    initializer=fluid.initializer.Uniform(-stdv, stdv)))
-    return out
-
-
-class TestPipeline(unittest.TestCase):
-    """ TestCases for Pipeline Training. """
-
-    def _run(self, debug):
-        main_prog = fluid.Program()
-        startup_prog = fluid.Program()
-        with fluid.program_guard(main_prog, startup_prog):
-            with fluid.device_guard("cpu"):
-                image = fluid.layers.data(
-                    name="image", shape=[3, 224, 224], dtype="float32")
-                label = fluid.layers.data(
-                    name="label", shape=[1], dtype="int64")
-                data_loader = fluid.io.DataLoader.from_generator(
-                    feed_list=[image, label],
-                    capacity=64,
-                    use_double_buffer=True,
-                    iterable=False)
-                fc = build_network(image, layers=50)
-            with fluid.device_guard("gpu:0"):
-                out, prob = fluid.layers.softmax_with_cross_entropy(
-                    logits=fc, label=label, return_softmax=True)
-                loss = fluid.layers.mean(out)
-                acc_top1 = fluid.layers.accuracy(input=prob, label=label, k=1)
-                acc_top5 = fluid.layers.accuracy(input=prob, label=label, k=5)
-
-            base_lr = 0.1
-            passes = [30, 60, 80, 90]
-            total_images = 1281167
-            steps_per_pass = total_images // 128
-            bd = [steps_per_pass * p for p in passes]
-            lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
-            lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
-            optimizer = fluid.optimizer.MomentumOptimizer(
-                lr_val,
-                momentum=0.9,
-                regularization=fluid.regularizer.L2Decay(1e-4))
-            optimizer = fluid.optimizer.PipelineOptimizer(
-                optimizer, num_microbatches=2)
-            optimizer.minimize(loss)
-
-        def train_reader():
-            for _ in range(4):
-                img = np.random.random(size=[3, 224, 224]).astype('float32')
-                label = np.random.random(size=[1]).astype('int64')
-                yield img, label
-
-        data_loader.set_sample_generator(train_reader, batch_size=1)
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup_prog)
-        data_loader.start()
-        exe.train_from_dataset(main_prog, debug=debug)
-
-    def test_pipeline(self):
-        self._run(False)
-        self._run(True)
-
-    def test_pipeline_noneoptimizer(self):
-        with fluid.device_guard("gpu:0"):
-            x = fluid.layers.data(
-                name='x', shape=[1], dtype='int64', lod_level=0)
-            y = fluid.layers.data(
-                name='y', shape=[1], dtype='int64', lod_level=0)
-            emb_x = layers.embedding(
-                input=x,
-                param_attr=fluid.ParamAttr(name="embx"),
-                size=[10, 2],
-                is_sparse=False)
-            fc = layers.fc(input=emb_x,
-                           name="fc",
-                           size=1,
-                           num_flatten_dims=1,
-                           bias_attr=False)
-            loss = layers.reduce_mean(fc)
-        optimizer = fluid.optimizer.SGD(learning_rate=0.5)
-        with self.assertRaises(ValueError):
-            optimizer = fluid.optimizer.PipelineOptimizer(
-                dict(), num_microbatches=2)
+from test_dist_base import TestDistBase
+
+paddle.enable_static()
+flag_name = os.path.splitext(__file__)[0]
+
+
+class TestPipeline(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reduce = False
+        self._use_reader_alloc = False
+        self._pipeline_mode = True
+        self._nccl_comm_num = 1
+
+    def test_dist_train(self):
+        import paddle.fluid as fluid
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place(
+                "pipeline_mnist.py",
+                delta=1e-5,
+                check_error_log=True,
+                log_name=flag_name)

 if __name__ == '__main__':