Project: Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)

Commit c3974d0e (unverified)
Parent: 01aa2526
Authored on Mar 26, 2021 by lilong12; committed via GitHub on Mar 26, 2021

[3D-parallel] Reformat pipeline parallel (#31786)

* update, test=develop

Showing 8 changed files with 816 additions and 569 deletions (+816 -569)
Changed files:
  paddle/fluid/framework/section_worker.cc                               +10  -10
  python/paddle/distributed/fleet/meta_optimizers/common.py              +38  -3
  python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py  +136 -172
  python/paddle/fluid/contrib/mixed_precision/fp16_utils.py              +7   -3
  python/paddle/fluid/device_worker.py                                   +1   -1
  python/paddle/fluid/executor.py                                        +15  -8
  python/paddle/fluid/optimizer.py                                       +589 -365
  python/paddle/fluid/tests/unittests/pipeline_mnist.py                  +20  -7
paddle/fluid/framework/section_worker.cc

@@ -39,13 +39,13 @@ void SectionWorker::RunForward(
     int op_role = op->Attr<int>(std::string("op_role"));
     // We run op with op_role = kLRSched only for the first microbatch
     // to avoid increasing the @LR_DECAY_STEP@ multiple times.
-    bool run_first_mbatch = op_role == static_cast<int>(OpRole::kForward) ||
-                            op_role == (static_cast<int>(OpRole::kForward) |
-                                        static_cast<int>(OpRole::kLoss)) ||
-                            op_role == static_cast<int>(OpRole::kLRSched);
-    bool run_others = op_role == static_cast<int>(OpRole::kForward) ||
-                      op_role == (static_cast<int>(OpRole::kForward) |
-                                  static_cast<int>(OpRole::kLoss));
+    bool run_first_mbatch = (op_role == static_cast<int>(OpRole::kForward)) ||
+                            (op_role == (static_cast<int>(OpRole::kForward) |
+                                         static_cast<int>(OpRole::kLoss))) ||
+                            (op_role == static_cast<int>(OpRole::kLRSched));
+    bool run_others = (op_role == static_cast<int>(OpRole::kForward)) ||
+                      (op_role == (static_cast<int>(OpRole::kForward) |
+                                   static_cast<int>(OpRole::kLoss)));
     if ((micro_id == 0 && run_first_mbatch) || (micro_id != 0 && run_others)) {
       VLOG(3) << "Forward: running op " << op->Type() << " for micro-batch "
               << micro_id;
@@ -64,9 +64,9 @@ void SectionWorker::RunBackward(
     &unused_vars_) {
   for (auto &op : ops_) {
     int op_role = op->Attr<int>(std::string("op_role"));
-    if (op_role == static_cast<int>(OpRole::kBackward) ||
-        op_role == (static_cast<int>(OpRole::kBackward) |
-                    static_cast<int>(OpRole::kLoss))) {
+    if ((op_role == static_cast<int>(OpRole::kBackward)) ||
+        (op_role == (static_cast<int>(OpRole::kBackward) |
+                     static_cast<int>(OpRole::kLoss)))) {
       VLOG(3) << "Backward: running op " << op->Type() << " for micro-batch "
               << micro_id;
       op->Run(*microbatch_scopes_[micro_id], place_);
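Note: op_role is a bit mask, so a single op can carry a combined role such as kForward | kLoss. A minimal Python sketch of the filtering rule above (the numeric flag values are illustrative, not Paddle's actual OpRole codes):

from enum import IntFlag

class OpRole(IntFlag):
    Forward = 1
    Backward = 2
    Loss = 4
    LRSched = 8

def should_run(op_role, micro_id):
    # LRSched ops run only for the first micro-batch so that
    # @LR_DECAY_STEP@ is incremented once per mini-batch rather than
    # once per micro-batch.
    run_first_mbatch = (op_role == OpRole.Forward or
                        op_role == (OpRole.Forward | OpRole.Loss) or
                        op_role == OpRole.LRSched)
    run_others = (op_role == OpRole.Forward or
                  op_role == (OpRole.Forward | OpRole.Loss))
    return run_first_mbatch if micro_id == 0 else run_others

assert should_run(OpRole.LRSched, micro_id=0)
assert not should_run(OpRole.LRSched, micro_id=1)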
python/paddle/distributed/fleet/meta_optimizers/common.py

@@ -47,7 +47,7 @@ def is_optimizer_op(op):
 class CollectiveHelper(object):
-    def __init__(self, role_maker, nrings=1, wait_port='6174'):
+    def __init__(self, role_maker, nrings=1, wait_port=True):
         self.nrings = nrings
         self.wait_port = wait_port
         self.role_maker = role_maker
@@ -65,14 +65,48 @@ class CollectiveHelper(object):
                 self.role_maker._worker_index(), ring_id, self.wait_port)
         self._broadcast_params()

-    def _init_communicator(self, program, current_endpoint, endpoints, rank,
-                           ring_id, wait_port):
+    def _init_communicator(self,
+                           program,
+                           current_endpoint,
+                           endpoints,
+                           rank,
+                           ring_id,
+                           wait_port,
+                           global_ring_id=None,
+                           sync=True):
         nranks = len(endpoints)
         other_endpoints = endpoints[:]
         other_endpoints.remove(current_endpoint)
         if rank == 0 and wait_port:
             wait_server_ready(other_endpoints)

+        def _add_sync_by_allreduce(block):
+            sync_var = block.create_var(
+                name=unique_name.generate('sync_var'),
+                dtype=core.VarDesc.VarType.INT32,
+                persistable=False,
+                stop_gradient=True)
+            block.append_op(
+                type='fill_constant',
+                inputs={},
+                outputs={'Out': [sync_var]},
+                attrs={
+                    'shape': [1],
+                    'dtype': sync_var.dtype,
+                    'value': 1,
+                    'force_cpu': False,
+                    OP_ROLE_KEY: OpRole.Forward
+                })
+            block.append_op(
+                type='c_allreduce_sum',
+                inputs={'X': [sync_var]},
+                outputs={'Out': [sync_var]},
+                attrs={
+                    'ring_id': global_ring_id,
+                    'use_calc_stream': True,
+                    OP_ROLE_KEY: OpRole.Forward
+                })
+
         block = program.global_block()
         if core.is_compiled_with_cuda():
             comm_id_var = block.create_var(
@@ -128,6 +162,7 @@ class CollectiveHelper(object):
             raise ValueError(
                 "comm_id must be generated in paddlepaddle-xpu or paddlepaddle-xpu."
             )
+        if sync: _add_sync_by_allreduce(block)

     def _wait(self, current_endpoint, endpoints):
         assert (self.wait_port)
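The new sync flag appends a fill_constant followed by a c_allreduce_sum on the global ring, which works as a barrier: no worker leaves startup until every communicator is ready. A usage sketch of the reworked helper under the signature shown above (the endpoints, rank and ring id values are made up for illustration):

from paddle.distributed.fleet.meta_optimizers.common import CollectiveHelper

def build_global_ring(role_maker, startup_program, endpoints, rank):
    # ring id 1 is used here as the global ring, matching the
    # global_ring_id constant introduced in pipeline_optimizer.py below.
    helper = CollectiveHelper(role_maker, wait_port=False)
    helper._init_communicator(
        startup_program,
        endpoints[rank],   # current endpoint
        endpoints,         # all trainer endpoints
        rank,
        1,                 # ring_id
        False,             # wait_port
        global_ring_id=1,
        sync=True)         # append the allreduce-based barrier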
python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py

@@ -19,130 +19,21 @@ from paddle.fluid import core, unique_name
 from ..base.private_helper_function import wait_server_ready
 from paddle.fluid.optimizer import PipelineOptimizer as PO
 from .meta_optimizer_base import MetaOptimizerBase
-from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op
-
-
-def _get_node_num(endpoints):
-    ss = set()
-    for ep in endpoints:
-        ip = ep.split(":")[0].strip()
-        if ip not in ss:
-            ss.add(ip)
-    return len(ss)
-
-
-class PipelineHelper(object):
-    def __init__(self, role_maker, wait_port='6174'):
-        self.wait_port = wait_port
-        self.role_maker = role_maker
-
-    def update_startup_program(self,
-                               startup_program=None,
-                               inner_parallelism=None):
-        self.startup_program = startup_program
-        nranks = self.role_maker._worker_num()
-        rank = self.role_maker._worker_index()
-        endpoints = self.role_maker._get_trainer_endpoints()
-        current_endpoint = endpoints[rank]
-        node_num = _get_node_num(endpoints)
-        assert nranks % node_num == 0
-
-        # Create ring 0 for all gpus in the same pipeline
-        if inner_parallelism > 1:
-            pipeline_rank = rank % inner_parallelism
-            pipeline_id = rank // inner_parallelism
-            start_index = pipeline_id * inner_parallelism
-            pipeline_endpoints = endpoints[start_index:start_index +
-                                           inner_parallelism]
-            self._init_communicator(self.startup_program, current_endpoint,
-                                    pipeline_endpoints, pipeline_rank, 0,
-                                    self.wait_port)
-
-        pipeline_num = len(endpoints) // inner_parallelism
-        if pipeline_num == 1: return
-        # Create rings for gpus with the same pipeline id for data parallel
-        eps = []
-        pipeline_rank = rank % inner_parallelism
-        ring_id = pipeline_rank + 1
-        for i in range(pipeline_num):
-            eps.append(endpoints[i * inner_parallelism + pipeline_rank])
-        # rank in a ring of gpus with the same pipeline id for data parallel
-        dp_rank = rank // inner_parallelism
-        self._init_communicator(self.startup_program, current_endpoint, eps,
-                                dp_rank, ring_id, self.wait_port)
-        self._broadcast_params(ring_id)
-
-    def _init_communicator(self, program, current_endpoint, endpoints, rank,
-                           ring_id, wait_port):
-        nranks = len(endpoints)
-        other_endpoints = endpoints[:]
-        other_endpoints.remove(current_endpoint)
-        if rank == 0 and wait_port:
-            wait_server_ready(other_endpoints)
-
-        block = program.global_block()
-        nccl_id_var = block.create_var(
-            name=unique_name.generate('nccl_id'),
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
-        block.append_op(
-            type='c_gen_nccl_id',
-            inputs={},
-            outputs={'Out': nccl_id_var},
-            attrs={
-                'rank': rank,
-                'endpoint': current_endpoint,
-                'other_endpoints': other_endpoints,
-                OP_ROLE_KEY: OpRole.Forward,
-            })
-        block.append_op(
-            type='c_comm_init',
-            inputs={'X': nccl_id_var},
-            outputs={},
-            attrs={
-                'nranks': nranks,
-                'rank': rank,
-                'ring_id': ring_id,
-                OP_ROLE_KEY: OpRole.Forward,
-            })
-
-    def _broadcast_params(self, ring_id):
-        block = self.startup_program.global_block()
-        for var_name in block.vars:
-            if "nccl_id" in var_name: continue
-            param = block.var(var_name)
-            if not param.persistable: continue
-            block.append_op(
-                type='c_broadcast',
-                inputs={'X': param},
-                outputs={'Out': param},
-                attrs={
-                    'ring_id': ring_id,
-                    'root': 0,
-                    OP_ROLE_KEY: OpRole.Forward
-                })
-        block.append_op(
-            type='c_sync_comm_stream',
-            inputs={'X': param},
-            outputs={'Out': param},
-            attrs={'ring_id': ring_id,
-                   OP_ROLE_KEY: OpRole.Forward})
+from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_loss_grad_op, is_backward_op, is_optimizer_op


 class PipelineOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
         super(PipelineOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
         # we do not allow meta optimizer to be inner optimizer currently
         self.meta_optimizers_white_list = [
             "RecomputeOptimizer",
             "AMPOptimizer",
         ]
         self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ]
+        self.global_ring_id = 1
+        self.dp_ring_id = 2
+        self.start_pipeline_ring_id = 20  # Just a magic number

     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                         user_defined_strategy):
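The new ring-id constants fix the communicator layout: ring 1 for the global group, ring 2 for data parallelism, and rings from 20 upward for point-to-point pipeline pairs. A small illustrative sketch of how ranks map to stages and pipelines under this layout (a hypothetical 4-GPU job, not taken from the diff):

global_ring_id = 1           # one ring over all 4 GPUs
dp_ring_id = 2               # data-parallel ring over GPUs holding the same stage
start_pipeline_ring_id = 20  # first ring id used for stage-to-stage send/recv

inner_parallelism = 2        # two pipeline stages
world_size = 4
for rank in range(world_size):
    stage = rank % inner_parallelism      # which pipeline stage this rank runs
    pipeline = rank // inner_parallelism  # which data-parallel replica it belongs to
    print("rank", rank, "-> stage", stage, ", pipeline", pipeline)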
@@ -165,7 +56,11 @@ class PipelineOptimizer(MetaOptimizerBase):
     def _disable_strategy(self, dist_strategy):
         dist_strategy.pipeline = False
-        dist_strategy.pipeline_configs = {}
+        dist_strategy.pipeline_configs = {
+            "micro_batch_size": 1,
+            "accumulate_steps": 1,
+            "schedule_mode": "1F1B",
+        }

     def _enable_strategy(self, dist_strategy, context):
         dist_strategy.pipeline = True
@@ -175,61 +70,134 @@ class PipelineOptimizer(MetaOptimizerBase):
             "schedule_mode": "1F1B",
         }

+    def _broadcast_params(self, ring_id):
+        block = self.startup_program.global_block()
+        param = None
+        for param in block.iter_parameters():
+            if param.is_distributed:
+                continue
+            block.append_op(
+                type='c_broadcast',
+                inputs={'X': param},
+                outputs={'Out': param},
+                attrs={
+                    'ring_id': ring_id,
+                    'root': 0,
+                    OP_ROLE_KEY: OpRole.Forward
+                })
+
+        if not param: return  # no parameter on this device
+        block.append_op(
+            type='c_sync_comm_stream',
+            inputs={'X': param},
+            outputs={'Out': param},
+            attrs={'ring_id': ring_id,
+                   OP_ROLE_KEY: OpRole.Forward})
+
+    def _get_process_group_info(self):
+        # global ring info
+        self.global_endpoints = self.endpoints
+        self.global_rank = self.rank
+        self.global_nranks = self.nranks
+
+        # data parallel ring info
+        if self.pipeline_num > 1:
+            self.dp_rank = self.rank // self.inner_parallelism
+            self.dp_nranks = self.nranks // self.inner_parallelism
+            start_index = self.rank % self.inner_parallelism
+            self.dp_endpoints = [
+                self.endpoints[start_index + i * self.inner_parallelism]
+                for i in range(self.pipeline_num)
+            ]
+
+    def _init_process_group(self, pipeline_pair, pipeline_ring_map):
+        self._get_process_group_info()
+        collective_helper = CollectiveHelper(self.role_maker, wait_port=False)
+        # Create global ring for all gpus (ring_id = 0)
+        collective_helper._init_communicator(
+            self.startup_program, self.current_endpoint, self.global_endpoints,
+            self.global_rank, self.global_ring_id, True, self.global_ring_id,
+            True)
+        # Create pipeline rings
+        if self.inner_parallelism > 1:
+            pipeline_id = self.rank // self.inner_parallelism
+            start_index = pipeline_id * self.inner_parallelism
+            for pair in pipeline_pair:
+                pair_key = pair[0] * 1000 + pair[1]
+                ring_id = pipeline_ring_map[pair_key]
+                assert ring_id >= self.start_pipeline_ring_id
+                first_node = pair[0] + start_index
+                second_node = pair[1] + start_index
+                if self.rank != first_node and self.rank != second_node:
+                    continue
+                pipeline_endpoints = [
+                    self.endpoints[first_node], self.endpoints[second_node]
+                ]
+                pipeline_rank = 0 if self.rank == first_node else 1
+                pipeline_nranks = 2
+                collective_helper._init_communicator(
+                    self.startup_program, self.current_endpoint,
+                    pipeline_endpoints, pipeline_rank, ring_id, False,
+                    self.global_ring_id, True)
+
+        # Create dp rings
+        if self.pipeline_num > 1:
+            collective_helper._init_communicator(
+                self.startup_program, self.current_endpoint, self.dp_endpoints,
+                self.dp_rank, self.dp_ring_id, True, self.global_ring_id, True)
+            self._broadcast_params(self.dp_ring_id)
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
                       parameter_list=None,
                       no_grad_set=None):
-        endpoints = self.role_maker._get_trainer_endpoints()
-        current_endpoint = endpoints[self.role_maker._worker_index()]
-        self.wrapped_opt = PO(self.inner_opt,
-                              num_microbatches=self.num_microbatches)
-        node_num = _get_node_num(endpoints)
-        gpus_per_node = len(endpoints) // node_num
-        self.startup_program = startup_program
-        if startup_program is None:
-            self.startup_program = fluid.default_startup_program()
+        self.endpoints = self.role_maker._get_trainer_endpoints()
+        self.current_endpoint = self.endpoints[self.role_maker._worker_index()]
         self.rank = self.role_maker._worker_index()
         self.nranks = self.role_maker._worker_num()
-        assert self.nranks % node_num == 0
-        loss.block.program._pipeline_opt = dict()
-        loss.block.program._pipeline_opt['local_rank'] = self.rank
-        loss.block.program._pipeline_opt[
-            'micro_batch_size'] = self.micro_batch_size
-        loss.block.program._pipeline_opt['schedule_mode'] = self.schedule_mode
-        optimize_ops, params_grads, prog_list = self.wrapped_opt.minimize(
-            loss, startup_program, parameter_list, no_grad_set)
+        self.wrapped_opt = PO(self.inner_opt,
+                              num_microbatches=self.num_microbatches)
+        orig_startup_program = startup_program if startup_program else fluid.default_startup_program(
+        )
+        block = loss.block
+        program = block.program
+        program._pipeline_opt = dict()
+        program._pipeline_opt['local_rank'] = self.rank
+        program._pipeline_opt['global_ring_id'] = self.global_ring_id
+        program._pipeline_opt['ring_id'] = self.start_pipeline_ring_id
+        program._pipeline_opt['micro_batch_size'] = self.micro_batch_size
+        program._pipeline_opt['schedule_mode'] = self.schedule_mode
+        optimize_ops, params_grads, prog_list, pp_pair, ring_map = self.wrapped_opt.minimize(
+            loss, startup_program, parameter_list, no_grad_set)
-        assert prog_list
-        self.main_program_list = prog_list
-        self.main_program = loss.block.program
-        self.inner_parallelism = loss.block.program._pipeline_opt[
-            'inner_parallelism']
+        self.startup_program = orig_startup_program._pipeline_opt[
+            'startup_program']
+        self.inner_parallelism = program._pipeline_opt['inner_parallelism']
         assert self.nranks % self.inner_parallelism == 0
-        pipeline_helper = PipelineHelper(self.role_maker)
-        pipeline_helper.update_startup_program(
-            self.startup_program._pipeline_opt["startup_program"],
-            self.inner_parallelism)
-        pipeline_num = self.nranks // self.inner_parallelism
-        self._transpile_main_program(loss, pipeline_num,
-                                     self.inner_parallelism)
+        assert prog_list
+        self.pipeline_num = len(self.endpoints) // self.inner_parallelism
+        self._init_process_group(pp_pair, ring_map)
+
+        self.main_program_list = prog_list
+        self.main_program = program
+        if self.pipeline_num > 1:
+            self._transpile_main_program(loss)
         return optimize_ops, params_grads

-    def _transpile_main_program(self, loss, pipeline_num, inner_parallelism):
-        if pipeline_num <= 1: return
-        self._insert_loss_grad_ops(loss, pipeline_num)
-        for ring_id in range(1, inner_parallelism + 1):
-            self._insert_allreduce_ops(ring_id)
+    def _transpile_main_program(self, loss):
+        self._insert_loss_grad_ops(loss, self.pipeline_num)
+        self._insert_allreduce_ops(self.dp_ring_id)

     def _insert_loss_grad_ops(self, loss, pipeline_num):
         """
         In order to keep the learning rate consistent in different numbers of
         training workers, we scale the loss grad by the number of workers
         """
-        block = self.main_program_list[-1]['program'].global_block()
+        block = self.main_program_list[-1].global_block()
         for idx, op in reversed(list(enumerate(block.ops))):
             if is_loss_grad_op(op):
                 loss_grad_var = block.vars[op.output_arg_names[0]]
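_init_process_group above keys each two-rank pipeline ring by pair_key = first * 1000 + second. A stand-alone sketch of that encoding (the stage pairs are made up for illustration):

start_pipeline_ring_id = 20                # same constant as in __init__ above
pipeline_pair = [(0, 1), (1, 2), (2, 3)]   # adjacent stages exchanging tensors

pp_ring_map = {}
next_ring_id = start_pipeline_ring_id
for first, second in pipeline_pair:
    pair_key = first * 1000 + second       # unique while stage ids stay below 1000
    pp_ring_map[pair_key] = next_ring_id
    next_ring_id += 1

assert pp_ring_map[1 * 1000 + 2] == 21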
@@ -244,57 +212,53 @@ class PipelineOptimizer(MetaOptimizerBase):
                 })

     def _insert_allreduce_ops(self, ring_id):
-        block = self.main_program_list[ring_id - 1]['program'].global_block()
+        block = self.main_program._pipeline_opt['section_program'].global_block(
+        )
         origin_block = self.main_program.global_block()
         grad = None
         processed_param_name = set()
+        first_optimize_op_idx = None
+        add_sync_calc_stream = False
         for idx, op in reversed(list(enumerate(block.ops))):
+            if is_backward_op(op) and not first_optimize_op_idx:
+                first_optimize_op_idx = idx + 1
+                # no optimize phase
+                if first_optimize_op_idx == len(block.ops): return
             if is_backward_op(op) and \
                     OP_ROLE_VAR_KEY in op.attr_names:
                 op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY]
                 if len(op_role_var) == 0:
                     continue
                 assert len(op_role_var) % 2 == 0
-                offset = idx
+                offset = 0
                 for i in range(0, len(op_role_var), 2):
                     param_name = op_role_var[i]
                     param = block.vars[op_role_var[i]]
                     if param_name in processed_param_name: continue
                     processed_param_name.add(param_name)
-                    grad = block.vars[op_role_var[i + 1]]
+                    grad_name = op_role_var[i + 1]
+                    if not 'MERGED' in grad_name: grad_name += '@MERGED'
+                    grad = block.vars[grad_name]
                     origin_param = origin_block.vars[op_role_var[i]]
                     if origin_param.is_distributed:
                         continue
-                    if offset == idx:
-                        offset += 1
+                    if not add_sync_calc_stream:
+                        add_sync_calc_stream = True
                         block._insert_op(
-                            offset,
+                            first_optimize_op_idx + offset,
                             type='c_sync_calc_stream',
                             inputs={'X': grad},
                             outputs={'Out': grad},
-                            attrs={OP_ROLE_KEY: OpRole.Backward})
+                            attrs={OP_ROLE_KEY: OpRole.Optimize})
                         offset += 1
                     block._insert_op(
-                        offset,
+                        first_optimize_op_idx + offset,
                         type='c_allreduce_sum',
                         inputs={'X': grad},
                         outputs={'Out': grad},
                         attrs={
                             'ring_id': ring_id,
-                            OP_ROLE_KEY: OpRole.Backward
+                            'use_calc_stream': True,
+                            OP_ROLE_KEY: OpRole.Optimize
                         })
-
-        if grad is None:
-            return
-
-        for idx, op in enumerate(block.ops):
-            if is_optimizer_op(op):
-                block._insert_op(
-                    idx,
-                    type='c_sync_comm_stream',
-                    inputs={'X': grad},
-                    outputs={'Out': grad},
-                    attrs={'ring_id': ring_id,
-                           OP_ROLE_KEY: OpRole.Backward})
-                break
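The data-parallel allreduce above now runs once per mini-batch on the accumulated gradients, which is why the gradient name is mapped to its '@MERGED' counterpart before the variable lookup. An illustrative helper for that name mapping (hypothetical; it just mirrors the suffix rule in the diff):

def merged_grad_name(grad_name):
    # Micro-batch gradients are accumulated into "<name>@MERGED" variables,
    # and the c_allreduce_sum inserted above operates on those.
    return grad_name if 'MERGED' in grad_name else grad_name + '@MERGED'

assert merged_grad_name('fc_0.w_0@GRAD') == 'fc_0.w_0@GRAD@MERGED'
assert merged_grad_name('fc_0.w_0@GRAD@MERGED') == 'fc_0.w_0@GRAD@MERGED'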
python/paddle/fluid/contrib/mixed_precision/fp16_utils.py

@@ -123,7 +123,8 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
                 outputs={"Out": out_var},
                 attrs={
                     "in_dtype": in_var.dtype,
-                    "out_dtype": out_var.dtype
+                    "out_dtype": out_var.dtype,
+                    "op_device": op.attr("op_device")
                 })
             num_cast_ops += 1
         _rename_arg(op, in_var.name, out_var.name)
@@ -171,8 +172,11 @@ def _insert_cast_post_op(block, op, idx, src_dtype, dest_dtype, target_name,
         type="cast",
         inputs={"X": target_var},
         outputs={"Out": cast_var},
-        attrs={"in_dtype": target_var.dtype,
-               "out_dtype": cast_var.dtype})
+        attrs={
+            "in_dtype": target_var.dtype,
+            "out_dtype": cast_var.dtype,
+            "op_device": op.attr("op_device")
+        })
     num_cast_ops += 1
     op_var_rename_map[block.idx][target_var.name] = cast_var.name
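Carrying op_device onto the AMP-inserted cast ops matters because the pipeline pass later splits the program by that attribute; a cast without it would not land on the stage of the op it feeds. A dict-based sketch of the idea (plain dicts stand in for op descs; this is not the Paddle API):

def make_cast_for(op):
    # The cast inherits the device of the op whose input it converts,
    # so program splitting keeps both on the same pipeline stage.
    return {"type": "cast", "op_device": op["op_device"]}

matmul = {"type": "matmul", "op_device": "gpu:1"}
assert make_cast_for(matmul)["op_device"] == "gpu:1"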
python/paddle/fluid/device_worker.py

@@ -427,7 +427,7 @@ class Section(DeviceWorker):
         section_param.schedule_mode = schedule_mode
         cfg = section_param.section_config
         program = pipeline_opt["section_program"]
-        cfg.program_desc.ParseFromString(program["program"]._get_desc()
+        cfg.program_desc.ParseFromString(program._get_desc()
                                          .serialize_to_string())
         # TODO: why does not work
         # cfg.program_desc.CopyFrom(program.program._get_desc())
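With this commit _pipeline_opt["section_program"] holds the section Program itself instead of a {'program': Program} dict, which is why device_worker.py and executor.py change together. A sketch of the consumer side under that assumption (pipeline_opt is a prepared dict as in the diff):

def serialize_section(pipeline_opt):
    # "section_program" is now the Program directly, so it can be
    # serialized without the extra ['program'] indirection.
    program = pipeline_opt["section_program"]
    return program._get_desc().serialize_to_string()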
python/paddle/fluid/executor.py

@@ -1458,7 +1458,7 @@ class Executor(object):
             dataset._prepare_to_run()
             real_fetch_list = []
             if program._pipeline_opt:
-                real_program = program._pipeline_opt["section_program"][
-                    'program']
+                real_program = program._pipeline_opt["section_program"]
                 for fetch_var in fetch_list:
                     if isinstance(fetch_var, Variable):
                         fetch_var_name = fetch_var.name
@@ -1467,13 +1467,20 @@ class Executor(object):
                     if fetch_var_name in real_program.global_block().vars:
                         real_fetch_list.append(fetch_var)

-                program._pipeline_opt["section_program"][
-                    'program'] = self._add_feed_fetch_ops(
-                        program=program._pipeline_opt["section_program"]['program'],
-                        feed=[],
-                        fetch_list=real_fetch_list,
-                        feed_var_name='feed',
-                        fetch_var_name='fetch')
+                program._pipeline_opt["section_program"] = self._add_feed_fetch_ops(
+                    program=program._pipeline_opt["section_program"],
+                    feed=[],
+                    fetch_list=real_fetch_list,
+                    feed_var_name='feed',
+                    fetch_var_name='fetch')
+                main_block = program._pipeline_opt["section_program"].block(0)
+                for op in main_block.ops:
+                    # set the op_role of fetch op to Optimize to avoid
+                    # erase the fetched vars by gc for pipeline
+                    if op.type == 'fetch':
+                        op._set_attr(
+                            'op_role',
+                            core.op_proto_and_checker_maker.OpRole.Optimize)
                 fetch_list = None
             scope, trainer = self._prepare_trainer(
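Retagging fetch ops with the Optimize role keeps their outputs alive to the end of the mini-batch; otherwise the pipeline garbage collector could erase the fetched vars between micro-batches. A sketch of that step, using only calls that appear in the diff (section_program is assumed to be a built fluid Program):

from paddle.fluid import core

def mark_fetch_ops_as_optimize(section_program):
    # Fetch results must survive until the whole mini-batch finishes,
    # so give every fetch op the Optimize role.
    for op in section_program.block(0).ops:
        if op.type == 'fetch':
            op._set_attr('op_role',
                         core.op_proto_and_checker_maker.OpRole.Optimize)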
python/paddle/fluid/optimizer.py

@@ -3784,6 +3784,12 @@ class PipelineOptimizer(object):
                 "Optimizer, but the given type is {}.".format(type(optimizer)))
         self._optimizer = optimizer
+
+        # Get the original optimizer defined by users, such as SGD
+        self._origin_optimizer = self._optimizer
+        while hasattr(self._origin_optimizer, "inner_opt"):
+            self._origin_optimizer = self._origin_optimizer.inner_opt
+
         assert num_microbatches >= 1, (
             "num_microbatches must be a positive value.")
         self._num_microbatches = num_microbatches
@@ -3797,13 +3803,98 @@ class PipelineOptimizer(object):
         self._op_role_var_key = op_maker.kOpRoleVarAttrName()
         self._op_device_key = op_maker.kOpDeviceAttrName()
         self._param_device_map = None
+        self._pipeline_pair = []
+        self._pp_ring_map = dict()
+        self._global_ring_id = None
+
+    # insert allreduce op to sync global information for global
+    # gradient clip and amp
+    def _insert_allreduce_op(self, op_idx, block):
+        """
+        Insert allreduce op to sync global information for global
+        gradient clip and amp.
+        """
+        op = block.ops[op_idx]
+        out_name = op.desc.output_arg_names()[0]
+        out_var = block.var(out_name)
+        offset = 0
+        if op.type == "reduce_any":
+            # cast the bool var to int32 to use allreduce_max op
+            temp_var_name = unique_name.generate(out_name + "_cast_int32")
+            temp_var = block.create_var(
+                name=temp_var_name, shape=[1], dtype="int32")
+            block._insert_op(
+                op_idx + 1 + offset,
+                type='cast',
+                inputs={'X': out_var},
+                outputs={'Out': temp_var},
+                attrs={
+                    'in_dtype': out_var.dtype,
+                    'out_dtype': temp_var.dtype,
+                    self._op_role_key: self._op_role.Optimize
+                })
+            offset += 1
+        block._insert_op(
+            op_idx + 1 + offset,
+            type='c_allreduce_max'
+            if op.type == "reduce_any" else 'c_allreduce_sum',
+            inputs={'X': temp_var if op.type == "reduce_any" else out_var},
+            outputs={'Out': temp_var if op.type == "reduce_any" else out_var},
+            attrs={
+                'ring_id': self._global_ring_id,
+                self._op_role_key: self._op_role.Optimize,
+                'use_calc_stream': True
+            })
+        offset += 1
+        if op.type == "reduce_any":
+            block._insert_op(
+                op_idx + 1 + offset,
+                type='cast',
+                inputs={'X': temp_var},
+                outputs={'Out': out_var},
+                attrs={
+                    'in_dtype': temp_var.dtype,
+                    'out_dtype': out_var.dtype,
+                    self._op_role_key: self._op_role.Optimize
+                })
+        return offset

     def _create_vars(self, block, ori_block):
-        # Create vars for block, copied from main_program's global block
+        # Create vars for block, copied from ori_block
         used_var_set = set()
-        for op_idx in range(block.desc.op_size()):
-            op_desc = block.desc.op(op_idx)
-            vars = op_desc.input_arg_names() + op_desc.output_arg_names()
+        added_op_num = 0
+        op_idx = 0
+        op_size = block.desc.op_size()
+        while op_idx < op_size + added_op_num:
+            # Whether to insert allreduce_sum or allreduce_max op.
+            # For amp and global gradient clip strategies, we should
+            # get the global information, so allreduce op is needed.
+            should_insert = False
+            op = block.ops[op_idx]
+            # For op process vars on all devices, remove its input
+            # vars not in this block
+            reserved_x = []
+            if op.type == 'reduce_any' and self._is_optimize_op(op):
+                should_insert = True
+            elif op.type == 'concat' and self._is_optimize_op(op):
+                for input_name in op.desc.input("X"):
+                    if block._find_var_recursive(input_name):
+                        reserved_x.append(input_name)
+                op.desc.set_input('X', reserved_x)
+            elif op.type == 'update_loss_scaling':
+                for input_name in op.desc.input("X"):
+                    if block._find_var_recursive(input_name):
+                        reserved_x.append(input_name)
+                op.desc.set_input('X', reserved_x)
+                op.desc.set_output('Out', reserved_x)
+            elif op.type == 'sum' and self._is_gradient_clip_op(op):
+                for input_name in op.desc.input("X"):
+                    if block._find_var_recursive(input_name):
+                        reserved_x.append(input_name)
+                op.desc.set_input('X', reserved_x)
+                should_insert = True
+
+            vars = op.desc.input_arg_names() + op.desc.output_arg_names()
             for var in vars:
                 # a var whose name contains "blocking_queue"
                 # only exists in startup program
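The cast / c_allreduce_max / cast sequence above exists because the "found infinite" flag from AMP is boolean while the collective needs a numeric type; taking the max over {0, 1} is a logical OR across ranks. A pure-Python stand-in for that reduction:

def allreduce_any(local_flags):
    # local_flags: one bool per rank, e.g. the per-rank found_inf flag
    as_int = [int(f) for f in local_flags]   # the cast-to-int32 step
    reduced = max(as_int)                    # the c_allreduce_max step
    return bool(reduced)                     # the cast back to bool

assert allreduce_any([False, True, False]) is True
assert allreduce_any([False, False]) is False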
@@ -3813,27 +3904,39 @@ class PipelineOptimizer(object):
                 if block._find_var_recursive(str(var)): continue
                 source_var = ori_block._var_recursive(str(var))
                 if source_var.type == core.VarDesc.VarType.READER:
-                    block.create_var(
+                    dest_var = block.create_var(
                         name=var,
                         type=core.VarDesc.VarType.READER,
                         persistable=source_var.persistable)
                 else:
-                    block._clone_variable(source_var, False)
+                    dest_var = block._clone_variable(source_var, False)
+                dest_var.stop_gradient = source_var.stop_gradient
+            # When use with sharding, allreduce_sum and allreduce_max
+            # used for global gradient clip and amp will be added by sharding.
+            op_idx += 1
+            if self.use_sharding or not should_insert: continue
+            inserted_ops = self._insert_allreduce_op(op_idx - 1, block)
+            added_op_num += inserted_ops
+            op_idx += inserted_ops
+        block._sync_with_cpp()

     def _is_loss_grad_op(self, op):
-        if self._op_role_key not in op.attr_names:
-            return False
-        op_role = int(op.all_attrs()[self._op_role_key])
+        assert self._op_role_key in op.attr_names
+        op_role = int(op.attr(self._op_role_key))
         return op_role & int(self._op_role.Backward) and op_role & int(
             self._op_role.Loss)

     def _is_backward_op(self, op):
-        return self._op_role_key in op.attr_names and int(op.all_attrs()[
-            self._op_role_key]) & int(self._op_role.Backward)
+        return self._op_role_key in op.attr_names and (
+            int(op.attr(self._op_role_key)) & int(self._op_role.Backward))
+
+    def _is_loss_op(self, op):
+        assert self._op_role_key in op.attr_names
+        return int(op.attr(self._op_role_key)) == int(self._op_role.Loss)

     def _is_optimize_op(self, op):
-        return self._op_role_key in op.attr_names and int(op.all_attrs()[
-            self._op_role_key]) & int(self._op_role.Optimize)
+        return self._op_role_key in op.attr_names and (
+            int(op.attr(self._op_role_key)) & int(self._op_role.Optimize))

     def _is_update_op(self, op):
         return 'Param' in op.input_names and 'Grad' in op.input_names and (
@@ -3842,50 +3945,40 @@ class PipelineOptimizer(object):
     def _split_program(self, main_program, devices):
         """
         Split a program into sections according to devices that ops run on.
-        The ops of the role LRSched are copied to all sections.
+        The op whose op_device attr is "gpu:all" is copied to all sections.

         Args:
             main_program (Program): the main program
             devices: all used devices
         """
-        programs = []
         # Map from device to its corresponding section program info
-        device_program_map = dict()
-        for device in devices:
-            p = {'program': Program()}
-            device_program_map[device] = p
+        device_program_map = defaultdict(Program)

         block = main_program.block(0)
         for op in block.ops:
             device = op.attr(self._op_device_key)
-            op_role = op.attr(self._op_role_key)
-            if int(op_role) & int(self._op_role.LRSched):
-                # Copy ops of the role LRSched to all sections.
-                for device in device_program_map.keys():
-                    program = device_program_map[device]
-                    op_desc = op.desc
-                    ap_op = program["program"].block(0).desc.append_op()
-                    ap_op.copy_from(op_desc)
-                    # ap_op._set_attr(self._op_device_key, "")
-            elif op.type == "create_py_reader" or op.type == "read" or op.type == "create_double_buffer_reader":
-                # Copy read related ops to all section to make them exit after each epoch.
-                for device in device_program_map.keys():
-                    program = device_program_map[device]
-                    op_desc = op.desc
-                    ap_op = program["program"].block(0).desc.append_op()
-                    ap_op.copy_from(op_desc)
+            # Copy ops whose op_device set to "gpu:all" to all sections.
+            if device == "gpu:all":
+                for device in devices:
+                    program = device_program_map[device]
+                    op_desc = op.desc
+                    ap_op = program.global_block().desc.append_op()
+                    ap_op.copy_from(op_desc)
+                    ap_op._set_attr(self._op_device_key, "")
             else:
                 program = device_program_map[device]
                 op_desc = op.desc
-                ap_op = program["program"].block(0).desc.append_op()
+                ap_op = program.global_block().desc.append_op()
                 ap_op.copy_from(op_desc)
+                ap_op._set_attr(self._op_device_key, "")

+        program_list = []
         for key in devices:
             program = device_program_map[key]
-            program['program']._sync_with_cpp()
-            programs.append(program)
+            program._sync_with_cpp()
+            program_list.append(program)

-        return programs
+        return program_list
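_split_program now routes every op by its op_device attribute and copies "gpu:all" ops into every section. A dict-based sketch of that splitting rule (plain dicts stand in for Paddle op descs; not the real API):

from collections import defaultdict

def split_by_device(ops, devices):
    sections = defaultdict(list)
    for op in ops:
        if op["op_device"] == "gpu:all":
            # e.g. lr-scheduling ops: every stage needs its own copy
            for dev in devices:
                sections[dev].append(op["type"])
        else:
            sections[op["op_device"]].append(op["type"])
    return [sections[dev] for dev in devices]

ops = [{"type": "increment", "op_device": "gpu:all"},
       {"type": "matmul", "op_device": "gpu:0"},
       {"type": "softmax", "op_device": "gpu:1"}]
print(split_by_device(ops, ["gpu:0", "gpu:1"]))
# [['increment', 'matmul'], ['increment', 'softmax']]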
     def _get_op_device_for_startup_program(self, var_name):
         """
@@ -3894,21 +3987,22 @@ class PipelineOptimizer(object):
         get the real op_device attribute of the fill_constant as the device
         where the corresponding parameters on.
         """
-        assert "beta1_pow_acc" in var_name or "beta2_pow_acc" in var_name
+        assert "beta1_pow_acc" in var_name or "beta2_pow_acc" in var_name, \
+            'For accumulators for Adam, the name must contain beta1_pow_acc ' \
+            'or beta2_pow_acc.'
         param_name = var_name[0:var_name.index('_beta')]
         device = self._param_device_map[param_name]
         return device

-    def _split_startup_program(self, startup_program, local_rank):
-        block = startup_program.block(0)
+    def _split_startup_program(self, startup_program, device_id):
+        block = startup_program.global_block()
         new_startup_program = Program()
         for op in block.ops:
             device = op.attr(self._op_device_key)
             if device == "cpu":
                 assert op.type == "fill_constant", (
-                    "For ops in startup "
-                    "program that with the op_device attribute of cpu, "
-                    "they must be fill_constant.")
+                    "For ops in startup program with the op_device attribute "
+                    "of cpu, they must be of type fill_constant.")
                 output_var = op.output_arg_names[0]
                 device = self._get_op_device_for_startup_program(output_var)
@@ -3917,14 +4011,13 @@ class PipelineOptimizer(object):
             else:
                 # LR related ops
                 device = None
-            if device and device_index != local_rank: continue
+            if device and device_index != device_id: continue
             op_desc = op.desc
-            ap_op = new_startup_program.block(0).desc.append_op()
+            ap_op = new_startup_program.global_block().desc.append_op()
             ap_op.copy_from(op_desc)
             ap_op._set_attr(self._op_device_key, "")
         new_startup_program._sync_with_cpp()
-        self._create_vars(
-            new_startup_program.block(0), startup_program.global_block())
+        self._create_vars(new_startup_program.global_block(), block)
         return new_startup_program

     def _find_post_op(self, ops, cur_op, var_name):
@@ -3937,6 +4030,11 @@ class PipelineOptimizer(object):
             var_name as output.
             var_name (string): Variable name.
         """
+        # To skip the cast op added by amp which has no op_device set
+        if '.cast_fp32' in var_name:
+            var_name = var_name.replace('.cast_fp32', '')
+        elif '.cast_fp16' in var_name:
+            var_name = var_name.replace('.cast_fp16', '')
         post_op = []
         before = True
         for op in ops:
@@ -3965,7 +4063,8 @@ class PipelineOptimizer(object):
         """
         prev_op = []
         for op in ops:
-            if op.type == 'send_v2' or op.type == 'recv_v2':
+            if op.type == 'send_v2' or op.type == 'recv_v2' \
+                    or op.type == 'c_broadcast':
                 continue
             if op == cur_op:
                 break
@@ -3980,11 +4079,8 @@ class PipelineOptimizer(object):
         return None

     def _rename_arg(self, op, old_name, new_name):
-        op_desc = op.desc
-        if isinstance(op_desc, tuple):
-            op_desc = op_desc[0]
-        op_desc._rename_input(old_name, new_name)
-        op_desc._rename_output(old_name, new_name)
+        op._rename_input(old_name, new_name)
+        op._rename_output(old_name, new_name)

     def _create_var(self, block, ref_var, name):
         """
@@ -3998,99 +4094,12 @@ class PipelineOptimizer(object):
             dtype=ref_var.dtype,
             type=ref_var.type,
             lod_level=ref_var.lod_level,
-            persistable=False,
-            is_data=False,
+            persistable=ref_var.persistable,
+            is_data=ref_var.is_data,
             need_check_feed=ref_var.desc.need_check_feed())
+        new_var.stop_gradient = ref_var.stop_gradient
         return new_var

-    def _get_data_var_info(self, block):
-        """
-        Get info of all vars whose is_data attribute are true.
-        """
-        # map of data vars to devices that that data on
-        data_devices_map = dict()
-        for op in block.ops:
-            dev_spec = op.attr(self._op_device_key)
-            for var_name in op.input_arg_names:
-                if "blocking_queue" in var_name: continue
-                var = block.var(var_name)
-                if not var.is_data: continue
-                if not var_name in data_devices_map:
-                    data_devices_map[var_name] = []
-                if not dev_spec in data_devices_map[var_name]:
-                    data_devices_map[var_name].append(dev_spec)
-        return data_devices_map
-
-    def _insert_sendrecv_for_data_var(self, main_block, programs, startup,
-                                      devices):
-        """
-        Insert send and recv ops for data var that on other devices.
-
-        Args:
-            main_block (Block): Global block for main program
-            programs (dict): Dictionary for section params
-            startup (Program): Startup program
-            devices (list): List of devices in the format (dev:dev_index)
-        """
-        main_program = main_block.program
-        data_devices_map = self._get_data_var_info(main_block)
-
-        first_prog = programs[0]['program']
-        first_block = first_prog.block(0)
-        insert_index = 0
-        for op in first_block.ops:
-            insert_index += 1
-            if op.type == "read":
-                break
-        first_dev_spec = devices[0]
-        first_dev_index = int(first_dev_spec.split(':')[1])
-        for var_name in data_devices_map.keys():
-            for device in data_devices_map[var_name]:
-                if device == first_dev_spec: continue
-                main_var = main_block.var(var_name)
-                assert main_var.is_data
-                if not var_name in first_block.vars:
-                    self._create_var(first_block, main_var, var_name)
-                dev_index = int(device.split(':')[1])
-                first_block._insert_op(
-                    index=insert_index,
-                    type='send_v2',
-                    inputs={'X': first_block.var(var_name)},
-                    attrs={
-                        self._op_device_key: first_dev_spec,
-                        self._op_role_key: self._op_role.Forward,
-                        'use_calc_stream': True,
-                        'peer': dev_index,
-                    })
-                # Get the device that that data on
-                assert device in devices
-                prog_index = devices.index(device)
-                prog = programs[prog_index]['program']
-                block = prog.block(0)
-                index = 0
-                for op in block.ops:
-                    index += 1
-                    if op.type == "read":
-                        break
-                source_var = main_program.block(0).var(var_name)
-                new_var = self._create_var(block, source_var, var_name)
-                new_var_shape = list(new_var.shape)
-                new_var_shape[0] = self.micro_batch_size if new_var_shape[
-                    0] < 0 else new_var_shape[0]
-                block._insert_op(
-                    index=index,
-                    type='recv_v2',
-                    outputs={'Out': [new_var]},
-                    attrs={
-                        'out_shape': new_var_shape,
-                        'dtype': new_var.dtype,
-                        self._op_device_key: device,
-                        self._op_role_key: self._op_role.Forward,
-                        'peer': first_dev_index,
-                        'use_calc_stream': True,
-                    })
-
     def _strip_grad_suffix(self, name):
         """
         Strip the grad suffix from the given variable name
@@ -4104,95 +4113,161 @@ class PipelineOptimizer(object):
         """
         return name + core.grad_var_suffix()

-    def _add_opdevice_attr_for_regularization_clip(self, block):
+    def _get_op_device_attr(self, op):
         """
-        Add op_device attribute for regulization and clip ops.
+        Get the op_device attribute of a op.
         """
-        for op in block.ops:
-            # role for regularization and clip ops is optimize
-            if int(op.attr(self._op_role_key)) != int(self._op_role.Optimize):
-                continue
-            if op.has_attr(self._op_device_key) and (
-                    op.attr(self._op_device_key) != ""):
-                continue
-            assert self._op_role_var_key in op.attr_names
-            op_role_var = op.all_attrs()[self._op_role_var_key]
-            assert len(op_role_var) == 2
-            param_name = op_role_var[0]
-            device = self._param_device_map[param_name]
-            op._set_attr(self._op_device_key, device)
+        device = op.attr(self._op_device_key) \
+            if op.has_attr(self._op_device_key) else None
+        if device:
+            assert device[0:3] == 'gpu', "Now, only gpu devices are " \
+                "supported in pipeline parallemism."
+        return device

-    def _add_default_opdevice_attr(self, block):
+    def _add_op_device_attr_for_op(self, op, idx, block):
         """
-        1. Add default op_device attribute for lr-related ops.
-           The default value is the one that of the first place.
-        2. Add default op_device attribute for sum ops added during
-           backward. For these ops, we set the op_device attribute
-           as the one of its post op, i.e, which op has the output of the
-           sum op as an input.
+        Add op_device attrribute for ops that have not that attribute set.
+        We use "gpu:all" to represent the op should be put on all
+        sub-programs, such as lr-related ops. Note that: "gpu:all"
+        is only used by pipeline as an indicator.
         """
-        first_devcie = ""
-
-        # Get the device spec of the first place.
-        # device_spec: 'cpu' for cpu device and 'gpu:id' for gpu device,
-        # e.g. 'gpu:0', 'gpu:1', etc.
-        for op in block.ops:
-            if op.has_attr(self._op_device_key) and (
-                    op.attr(self._op_device_key) != ""):
-                first_device = op.attr(self._op_device_key)
-                break
-        assert first_device
-        first_device_type = first_device.split(":")[0]
-        assert first_device_type == "gpu"
-
-        # set op_device attr for lr-related ops
         lrsched_role = int(self._op_role.LRSched)
-        for op in block.ops:
-            if not op.has_attr(self._op_device_key) or (
-                    op.attr(self._op_device_key) == ""):
-                if op.type == "sum":
-                    # For sum ops that compute the sum of @RENAMED@ vars
-                    for name in op.desc.input_arg_names():
-                        assert '@RENAME@' in name
-                    assert len(op.desc.output_arg_names()) == 1
-                    out_name = op.desc.output_arg_names()[0]
-                    post_op = self._find_post_op(block.ops, op, out_name)
-                    device = post_op.attr(self._op_device_key)
-                    assert device
-                    op._set_attr(self._op_device_key, device)
-                    continue
-
-                assert op.attr(self._op_role_key) == lrsched_role, (
-                    "Op whose op_device attr has not been set for pipeline"
-                    " must be of the role LRSched.")
-                op._set_attr(self._op_device_key, first_device)
+        if op.attr(self._op_role_key) == lrsched_role:
+            # For LRSched ops, we should put them on all sub-programs to
+            # make sure each sub-program update the lr correctly
+            op._set_attr(self._op_device_key, "gpu:all")
+        elif (op.type == "cast" or
+              op.type == "scale") and self._is_backward_op(op):
+            prev_op = self._find_real_prev_op(block.ops, op,
+                                              op.desc.input("X")[0])
+            op._set_attr(self._op_device_key, prev_op.attr(self._op_device_key))
+        elif op.type == "memcpy" and not self._is_optimize_op(op):
+            assert len(op.input_arg_names) == 1 and len(
+                op.output_arg_names) == 1
+            input_name = op.input_arg_names[0]
+            output_name = op.output_arg_names[0]
+            if '@Fetch' in output_name:
+                post_op = self._find_post_op(block.ops, op, output_name)
+                op._set_attr(self._op_device_key,
+                             post_op.attr(self._op_device_key))
+            else:
+                prev_op = self._find_real_prev_op(block.ops, op,
+                                                  op.desc.input("X")[0])
+                op._set_attr(self._op_device_key,
+                             prev_op.attr(self._op_device_key))
+        elif self._is_loss_op(op):
+            # For loss * loss_scaling op added by AMP
+            offset = 1
+            while (not block.ops[idx + offset].has_attr(self._op_device_key) or
+                   not block.ops[idx + offset].attr(self._op_device_key)):
+                offset += 1
+            device = block.ops[idx + offset].attr(self._op_device_key)
+            assert device, "Please put you program within device_guard scope."
+            for i in range(offset):
+                block.ops[idx + i]._set_attr(self._op_device_key, device)
+        elif self._is_optimize_op(op) and op.type == "check_finite_and_unscale":
+            op_role_var = op.attr(self._op_role_var_key)
+            param_name = op_role_var[0]
+            device = self._param_device_map[param_name]
+            op._set_attr(self._op_device_key, device)
+        elif self._is_optimize_op(op) and op.type == "cast":
+            # For fp16-->fp32 cast added by AMP
+            grad_name = op.output('Out')
+            assert len(grad_name) == 1
+            param_name = grad_name[0].strip(core.grad_var_suffix())
+            device = self._param_device_map[param_name]
+            op._set_attr(self._op_device_key, device)
+        elif self._is_gradient_clip_op(op) or self._is_regularization_op(op):
+            # For gradient clip and regularization ops, we set their op_device
+            # attribute to the device where their corresponding parameters on.
+            assert self._op_role_var_key in op.attr_names, "gradient_clip " \
+                "and regularization ops must have op_role_var attribute."
+            op_role_var = op.attr(self._op_role_var_key)
+            assert len(op_role_var) == 2, "op_role_var for gradient_clip " \
+                "regularization ops must have two elements."
+            param_name = op_role_var[0]
+            device = self._param_device_map[param_name]
+            # For sum op added by global gradient clip, it must be
+            # put on all devices
+            if (op.type == 'sum' or op.type == 'sqrt' or
+                    op.type == 'fill_constant' or
+                    op.type == 'elementwise_max' or
+                    op.type == 'elementwise_div'):
+                device = "gpu:all"
+            op._set_attr(self._op_device_key, device)
+        else:
+            other_known_ops = [
+                'update_loss_scaling', 'reduce_any', 'concat', 'sum'
+            ]
+            assert op.type in other_known_ops, "For other ops without " \
+                "op_device set, they must be one of {}, but it " \
+                "is {}".format(other_known_ops, op.type)
+            assert self._is_optimize_op(op)
+            op._set_attr(self._op_device_key, "gpu:all")
+
+    def _add_op_device_attr(self, block):
+        """
+        Add op_device attrribute for ops in block that have
+        not that attribute set.
+        """
+        for idx, op in enumerate(list(block.ops)):
+            if (op.type == "create_py_reader" or op.type == "read" or
+                    op.type == "create_double_buffer_reader"):
+                # Copy read related ops to all section to make them exit
+                # after each epoch.
+                # We use "gpu:all" to represent the op should be put on all
+                # sub-programs, such as lr-related ops. Note that: "gpu:all"
+                # is only used by pipeline as an indicator.
+                op._set_attr(self._op_device_key, "gpu:all")
                 continue
+            # op_device attribute has been set
+            if self._get_op_device_attr(op): continue
+            self._add_op_device_attr_for_op(op, idx, block)

     def _check_validation(self, block):
         """
-        Check whether ops in a block are all validate (i.e., the
-        op_device attribute has been set).
-        Then, return all device specifications in order.
+        Check whether ops in a block have both the op_device and the
+        op_role attributes set.
+        Then, return all devices in order.
         """
-        device_specs = []
+        device_list = []
+        # Section worker only supports the following op_role
+        valid_op_role_value = [
+            int(self._op_role.LRSched),
+            int(self._op_role.Forward),
+            int(self._op_role.Backward),
+            int(self._op_role.Loss),
+            int(self._op_role.Optimize),
+            int(self._op_role.Backward) | int(self._op_role.Loss),
+        ]
         for op in block.ops:
-            type = op.type
-            if not op._has_kernel(type):
+            if not op._has_kernel(op.type):
                 assert op.type == "conditional_block" and (
                     op.attr(self._op_role_key) == int(self._op_role.LRSched)), (
                         "Now, the only supported op without kernel is "
                         "conditional_block, and its op role must be LRSched.")
+            assert op.has_attr(self._op_role_key), (
+                "op ({}) has no {} attribute.".format(op.type,
+                                                      self._op_role_key))
+            assert int(op.attr(self._op_role_key)) in valid_op_role_value, \
+                "op_role {} for op {} must be one of {}".format(
+                    op.attr(self._op_role_key), op.type, valid_op_role_value)
             assert op.has_attr(self._op_device_key), (
                 "op ({}) has no {} attribute.".format(op.type,
                                                       self._op_device_key))
-            dev_spec = op.attr(self._op_device_key)
-            assert dev_spec, ("op_device attribute for op "
-                              "{} has not been set.".format(op.type))
-            dev_type = dev_spec.split(':')[0]
+            device = op.attr(self._op_device_key)
+            assert device, ("op_device attribute for op "
+                            "{} has not been set.".format(op.type))
+            if device == "gpu:all": continue
+            dev_type = device.split(':')[0]
             assert dev_type == "gpu", ("Now only gpu devices are supported "
                                        "for pipeline parallelism.")
-            if not dev_spec in device_specs:
-                device_specs.append(dev_spec)
-        return device_specs
+            if not device in device_list:
+                device_list.append(device)
+        return device_list
def
_insert_sendrecv_ops_for_boundaries
(
self
,
block
):
def
_insert_sendrecv_ops_for_boundaries
(
self
,
block
):
"""
"""
...
@@ -4201,49 +4276,105 @@ class PipelineOptimizer(object):
...
@@ -4201,49 +4276,105 @@ class PipelineOptimizer(object):
"""
"""
        extra_index = 0

        # A map from var to device where op takes it as input,
        # avoiding multiple send and recv ops.
        var_dev_map = dict()

        for index, op in enumerate(list(block.ops)):
            cur_device = op.attr(self._op_device_key)
            if cur_device == "gpu:all": continue
            for var_name in op.input_arg_names:
                # i.e., lod_tensor_blocking_queue created by DataLoader,
                # which only exists in startup program.
                if not var_name in block.vars: continue
                var = block.var(var_name)
                # skip data, because we will process it later
                if var.is_data: continue

                prev_device = None
                if var_name in self._param_device_map:
                    prev_device = self._param_device_map[var_name]
                prev_op = self._find_real_prev_op(block.ops, op, var_name)
                if not prev_device:
                    prev_device = prev_op.attr(self._op_device_key) \
                        if prev_op else None
                if not prev_device or prev_device == 'gpu:all': continue

                if prev_device != cur_device:
                    if var_name not in var_dev_map: var_dev_map[var_name] = []
                    if cur_device in var_dev_map[var_name]: continue
                    var_dev_map[var_name].append(cur_device)

                    op_role = op.all_attrs()[self._op_role_key]
                    var = block.vars[var_name]
                    prev_device_index = int(prev_device.split(':')[1])
                    cur_device_index = int(cur_device.split(':')[1])
                    pair = (prev_device_index, cur_device_index)
                    pair_key = prev_device_index * 1000 + cur_device_index
                    if pair not in self._pipeline_pair:
                        self._pipeline_pair.append(pair)
                        self._pp_ring_map[pair_key] = self.ring_id
                        ring_id = self.ring_id
                        self.ring_id += 1
                    else:
                        ring_id = self._pp_ring_map[pair_key]
                    if self.schedule_mode == 'F-then-B':  # F-then-B
                        block._insert_op(
                            index=index + extra_index,
                            type='send_v2',
                            inputs={'X': var},
                            attrs={
                                self._op_device_key: prev_device,
                                self._op_role_key: op_role,
                                'use_calc_stream': True,
                                'peer': 1,
                                'ring_id': ring_id
                            })
                        extra_index += 1
                        block._insert_op(
                            index=index + extra_index,
                            type='recv_v2',
                            outputs={'Out': [var]},
                            attrs={
                                'out_shape': var.shape,
                                'dtype': var.dtype,
                                self._op_device_key: cur_device,
                                self._op_role_key: op_role,
                                'use_calc_stream': True,
                                'peer': 0,
                                'ring_id': ring_id
                            })
                        extra_index += 1
                    elif self.schedule_mode == '1F1B':  # 1F1B
                        block._insert_op(
                            index=index + extra_index,
                            type='c_sync_calc_stream',
                            inputs={'X': [var]},
                            outputs={'Out': [var]},
                            attrs={
                                self._op_device_key: prev_device,
                                self._op_role_key: op_role,
                            })
                        extra_index += 1
                        block._insert_op(
                            index=index + extra_index,
                            type='send_v2',
                            inputs={'X': var},
                            attrs={
                                self._op_device_key: prev_device,
                                self._op_role_key: op_role,
                                'use_calc_stream': False,
                                'ring_id': ring_id,
                                'peer': 1,
                            })
                        extra_index += 1
                        block._insert_op(
                            index=index + extra_index,
                            type='c_sync_comm_stream',
                            inputs={'X': [var]},
                            outputs={'Out': [var]},
                            attrs={
                                self._op_device_key: prev_device,
                                self._op_role_key: self._op_role.Backward,
                                'ring_id': ring_id,
                            })
                        extra_index += 1
                        var_shape = list(var.shape)
        ...
@@ -4256,93 +4387,156 @@ class PipelineOptimizer(object):
                            attrs={
                                'out_shape': var_shape,
                                'dtype': var.dtype,
                                self._op_device_key: cur_device,
                                self._op_role_key: op_role,
                                'use_calc_stream': True,
                                'peer': 0,
                                'ring_id': ring_id
                            })
                        extra_index += 1
                    else:
                        raise ValueError(
                            "Now only 'F-then-B' and '1F1B' are supported."
                            "The given value is {}.".format(self.schedule_mode))
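Each cross-stage edge gets its own two-rank communicator, and the key prev_device_index * 1000 + cur_device_index simply encodes the ordered pair. A standalone sketch of that bookkeeping follows; the starting ring id is an assumed value, whereas the real code takes it from _pipeline_opt['ring_id']:

# Sketch of the (prev, cur) pair -> ring_id bookkeeping (hypothetical standalone version).
class RingAllocator(object):
    def __init__(self, start_ring_id=20):
        self.ring_id = start_ring_id     # assumed starting id; supplied by the caller in the real code
        self.pipeline_pair = []
        self.pp_ring_map = {}

    def ring_for(self, prev_dev_index, cur_dev_index):
        pair = (prev_dev_index, cur_dev_index)
        pair_key = prev_dev_index * 1000 + cur_dev_index  # ordered-pair encoding
        if pair not in self.pipeline_pair:
            self.pipeline_pair.append(pair)
            self.pp_ring_map[pair_key] = self.ring_id
            self.ring_id += 1
        return self.pp_ring_map[pair_key]

alloc = RingAllocator()
assert alloc.ring_for(0, 1) == 20      # first edge gets a fresh ring
assert alloc.ring_for(1, 0) == 21      # reverse direction is a different ring
assert alloc.ring_for(0, 1) == 20      # a repeated edge reuses its ring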
(The former _clear_gradients method, which inserted a fill_constant op to zero each parameter gradient at the start of every mini-batch, is removed by this change; zero-initialized merged buffers are now created inside _accumulate_gradients below. The old _accumulate_gradients, which scaled the loss and summed per-micro-batch gradients into renamed variables, is split into the new _insert_loss_scale, _rename_gradient_var_name and _accumulate_gradients methods.)

    def _insert_loss_scale(self, block):
        """
        Scale the loss corresponding to number of micro-batches.
        """
        if self._num_microbatches == 1: return
        for index, op in reversed(tuple(enumerate(list(block.ops)))):
            if self._is_loss_grad_op(op):
                loss_grad_var = block.vars[op.output_arg_names[0]]
                block._insert_op(
                    index=index + 1,
                    type='scale',
                    inputs={'X': loss_grad_var},
                    outputs={'Out': loss_grad_var},
                    attrs={
                        'scale': 1.0 / self._num_microbatches,
                        self._op_role_key: self._op_role.Backward
                    })
                break
    def _rename_gradient_var_name(self, block):
        for index, op in enumerate(block.ops):
            if not self._is_optimize_op(op): continue
            input_names = op.input_arg_names
            output_names = op.output_arg_names
            in_out_names = input_names + output_names
            if op.type == 'cast': continue
            # append "MERGED" to the names of parameter gradients,
            # and modify the op_role_var attribute (by rename_arg func).
            for name in in_out_names:
                if not core.grad_var_suffix() in name: continue
                param_name = name.strip(core.grad_var_suffix())
                new_grad_name = name + "@MERGED"
                self._rename_arg(op, name, new_grad_name)

    def _accumulate_gradients(self, block, pp_allreduce_in_optimize=False):
        """
        Create a new merged gradient for each parameter and accumulate the
        corresponding gradient to it.
        """
        merged_gradient_names = []
        first_opt_op_idx = None

        for index, op in reversed(tuple(enumerate(list(block.ops)))):
            # remove the cast op of fp16 grad to fp32 grad
            if self._is_optimize_op(op) and op.type == 'cast':
                in_name = op.input_arg_names[0]
                out_name = op.output_arg_names[0]
                if out_name.strip('@GRAD') in self._param_device_map:
                    assert in_name.replace('.cast_fp16', '') == out_name
                    block._remove_op(index)
                    continue

            if self._is_backward_op(op) and not first_opt_op_idx:
                first_opt_op_idx = index + 1
                # no optimize phase
                if first_opt_op_idx == len(block.ops): return
                if block.ops[first_opt_op_idx].type == "c_sync_comm_stream":
                    first_opt_op_idx += 1

            if self._is_backward_op(op) and (
                    self._op_role_var_key in op.attr_names):
                op_role_var = op.attr(self._op_role_var_key)
                if len(op_role_var) == 0: continue
                assert len(op_role_var) % 2 == 0
                for i in range(0, len(op_role_var), 2):
                    offset = 0
                    param_name = op_role_var[i]
                    if not block.has_var(param_name): continue
                    if '@BroadCast' in param_name: continue
                    param_grad_name = param_name + core.grad_var_suffix()
                    merged_param_grad_name = param_grad_name + '@MERGED'
                    if not block.has_var(merged_param_grad_name):
                        self._create_var(block, block.vars[param_name],
                                         merged_param_grad_name)
                    assert block.has_var(merged_param_grad_name)
                    param_grad_var = block.var(param_grad_name)
                    merged_param_grad_var = block.var(merged_param_grad_name)
                    merged_param_grad_var.persistable = True
                    block._insert_op(
                        index=first_opt_op_idx + offset,
                        type='fill_constant',
                        inputs={},
                        outputs={'Out': [merged_param_grad_var]},
                        attrs={
                            'shape': merged_param_grad_var.shape,
                            'dtype': merged_param_grad_var.dtype,
                            'value': float(0),
                            # a trick to run this op once per mini-batch
                            self._op_role_key: self._op_role.Optimize.LRSched,
                        })
                    offset += 1
                    grad_name = op_role_var[i + 1]
                    grad_var = block.vars[grad_name]
                    if not 'cast_fp16' in grad_name:
                        block._insert_op(
                            index=first_opt_op_idx + offset,
                            type='sum',
                            inputs={'X': [grad_var, merged_param_grad_var]},
                            outputs={'Out': merged_param_grad_var},
                            attrs={
                                self._op_role_key: self._op_role.Backward,
                            })
                        offset += 1
                        merged_gradient_names.append(merged_param_grad_name)
                    else:
                        # cast gradient to fp32 to accumulate to merged gradient
                        cast_grad_var_name = param_grad_name + '@TMP'
                        cast_grad_var = self._create_var(block, param_grad_var,
                                                         cast_grad_var_name)
                        cast_grad_var.persistable = False
                        block._insert_op(
                            index=first_opt_op_idx + offset,
                            type='cast',
                            inputs={'X': grad_var},
                            outputs={'Out': cast_grad_var},
                            attrs={
                                'in_dtype': grad_var.dtype,
                                'out_dtype': cast_grad_var.dtype,
                                self._op_role_key: self._op_role.Backward,
                            })
                        offset += 1
                        block._insert_op(
                            index=first_opt_op_idx + offset,
                            type='sum',
                            inputs={
                                'X': [merged_param_grad_var, cast_grad_var]
                            },
                            outputs={'Out': merged_param_grad_var},
                            attrs={
                                self._op_role_key: self._op_role.Backward,
                            })
                        offset += 1
                        merged_gradient_names.append(merged_param_grad_name)
        return merged_gradient_names
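The net effect of _insert_loss_scale plus _accumulate_gradients: the loss gradient is scaled by 1/num_microbatches and every micro-batch gradient is summed into a persistent @MERGED buffer, so the optimizer finally sees the mini-batch average. A toy NumPy check of that identity (plain arrays stand in for gradient tensors):

import numpy as np

# Toy check: scaling the loss gradient by 1/N and summing per-micro-batch
# gradients reproduces the averaged mini-batch gradient.
num_microbatches = 4
micro_grads = [np.random.rand(3) for _ in range(num_microbatches)]

merged = np.zeros(3)                          # the @MERGED buffer starts at zero
for g in micro_grads:
    merged += g / num_microbatches            # the 1/N factor comes from the inserted 'scale' op

expected = np.mean(micro_grads, axis=0)       # gradient of the averaged mini-batch loss
assert np.allclose(merged, expected)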
    def _add_sub_blocks(self, main_block, program_list):
        main_program = main_block.program
        for prog in program_list:
            for op in prog.block(0).ops:
                if not op.has_attr('sub_block'):
                    continue
        ...
@@ -4372,8 +4566,7 @@ class PipelineOptimizer(object):
        # var_info = {var_name: [program1, program2...]},
        # persistable var only
        var_info = dict()
        for prog in program_list:
            block = prog.block(0)
            for var_name in block.vars:
                if var_name == "double_buffer_0": continue
        ...
@@ -4395,7 +4588,7 @@ class PipelineOptimizer(object):
            block = prog.block(0)
            for op in block.ops:
                if op.type == "recv_v2" or op.type == "create_py_reader" or \
                    op.type == "read" or op.type == "update_loss_scaling":
                    continue
                # We have processed lr related vars
                if op.attr(self._op_role_key) == int(
        ...
@@ -4423,6 +4616,15 @@ class PipelineOptimizer(object):
                read_block = prog.block(0)
                read_device = self._get_device_info(read_block)
                read_dev_index = int(read_device.split(':')[1])
                pair = (write_dev_index, read_dev_index)
                pair_key = write_dev_index * 1000 + read_dev_index
                if pair not in self._pipeline_pair:
                    self._pipeline_pair.append(pair)
                    self._pp_ring_map[pair_key] = self.ring_id
                    ring_id = self.ring_id
                    self.ring_id += 1
                else:
                    ring_id = self._pp_ring_map[pair_key]

                write_block._insert_op(
                    index=0,
        ...
@@ -4430,11 +4632,12 @@ class PipelineOptimizer(object):
                    inputs={'X': write_block.var(var_name), },
                    attrs={
                        self._op_device_key: write_device,
                        'use_calc_stream': False,
                        # A trick to make the role LRSched to avoid copy every
                        # microbatch
                        self._op_role_key: self._op_role.LRSched,
                        'peer': read_dev_index,
                        'ring_id': ring_id
                    })
                read_block._insert_op(
                    index=0,
        ...
@@ -4444,36 +4647,68 @@ class PipelineOptimizer(object):
                        'out_shape': read_block.var(var_name).shape,
                        'dtype': read_block.var(var_name).dtype,
                        self._op_device_key: read_device,
                        'use_calc_stream': False,
                        # A trick to make the role LRSched to avoid copy every
                        # microbatch
                        self._op_role_key: self._op_role.LRSched,
                        'peer': write_dev_index,
                        'ring_id': ring_id
                    })
                read_block._insert_op(
                    index=1,
                    type='c_sync_comm_stream',
                    inputs={'X': [read_block.var(var_name)]},
                    outputs={'Out': [read_block.var(var_name)]},
                    attrs={
                        self._op_device_key: read_device,
                        # A trick to make the role LRSched to avoid copy every
                        # microbatch
                        self._op_role_key: self._op_role.LRSched,
                        'ring_id': ring_id
                    })

    def _is_gradient_clip_op(self, op):
        return op.desc.has_attr("op_namescope") \
            and op.desc.attr("op_namescope").startswith("/gradient_clip")

    def _is_regularization_op(self, op):
        return op.desc.has_attr("op_namescope") \
            and op.desc.attr("op_namescope").startswith("/regularization")
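Both helpers only test the op_namescope prefix that Paddle attaches to ops created by the gradient-clip and regularization passes. A trivial sketch of the same test on plain strings (the namescope values below are made up for illustration):

# Sketch of the prefix test used by _is_gradient_clip_op / _is_regularization_op.
def has_namescope_prefix(op_namescope, prefix):
    return op_namescope is not None and op_namescope.startswith(prefix)

assert has_namescope_prefix("/gradient_clip/clip_by_global_norm", "/gradient_clip")
assert not has_namescope_prefix("/regularization/l2_decay", "/gradient_clip")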
    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        main_block = loss.block
        self.origin_main_block = main_block
        if startup_program is None:
            startup_program = default_startup_program()
        optimize_ops, params_grads = self._optimizer.minimize(
            loss, startup_program, parameter_list, no_grad_set)
        self._param_device_map = self._origin_optimizer._param_device_map
        assert main_block.program._pipeline_opt \
            and 'local_rank' in main_block.program._pipeline_opt, \
            'Please use pipeline with fleet.'
        local_rank = main_block.program._pipeline_opt['local_rank']
        self._global_ring_id = main_block.program._pipeline_opt[
            'global_ring_id']
        schedule_mode = 0
        if 'schedule_mode' in main_block.program._pipeline_opt:
            schedule_mode = main_block.program._pipeline_opt['schedule_mode']
        self.schedule_mode = schedule_mode
        # micro batch size
        self.micro_batch_size = main_block.program._pipeline_opt[
            'micro_batch_size']

        self.use_sharding = False
        if 'use_sharding' in main_block.program._pipeline_opt:
            self.use_sharding = main_block.program._pipeline_opt['use_sharding']
        self.ring_id = main_block.program._pipeline_opt['ring_id']

        # Step1: add default op_device attribute for ops.
        self._add_op_device_attr(main_block)
        device_list = self._check_validation(main_block)

        def device_cmp(device1, device2):
            dev1_id = int(device1.split(':')[1])
        ...
@@ -4485,70 +4720,59 @@ class PipelineOptimizer(object):
            else:
                return 0

        sorted_device_list = sorted(device_list, key=cmp_to_key(device_cmp))
        assert sorted_device_list == device_list, (
            "With pipeline parallelism, you must use gpu devices one after "
            "another in the order of their ids.")

        # Step2: add send and recv ops between section boundaries
        self._insert_sendrecv_ops_for_boundaries(main_block)

        # Step3: split program into sections and add pairs of
        # send and recv ops for data var.
        main_program = main_block.program
        program_list = self._split_program(main_program, device_list)
        for p in program_list:
            self._create_vars(p.global_block(), main_block)

        # Step4: Special Case: process persistable vars that exist in
        # multiple sections
        self._process_persistable_vars_in_multi_sections(
            main_program, startup_program, program_list)

        # Step5: Add sub blocks for section programs
        self._add_sub_blocks(main_block, program_list)

        local_rank = main_program._pipeline_opt['local_rank'] % len(device_list)

        place_list = []
        for dev in device_list:
            dev_index = int(dev.split(":")[1])
            place_list.append(core.CUDAPlace(dev_index % 8))

        # Step6: Split startup program
        new_startup_program = self._split_startup_program(startup_program,
                                                          local_rank)

        startup_program._pipeline_opt = {
            "startup_program": new_startup_program,
        }
        real_block = program_list[local_rank].global_block()
        self._insert_loss_scale(real_block)
        if not self.use_sharding:
            # Step7: clear gradients before each mini-batch and
            # accumulate gradients during backward
            self._rename_gradient_var_name(real_block)
            real_block._sync_with_cpp()
            self._accumulate_gradients(real_block)
            real_block._sync_with_cpp()

        place_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        main_program._pipeline_opt = {
            "trainer": "PipelineTrainer",
            "device_worker": "Section",
            "pipeline_stage": local_rank,
            "num_pipeline_stages": len(device_list),
            "schedule_mode": self.schedule_mode,
            "inner_parallelism": len(device_list),
            "section_program": program_list[local_rank],
            "place": place_list[local_rank],
            "place_id": place_id,
        ...
@@ -4556,7 +4780,7 @@ class PipelineOptimizer(object):
            "num_microbatches": self._num_microbatches,
            "start_cpu_core_id": self._start_cpu_core_id,
        }
        return optimize_ops, params_grads, program_list, self._pipeline_pair, \
            self._pp_ring_map


class RecomputeOptimizer(Optimizer):
...
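For reference, a runnable sketch of the device-ordering check performed in minimize() above, assuming "gpu:<id>" device strings; the middle of device_cmp is elided in this hunk, so the comparator below is a reconstruction rather than the exact source:

from functools import cmp_to_key

# Sketch of the device ordering check: stage devices must already be sorted by id.
def device_cmp(device1, device2):
    dev1_id = int(device1.split(':')[1])
    dev2_id = int(device2.split(':')[1])
    if dev1_id < dev2_id:
        return -1
    elif dev1_id > dev2_id:
        return 1
    return 0

device_list = ["gpu:0", "gpu:1", "gpu:2"]
assert sorted(device_list, key=cmp_to_key(device_cmp)) == device_list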
python/paddle/fluid/tests/unittests/pipeline_mnist.py
View file @ c3974d0e
...
@@ -66,12 +66,21 @@ def cnn_model(data):
    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
    with fluid.device_guard("gpu:1"):
        predict = fluid.layers.fc(
            input=conv_pool_2,
            size=SIZE,
            act="softmax",
            param_attr=fluid.param_attr.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.01)))
        # To cover @RENAMED@GRADIENT
        predict2 = fluid.layers.fc(
            input=conv_pool_1,
            size=SIZE,
            act="softmax",
            param_attr=fluid.param_attr.ParamAttr(
                initializer=fluid.initializer.Constant(value=0.01)))
        predict += predict2
    return predict
...
@@ -108,7 +117,10 @@ class TestDistMnist2x2(TestDistRunnerBase):
        bd = [steps_per_pass * p for p in passes]
        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
        lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
        opt = fluid.optimizer.Momentum(
            learning_rate=lr_val,
            momentum=0.9,
            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))

        acc_steps = 2  # accumulated steps for pipeline
        if dist_strategy:
...
@@ -120,6 +132,7 @@ class TestDistMnist2x2(TestDistRunnerBase):
            fleet.init(is_collective=True)
            strategy = fleet.DistributedStrategy()
            strategy.pipeline = True
            strategy.amp = True
            strategy.pipeline_configs = {
                'micro_batch_size': batch_size,
                'schedule_mode': '1F1B',
...
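For context, a minimal sketch of the DistributedStrategy wiring this test relies on; the placeholder micro_batch_size value and the commented-out calls are assumptions based on the surrounding test code, not part of this diff:

# Minimal strategy sketch (run under fleetrun / paddle.distributed.launch with
# one process per pipeline stage for actual training).
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.pipeline = True                      # enable PipelineOptimizer
strategy.amp = True                           # mixed precision, as in the test
strategy.pipeline_configs = {
    'micro_batch_size': 1,                    # assumed value; the test passes batch_size here
    'schedule_mode': '1F1B',                  # or 'F-then-B'
    # 'accumulate_steps' is presumably set to acc_steps as well (elided in this hunk)
}
# fleet.init(is_collective=True)
# opt = fleet.distributed_optimizer(opt, strategy)   # wrap the Momentum optimizer
# opt.minimize(loss)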