PaddlePaddle / Paddle
Commit 695dd371 (unverified)
Authored by lilong12 on Mar 31, 2021; committed via GitHub on Mar 31, 2021
Adjust pipeline optimizer for 3d parallelism (#31939)
* update, test=develop
Parent: 6f85e241
Showing 4 changed files with 168 additions and 161 deletions (+168, -161)
paddle/fluid/framework/pipeline_trainer.cc                                +3   -24
python/paddle/distributed/fleet/meta_optimizers/common.py                 +5   -0
python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py     +1   -10
python/paddle/fluid/optimizer.py                                          +159 -127
paddle/fluid/framework/pipeline_trainer.cc

@@ -71,37 +71,16 @@ void PipelineTrainer::CopyParameters(int microbatch_id,
                                      const ProgramDesc& program,
                                      const platform::Place& place) {
   auto& global_block = program.Block(0);
-  std::map<std::string, int> param_map;
-  for (auto& var : global_block.AllVars()) {
-    if (var->Persistable()) {
-      param_map[var->Name()] = 1;
-    }
-  }
   for (auto& var : global_block.AllVars()) {
-    bool is_param_grad = false;
-    size_t pos = 0;
-    // A magic suffix to indicate the merged gradient
-    std::string magicSuffix = std::string(kGradVarSuffix) + "@MERGED";
-    if ((pos = var->Name().find(magicSuffix)) != std::string::npos) {
-      auto prefix_name = var->Name().substr(0, pos);
-      if (param_map.find(prefix_name) != param_map.end()) {
-        is_param_grad = true;
-      }
-    }
     if (var->Persistable() && microbatch_id == 0) {
       auto* ptr = root_scope_->Var(var->Name());
       InitializeVariable(ptr, var->GetType());
-      VLOG(3) << "Create persistable var: " << var->Name()
+      VLOG(5) << "Create persistable var: " << var->Name()
               << ", which pointer is " << ptr;
-    } else if (is_param_grad && microbatch_id == 0) {
-      auto* ptr = minibatch_scope_->Var(var->Name());
-      InitializeVariable(ptr, var->GetType());
-      VLOG(3) << "Create grad for persistable var: " << var->Name()
-              << ", which pointer is " << ptr;
-    } else if (!var->Persistable() && !is_param_grad) {
+    } else if (!var->Persistable()) {
       auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name());
-      VLOG(3) << "Create variable " << var->Name() << " for microbatch "
+      VLOG(5) << "Create variable " << var->Name() << " for microbatch "
               << microbatch_id << ", which pointer is " << ptr;
       InitializeVariable(ptr, var->GetType());
     }
   }
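With the "@MERGED" gradient bookkeeping removed, CopyParameters only distinguishes two cases: persistable variables are created once in the root scope (for micro-batch 0 only), and every non-persistable variable gets its own copy in each micro-batch scope. A minimal sketch of that placement rule, using plain Python dicts as stand-ins for Paddle scopes (all names here are illustrative, not Paddle API):

    def copy_parameters(microbatch_id, variables, root_scope, microbatch_scopes):
        # variables: {name: is_persistable}
        for name, is_persistable in variables.items():
            if is_persistable and microbatch_id == 0:
                root_scope[name] = object()  # created exactly once, shared by all micro-batches
            elif not is_persistable:
                microbatch_scopes[microbatch_id][name] = object()  # one copy per micro-batch

    root, scopes = {}, [{} for _ in range(4)]
    for mb in range(4):
        copy_parameters(mb, {'fc_0.w_0': True, 'fc_0.tmp_0': False}, root, scopes)
    assert len(root) == 1 and all(len(s) == 1 for s in scopes)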
python/paddle/distributed/fleet/meta_optimizers/common.py

@@ -106,6 +106,11 @@ class CollectiveHelper(object):
                     'use_calc_stream': True,
                     OP_ROLE_KEY: OpRole.Forward
                 })
+            block.append_op(
+                type='c_sync_calc_stream',
+                inputs={'X': sync_var},
+                outputs={'Out': sync_var},
+                attrs={OP_ROLE_KEY: OpRole.Forward})

         block = program.global_block()
         if core.is_compiled_with_cuda():
python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py

@@ -171,6 +171,7 @@ class PipelineOptimizer(MetaOptimizerBase):
         program._pipeline_opt['ring_id'] = self.start_pipeline_ring_id
         program._pipeline_opt['micro_batch_size'] = self.micro_batch_size
         program._pipeline_opt['schedule_mode'] = self.schedule_mode
+        program._pipeline_opt['use_sharding'] = False
         optimize_ops, params_grads, prog_list, pp_pair, ring_map = self.wrapped_opt.minimize(
             loss, startup_program, parameter_list, no_grad_set)
         self.startup_program = orig_startup_program._pipeline_opt[

@@ -218,7 +219,6 @@ class PipelineOptimizer(MetaOptimizerBase):
         grad = None
         processed_param_name = set()
         first_optimize_op_idx = None
-        add_sync_calc_stream = False
         for idx, op in reversed(list(enumerate(block.ops))):
             if is_backward_op(op) and not first_optimize_op_idx:
                 first_optimize_op_idx = idx + 1

@@ -242,15 +242,6 @@ class PipelineOptimizer(MetaOptimizerBase):
                     origin_param = origin_block.vars[op_role_var[i]]
                     if origin_param.is_distributed:
                         continue
-                    if not add_sync_calc_stream:
-                        add_sync_calc_stream = True
-                        block._insert_op(
-                            first_optimize_op_idx + offset,
-                            type='c_sync_calc_stream',
-                            inputs={'X': grad},
-                            outputs={'Out': grad},
-                            attrs={OP_ROLE_KEY: OpRole.Optimize})
-                        offset += 1
                     block._insert_op(
                         first_optimize_op_idx + offset,
python/paddle/fluid/optimizer.py

@@ -3805,7 +3805,6 @@ class PipelineOptimizer(object):
         self._param_device_map = None
         self._pipeline_pair = []
         self._pp_ring_map = dict()
-        self._global_ring_id = None
         # insert allreduce op to sync global information for global
         # gradient clip and amp
@@ -3841,7 +3840,7 @@ class PipelineOptimizer(object):
             inputs={'X': temp_var if op.type == "reduce_any" else out_var},
             outputs={'Out': temp_var if op.type == "reduce_any" else out_var},
             attrs={
-                'ring_id': self._global_ring_id,
+                'ring_id': self.global_ring_id,
                 self._op_role_key: self._op_role.Optimize,
                 'use_calc_stream': True
             })
@@ -3887,6 +3886,16 @@ class PipelineOptimizer(object):
                         reserved_x.append(input_name)
                 op.desc.set_input('X', reserved_x)
                 op.desc.set_output('Out', reserved_x)
+            elif op.type == 'check_finite_and_unscale':
+                for input_name in op.desc.input("X"):
+                    if block._find_var_recursive(input_name):
+                        reserved_x.append(input_name)
+                op.desc.set_input('X', reserved_x)
+                op.desc.set_output('Out', reserved_x)
+                if len(reserved_x) == 0:
+                    block._remove_op(op_idx)
+                    op_size -= 1
+                    continue
             elif op.type == 'sum' and self._is_gradient_clip_op(op):
                 for input_name in op.desc.input("X"):
                     if block._find_var_recursive(input_name):
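The new check_finite_and_unscale branch handles programs that have been split across pipeline stages: only the inputs that still exist on the current stage are kept, and the op is dropped altogether when none remain. A minimal sketch of that filtering, with plain Python data standing in for Paddle blocks and op descs (hypothetical helper, not Paddle API):

    def prune_check_finite_inputs(op_input_names, vars_on_this_stage):
        # keep only the gradients that this pipeline stage actually owns
        reserved_x = [name for name in op_input_names if name in vars_on_this_stage]
        keep_op = len(reserved_x) > 0  # if empty, the op would be removed from the block
        return reserved_x, keep_op

    reserved, keep = prune_check_finite_inputs(
        ['fc_0.w_0@GRAD', 'fc_1.w_0@GRAD'], {'fc_1.w_0@GRAD'})
    assert reserved == ['fc_1.w_0@GRAD'] and keep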
@@ -4020,63 +4029,32 @@ class PipelineOptimizer(object):
             self._create_vars(new_startup_program.global_block(), block)
         return new_startup_program

-    def _find_post_op(self, ops, cur_op, var_name):
+    def _find_post_op(self, index, var_name):
         """
-        Find the real post op that has variable named var_name as input.
-
-        Args:
-            ops (list): A list of ops.
-            cur_op (Operator): Current operator which has variable named
-                var_name as output.
-            var_name (string): Variable name.
+        Find the post op that has variable named var_name as input.
         """
-        # To skip the cast op added by amp which has no op_device set
-        if '.cast_fp32' in var_name:
-            var_name = var_name.replace('.cast_fp32', '')
-        elif '.cast_fp16' in var_name:
-            var_name = var_name.replace('.cast_fp16', '')
-        post_op = []
-        before = True
-        for op in ops:
-            if op == cur_op:
-                before = False
-                continue
-            if before:
-                continue
-            for in_var_name in op.input_arg_names:
-                if in_var_name == var_name:
-                    post_op.append(op)
-                    break
-        if post_op:
-            return post_op[0]
-        return None
+        post_ops = self.input_var_to_op[var_name]
+        if post_ops == None: return None
+        result_op = None
+        for post_op, post_idx in reversed(post_ops):
+            if post_idx > index:
+                result_op = post_op
+                break
+        return result_op

-    def _find_real_prev_op(self, ops, cur_op, var_name):
+    def _find_prev_op(self, index, var_name):
         """
-        Find the real previous op that outputs variable named var_name.
-
-        Args:
-            ops (list): A list of ops.
-            cur_op (Operator): Current operator which has variable named
-                var_name as input.
-            var_name (string): Variable name.
+        Find the previous op of op with index that outputs
+        variable named var_name.
         """
-        prev_op = []
-        for op in ops:
-            if op.type == 'send_v2' or op.type == 'recv_v2' \
-                    or op.type == 'c_broadcast':
-                continue
-            if op == cur_op:
-                break
-            for out_var_name in op.output_arg_names:
-                if out_var_name == var_name:
-                    prev_op.append(op)
-        if prev_op:
-            # A op may have more than one prev op,
-            # e.g., for 'learning_rate', there may be multiple ops have it as
-            # output.
-            return prev_op[-1]
-        return None
+        prev_ops = self.output_var_to_op[var_name]
+        if prev_ops == None: return None
+        result_op = None
+        for prev_op, prev_idx in reversed(prev_ops):
+            if prev_idx < index:
+                result_op = prev_op
+                break
+        return result_op

     def _rename_arg(self, op, old_name, new_name):
         op._rename_input(old_name, new_name)
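The rewritten helpers no longer scan block.ops around a given operator; they look the variable up in per-block maps from variable name to [op, op_index] pairs (built once by _get_input_output_info, added further down in this diff) and compare indices. A self-contained sketch of that lookup, with plain strings standing in for Operator objects:

    # assumed shape of the maps: var name -> list of [op, index_in_block.ops]
    output_var_to_op = {'x': [['op_a', 0], ['op_c', 5]]}
    input_var_to_op = {'x': [['op_b', 2], ['op_d', 7]]}

    def find_prev_op(index, var_name):
        for op, op_idx in reversed(output_var_to_op.get(var_name, [])):
            if op_idx < index:
                return op  # latest producer that appears before `index`
        return None

    def find_post_op(index, var_name):
        for op, op_idx in reversed(input_var_to_op.get(var_name, [])):
            if op_idx > index:
                return op  # a consumer that appears after `index`
        return None

    assert find_prev_op(5, 'x') == 'op_a'
    assert find_post_op(5, 'x') == 'op_d'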
@@ -4136,23 +4114,21 @@ class PipelineOptimizer(object):
             # For LRSched ops, we should put them on all sub-programs to
             # make sure each sub-program update the lr correctly
            op._set_attr(self._op_device_key, "gpu:all")
-        elif (op.type == "cast" or
-              op.type == "scale") and self._is_backward_op(op):
-            prev_op = self._find_real_prev_op(block.ops, op,
-                                              op.desc.input("X")[0])
+        elif op.type == "scale" and self._is_backward_op(op):
+            prev_op = self._find_prev_op(idx, op.desc.input("X")[0])
             op._set_attr(self._op_device_key,
                          prev_op.attr(self._op_device_key))
         elif op.type == "memcpy" and not self._is_optimize_op(op):
+            # for checkpoint offloading
             assert len(op.input_arg_names) == 1 and len(
                 op.output_arg_names) == 1
             input_name = op.input_arg_names[0]
             output_name = op.output_arg_names[0]
             if '@Fetch' in output_name:
-                post_op = self._find_post_op(block.ops, op, output_name)
+                post_op = self._find_post_op(idx, output_name)
                 op._set_attr(self._op_device_key,
                              post_op.attr(self._op_device_key))
             else:
-                prev_op = self._find_real_prev_op(block.ops, op,
-                                                  op.desc.input("X")[0])
+                prev_op = self._find_prev_op(idx, op.desc.input("X")[0])
                 op._set_attr(self._op_device_key,
                              prev_op.attr(self._op_device_key))
         elif self._is_loss_op(op):
@@ -4165,16 +4141,11 @@ class PipelineOptimizer(object):
             assert device, "Please put you program within device_guard scope."
             for i in range(offset):
                 block.ops[idx + i]._set_attr(self._op_device_key, device)
-        elif self._is_optimize_op(op) and op.type == "check_finite_and_unscale":
-            op_role_var = op.attr(self._op_role_var_key)
-            param_name = op_role_var[0]
-            device = self._param_device_map[param_name]
-            op._set_attr(self._op_device_key, device)
         elif self._is_optimize_op(op) and op.type == "cast":
             # For fp16-->fp32 cast added by AMP
             grad_name = op.output('Out')
             assert len(grad_name) == 1
-            param_name = grad_name[0].strip(core.grad_var_suffix())
+            param_name = self._strip_grad_suffix(grad_name[0])
             device = self._param_device_map[param_name]
             op._set_attr(self._op_device_key, device)
         elif self._is_gradient_clip_op(op) or self._is_regularization_op(op):
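The switch from grad_name[0].strip(core.grad_var_suffix()) to self._strip_grad_suffix(...) matters because str.strip treats its argument as a set of characters rather than a suffix, so it can also eat legitimate leading or trailing characters of the parameter name. A small comparison, with a hypothetical reimplementation of the intended suffix-stripping behaviour:

    GRAD_SUFFIX = '@GRAD'  # the usual value of core.grad_var_suffix()

    def strip_grad_suffix(name):
        # remove everything from the first occurrence of the suffix onward
        pos = name.find(GRAD_SUFFIX)
        return name[:pos] if pos != -1 else name

    print('DParam@GRAD'.strip(GRAD_SUFFIX))   # 'Param'  -- the leading 'D' is stripped too
    print(strip_grad_suffix('DParam@GRAD'))   # 'DParam'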
@@ -4197,7 +4168,11 @@ class PipelineOptimizer(object):
             op._set_attr(self._op_device_key, device)
         else:
             other_known_ops = [
-                'update_loss_scaling', 'reduce_any', 'concat', 'sum'
+                'update_loss_scaling',
+                'reduce_any',
+                'concat',
+                'sum',
+                'check_finite_and_unscale',
             ]
             assert op.type in other_known_ops, "For other ops without " \
                 "op_device set, they must be one of {}, but it " \
@@ -4274,41 +4249,70 @@ class PipelineOptimizer(object):
         Insert a pair of send and recv ops for every two
         consecutive ops on different devices.
         """
-        extra_index = 0
+        extra_index_info = {'index': 0}

         # A map from var to device where op takes it as input,
         # avoiding multiple send and recv ops.
-        var_dev_map = dict()
+        input_var_to_device = dict()

         for index, op in enumerate(list(block.ops)):
             cur_device = op.attr(self._op_device_key)
             if cur_device == "gpu:all": continue
             for var_name in op.input_arg_names:
-                # i.e., lod_tensor_blocking_queue created by DataLoader,
-                # which only exists in startup program.
                 var = block.var(var_name)
-                # skip data, because we will process it later
+                # skip data var
                 if var.is_data: continue
                 prev_device = None
-                if var_name in self._param_device_map:
+                generate_ops = self.output_var_to_op.get(var_name)
+                if generate_ops is None:
+                    if var_name not in self._param_device_map:
+                        continue
                     prev_device = self._param_device_map[var_name]
-                prev_op = self._find_real_prev_op(block.ops, op, var_name)
+                prev_op = self._find_prev_op(index, var_name)
                 if not prev_device:
                     prev_device = prev_op.attr(self._op_device_key) \
                         if prev_op else None
-                if not prev_device or prev_device == 'gpu:all': continue
-                if prev_device != cur_device:
-                    if var_name not in var_dev_map: var_dev_map[var_name] = []
-                    if cur_device in var_dev_map[var_name]: continue
-                    var_dev_map[var_name].append(cur_device)
+                if prev_device is None or prev_device == "gpu:all": continue
+                if prev_device == cur_device: continue
+                if var_name not in input_var_to_device:
+                    input_var_to_device[var_name] = []
+                if (cur_device, prev_device) in input_var_to_device[var_name]:
+                    continue
+                device_type = cur_device.split(':')[0] + ':'
+
+                def _insert_send_recv(cur_id, prev_id):
+                    cur_dev = device_type + str(cur_id)
+                    prev_dev = device_type + str(prev_id)
+                    if (cur_dev, prev_dev) in input_var_to_device[var_name]:
+                        return
+                    if cur_id - prev_id > 1:
+                        _insert_send_recv(cur_id - 1, prev_id)
+                        _insert_send_recv(cur_id, cur_id - 1)
+                        input_var_to_device[var_name].append(
+                            (cur_dev, prev_dev))
+                        return
+                    elif cur_id - prev_id < -1:
+                        _insert_send_recv(cur_id + 1, prev_id)
+                        _insert_send_recv(cur_id, cur_id + 1)
+                        input_var_to_device[var_name].append(
+                            (cur_dev, prev_dev))
+                        return
+                    assert abs(cur_id - prev_id) == 1
+                    input_var_to_device[var_name].append((cur_dev, prev_dev))

-                    op_role = op.all_attrs()[self._op_role_key]
+                    op_role = op.attr(self._op_role_key)
                     var = block.vars[var_name]
-                    prev_device_index = int(prev_device.split(':')[1])
-                    cur_device_index = int(cur_device.split(':')[1])
-                    pair = (prev_device_index, cur_device_index)
-                    pair_key = prev_device_index * 1000 + cur_device_index
+                    pair = (prev_id, cur_id)
+                    # 1000 is just a magic number
+                    pair_key = prev_id * 1000 + cur_id
                     if pair not in self._pipeline_pair:
                         self._pipeline_pair.append(pair)
                         self._pp_ring_map[pair_key] = self.ring_id
@@ -4316,89 +4320,95 @@ class PipelineOptimizer(object):
                         self.ring_id += 1
                     else:
                         ring_id = self._pp_ring_map[pair_key]
                     if self.schedule_mode == 'F-then-B':  # F-then-B
                         block._insert_op(
-                            index=index + extra_index,
+                            index=index + extra_index_info['index'],
                             type='send_v2',
                             inputs={'X': var},
                             attrs={
-                                self._op_device_key: prev_device,
+                                self._op_device_key: prev_dev,
                                 self._op_role_key: op_role,
                                 'use_calc_stream': True,
                                 'peer': 1,
                                 'ring_id': ring_id
                             })
-                        extra_index += 1
+                        extra_index_info['index'] += 1
                         block._insert_op(
-                            index=index + extra_index,
+                            index=index + extra_index_info['index'],
                             type='recv_v2',
                             outputs={'Out': [var]},
                             attrs={
                                 'out_shape': var.shape,
                                 'dtype': var.dtype,
-                                self._op_device_key: cur_device,
+                                self._op_device_key: cur_dev,
                                 self._op_role_key: op_role,
                                 'use_calc_stream': True,
                                 'peer': 0,
                                 'ring_id': ring_id
                             })
-                        extra_index += 1
+                        extra_index_info['index'] += 1
                     elif self.schedule_mode == '1F1B':  # 1F1B
                         block._insert_op(
-                            index=index + extra_index,
+                            index=index + extra_index_info['index'],
                             type='c_sync_calc_stream',
                             inputs={'X': [var]},
                             outputs={'Out': [var]},
                             attrs={
-                                self._op_device_key: prev_device,
+                                self._op_device_key: prev_dev,
                                 self._op_role_key: op_role,
                             })
-                        extra_index += 1
+                        extra_index_info['index'] += 1
                         block._insert_op(
-                            index=index + extra_index,
+                            index=index + extra_index_info['index'],
                             type='send_v2',
                             inputs={'X': var},
                             attrs={
-                                self._op_device_key: prev_device,
+                                self._op_device_key: prev_dev,
                                 self._op_role_key: op_role,
                                 'use_calc_stream': False,
                                 'ring_id': ring_id,
                                 'peer': 1,
                             })
-                        extra_index += 1
+                        extra_index_info['index'] += 1
                         block._insert_op(
-                            index=index + extra_index,
+                            index=index + extra_index_info['index'],
                             type='c_sync_comm_stream',
                             inputs={'X': [var]},
                             outputs={'Out': [var]},
                             attrs={
-                                self._op_device_key: prev_device,
+                                self._op_device_key: prev_dev,
                                 self._op_role_key: self._op_role.Backward,
                                 'ring_id': ring_id,
                             })
-                        extra_index += 1
+                        extra_index_info['index'] += 1
                         var_shape = list(var.shape)
                         var_shape[0] = self.micro_batch_size if var_shape[
                             0] < 0 else var_shape[0]
                         block._insert_op(
-                            index=index + extra_index,
+                            index=index + extra_index_info['index'],
                             type='recv_v2',
                             outputs={'Out': [var]},
                             attrs={
                                 'out_shape': var_shape,
                                 'dtype': var.dtype,
-                                self._op_device_key: cur_device,
+                                self._op_device_key: cur_dev,
                                 self._op_role_key: op_role,
                                 'use_calc_stream': True,
                                 'peer': 0,
                                 'ring_id': ring_id
                             })
-                        extra_index += 1
+                        extra_index_info['index'] += 1
                     else:
                         raise ValueError(
                             "Now only 'F-then-B' and '1F1B' are supported."
                             "The given value is {}.".format(self.schedule_mode))
+
+                _insert_send_recv(
+                    int(cur_device.split(':')[1]),
+                    int(prev_device.split(':')[1]))
+        block._sync_with_cpp()

     def _insert_loss_scale(self, block):
         """
         Scale the loss corresponding to number of micro-batches.
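_insert_send_recv is defined per input variable and, when producer and consumer are more than one pipeline stage apart, recursively decomposes the transfer into unit-distance hops, each of which gets its own send/recv pair. A sketch of just that hop planning (recording hops instead of inserting ops; the dedup through input_var_to_device is omitted):

    def plan_send_recv(cur_id, prev_id, hops):
        if cur_id - prev_id > 1:
            plan_send_recv(cur_id - 1, prev_id, hops)
            plan_send_recv(cur_id, cur_id - 1, hops)
            return
        elif cur_id - prev_id < -1:
            plan_send_recv(cur_id + 1, prev_id, hops)
            plan_send_recv(cur_id, cur_id + 1, hops)
            return
        assert abs(cur_id - prev_id) == 1
        hops.append((prev_id, cur_id))  # one send_v2/recv_v2 pair per adjacent hop

    hops = []
    plan_send_recv(3, 0, hops)  # consumer on stage 3, producer on stage 0
    assert hops == [(0, 1), (1, 2), (2, 3)]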
@@ -4675,6 +4685,23 @@ class PipelineOptimizer(object):
         return op.desc.has_attr("op_namescope") \
             and op.desc.attr("op_namescope").startswith("/regularization")

+    def _get_input_output_info(self, block):
+        '''
+        Get info of op input and output.
+        '''
+        # A map from output var to op which generate it.
+        self.output_var_to_op = dict()
+        # A map from var to op which takes it as input.
+        self.input_var_to_op = dict()
+
+        for index, op in enumerate(list(block.ops)):
+            for var_name in op.input_arg_names:
+                ops = self.input_var_to_op.setdefault(var_name, [])
+                ops.append([op, index])
+            for var_name in op.output_arg_names:
+                ops = self.output_var_to_op.setdefault(var_name, [])
+                ops.append([op, index])
+
     def minimize(self,
                  loss,
                  startup_program=None,
@@ -4682,30 +4709,35 @@ class PipelineOptimizer(object):
                  no_grad_set=None):
         main_block = loss.block
         self.origin_main_block = main_block
+        main_program = main_block.program
         if startup_program is None:
             startup_program = default_startup_program()
-        assert main_block.program._pipeline_opt \
-            and 'local_rank' in main_block.program._pipeline_opt, \
-            'Please use pipeline with fleet.'
-        local_rank = main_block.program._pipeline_opt['local_rank']
-        self._global_ring_id = main_block.program._pipeline_opt[
-            'global_ring_id']
-        schedule_mode = 0
-        if 'schedule_mode' in main_block.program._pipeline_opt:
-            schedule_mode = main_block.program._pipeline_opt['schedule_mode']
-        self.schedule_mode = schedule_mode
-        # micro batch size
+
+        optimize_ops, params_grads = self._optimizer.minimize(
+            loss, startup_program, parameter_list, no_grad_set)
+        self._param_device_map = self._origin_optimizer._param_device_map
+
+        assert main_program._pipeline_opt, 'Please use pipeline with fleet.'
+        required_keys = [
+            'local_rank',
+            'schedule_mode',
+            'micro_batch_size',
+            'ring_id',
+            'global_ring_id',
+            'use_sharding',
+        ]
+        for key in required_keys:
+            assert key in main_program._pipeline_opt, \
+                'Please use pipeline with fleet to use {}.'.format(key)
+        self.local_rank = main_block.program._pipeline_opt['local_rank']
+        self.schedule_mode = main_block.program._pipeline_opt['schedule_mode']
         self.micro_batch_size = main_block.program._pipeline_opt[
             'micro_batch_size']
-        self.use_sharding = False
-        if 'use_sharding' in main_block.program._pipeline_opt:
-            self.use_sharding = main_block.program._pipeline_opt[
-                'use_sharding']
+        self.use_sharding = main_block.program._pipeline_opt['use_sharding']
         self.ring_id = main_block.program._pipeline_opt['ring_id']
+        self.global_ring_id = main_block.program._pipeline_opt['global_ring_id']

-        optimize_ops, params_grads = self._optimizer.minimize(
-            loss, startup_program, parameter_list, no_grad_set)
-        self._param_device_map = self._origin_optimizer._param_device_map
-
+        self._get_input_output_info(main_block)
         # Step1: add default op_device attribute for ops.
         self._add_op_device_attr(main_block)
         device_list = self._check_validation(main_block)
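minimize() now insists that fleet hands it a complete _pipeline_opt payload instead of probing for optional keys. An illustrative payload (the key names are the required_keys from the hunk above; the values are placeholders):

    pipeline_opt = {
        'local_rank': 0,
        'schedule_mode': '1F1B',
        'micro_batch_size': 2,
        'ring_id': 20,
        'global_ring_id': 3,
        'use_sharding': False,
    }
    required_keys = ['local_rank', 'schedule_mode', 'micro_batch_size',
                     'ring_id', 'global_ring_id', 'use_sharding']
    for key in required_keys:
        assert key in pipeline_opt, \
            'Please use pipeline with fleet to use {}.'.format(key)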
@@ -4742,20 +4774,20 @@ class PipelineOptimizer(object):
         # Step5: Add sub blocks for section programs
         self._add_sub_blocks(main_block, program_list)

-        local_rank = main_program._pipeline_opt['local_rank'] % len(device_list)
+        self.local_rank %= len(device_list)
         place_list = []
         for dev in device_list:
             dev_index = int(dev.split(":")[1])
-            place_list.append(core.CUDAPlace(dev_index % 8))
+            place_list.append(core.CUDAPlace(0))

         # Step6: Split startup program
         new_startup_program = self._split_startup_program(startup_program,
-                                                          local_rank)
+                                                          self.local_rank)

         startup_program._pipeline_opt = {
             "startup_program": new_startup_program,
         }
-        real_block = program_list[local_rank].global_block()
+        real_block = program_list[self.local_rank].global_block()
         self._insert_loss_scale(real_block)

         if not self.use_sharding:
             # Step7: clear gradients before each mini-batch and
...
@@ -4769,12 +4801,12 @@ class PipelineOptimizer(object):
main_program
.
_pipeline_opt
=
{
main_program
.
_pipeline_opt
=
{
"trainer"
:
"PipelineTrainer"
,
"trainer"
:
"PipelineTrainer"
,
"device_worker"
:
"Section"
,
"device_worker"
:
"Section"
,
"pipeline_stage"
:
local_rank
,
"pipeline_stage"
:
self
.
local_rank
,
"num_pipeline_stages"
:
len
(
device_list
),
"num_pipeline_stages"
:
len
(
device_list
),
"schedule_mode"
:
self
.
schedule_mode
,
"schedule_mode"
:
self
.
schedule_mode
,
"inner_parallelism"
:
len
(
device_list
),
"inner_parallelism"
:
len
(
device_list
),
"section_program"
:
program_list
[
local_rank
],
"section_program"
:
program_list
[
self
.
local_rank
],
"place"
:
place_list
[
local_rank
],
"place"
:
place_list
[
self
.
local_rank
],
"place_id"
:
place_id
,
"place_id"
:
place_id
,
"sync_steps"
:
-
1
,
"sync_steps"
:
-
1
,
"num_microbatches"
:
self
.
_num_microbatches
,
"num_microbatches"
:
self
.
_num_microbatches
,
...
...