Commit 03babe17 (unverified)
Authored Feb 26, 2021 by WangXi; committed by GitHub on Feb 26, 2021
Fleet distributed strategy support pure fp16 (#30754) (#31238)
Parent: 188bcbb7
Showing 13 changed files with 178 additions and 20 deletions (+178, -20):
- paddle/fluid/framework/distributed_strategy.proto (+2, -0)
- python/paddle/distributed/fleet/base/distributed_strategy.py (+24, -1)
- python/paddle/distributed/fleet/base/fleet_base.py (+16, -2)
- python/paddle/distributed/fleet/base/strategy_compiler.py (+3, -0)
- python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py (+10, -1)
- python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py (+3, -1)
- python/paddle/fluid/tests/unittests/CMakeLists.txt (+2, -0)
- python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py (+15, -0)
- python/paddle/fluid/tests/unittests/test_fleet_amp_init.py (+68, -14)
- python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py (+15, -0)
- python/paddle/fluid/tests/unittests/test_fleet_base_single.py (+2, -0)
- python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py (+17, -0)
- python/paddle/optimizer/adam.py (+1, -1)
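Taken together, the change wires the new use_pure_fp16 / use_fp16_guard knobs from DistributedStrategy.amp_configs through the AMP meta optimizer and exposes amp_init on the fleet-wrapped optimizer. A minimal usage sketch under those assumptions follows; the toy network and the single-GPU setup are illustrative, not part of the commit, and a CUDA build is assumed.

    import paddle
    import paddle.distributed.fleet as fleet

    paddle.enable_static()
    fleet.init(is_collective=True)

    # Illustrative toy network; any static-graph model works here.
    x = paddle.static.data(name="x", shape=[None, 32], dtype='float32')
    y = paddle.static.data(name="y", shape=[None, 1], dtype='int64')
    logits = paddle.static.nn.fc(x, size=2)
    cost = paddle.mean(paddle.nn.functional.softmax_with_cross_entropy(logits, y))

    optimizer = paddle.optimizer.Momentum(
        learning_rate=0.001, momentum=0.9, multi_precision=True)

    strategy = fleet.DistributedStrategy()
    strategy.amp = True
    strategy.amp_configs = {
        "init_loss_scaling": 32768,
        "use_pure_fp16": True,    # new field added by this commit
        "use_fp16_guard": False,  # new field added by this commit
    }

    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(cost)

    place = paddle.CUDAPlace(0)  # requires a GPU build
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())
    # Cast/initialize FP16 parameters before the first training step.
    optimizer.amp_init(place)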
paddle/fluid/framework/distributed_strategy.proto

@@ -44,6 +44,8 @@ message AMPConfig {
   repeated string custom_white_list = 7;
   repeated string custom_black_list = 8;
   repeated string custom_black_varnames = 9;
+  optional bool use_pure_fp16 = 10 [ default = false ];
+  optional bool use_fp16_guard = 11 [ default = true ];
 }

 message LocalSGDConfig {
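The proto defaults surface directly on the Python side: a freshly created strategy reports the new fields with the defaults declared above. A quick check, assuming the amp_configs getter keeps round-tripping through get_msg_dict as in the diff below:

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    print(strategy.amp_configs["use_pure_fp16"])   # False, per the proto default
    print(strategy.amp_configs["use_fp16_guard"])  # True, per the proto default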
python/paddle/distributed/fleet/base/distributed_strategy.py

@@ -49,6 +49,9 @@ def assign_configs_value(msg, config):
     for key in config:
         for f in fields:
             if key == f.name:
+                # LABEL_OPTIONAL = 1
+                # LABEL_REPEATED = 3
+                # LABEL_REQUIRED = 2
                 if f.label == 3:
                     getattr(msg, f.name).extend(config[f.name])
                 elif f.label == 1 or f.label == 2:

@@ -366,7 +369,14 @@ class DistributedStrategy(object):
             custom_black_list(list[str]): Users' custom black list which forbidden execution fp16.
             custom_black_varnames(list[str]): Users' custom black varibles' names.
+            use_pure_fp16(bool): Whether to use the pure fp16 training. Default False.
+            use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
+                   Default True. Only takes effect when `use_pure_fp16` is turned on.

-        Examples:
+        Examples 1:

         .. code-block:: python

@@ -376,6 +386,19 @@ class DistributedStrategy(object):
             strategy.amp_configs = {
                 "init_loss_scaling": 32768,
                 "custom_white_list": ['conv2d']}
+
+        Examples 2:
+
+        .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.amp = True
+            # pure fp16
+            strategy.amp_configs = {
+                "init_loss_scaling": 32768,
+                "use_pure_fp16": True
+            }
         """
         return get_msg_dict(self.strategy.amp_configs)
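The LABEL_* comments added above document how assign_configs_value dispatches on a field's protobuf label: repeated fields (label 3) are extended with the supplied list, while optional/required scalars (labels 1 and 2) are assigned directly. A small illustration of how that distinction looks from the user side (a sketch, not test code from the commit):

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.amp = True
    strategy.amp_configs = {
        "custom_white_list": ['softmax', 'gelu'],  # repeated field -> extend()
        "use_pure_fp16": True,                     # optional scalar -> direct assignment
    }
    print(strategy.amp_configs["custom_white_list"])
    print(strategy.amp_configs["use_pure_fp16"])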
python/paddle/distributed/fleet/base/fleet_base.py

@@ -196,6 +196,7 @@ class Fleet(object):
         else:
             if isinstance(role_maker, RoleMakerBase):
                 self._role_maker = role_maker
+                self._is_collective = role_maker._is_collective
             else:
                 raise ValueError(
                     "`role_maker` should be subclass of `RoleMakerBase`, but got {}".

@@ -1022,9 +1023,22 @@ class Fleet(object):
             if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
                 run_example_code()
         """
         # imitate target optimizer retrieval
-        return self.user_defined_optimizer.amp_init(place, scope, test_program,
-                                                    use_fp16_test)
+        amp_optimizer = None
+        for optimizer in self.strategy_compiler._get_applied_meta_optimizer():
+            if hasattr(optimizer, 'amp_init'):
+                amp_optimizer = optimizer
+                break
+
+        if amp_optimizer is None:
+            if hasattr(self.user_defined_optimizer, 'amp_init'):
+                amp_optimizer = self.user_defined_optimizer
+
+        assert amp_optimizer is not None, \
+            "amp_init can only be used when the amp(auto mixed precision) strategy is turned on."
+
+        return amp_optimizer.amp_init(place, scope, test_program,
+                                      use_fp16_test)

     def _final_strategy(self):
         if "valid_strategy" not in self._context:
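The lookup above means amp_init now works whether mixed precision was applied by the fleet AMP meta optimizer or by the user wrapping the optimizer manually before handing it to fleet. A sketch of the two paths; the manual path assumes paddle.static.amp.decorate, as in the updated unit test below.

    import paddle
    import paddle.distributed.fleet as fleet

    paddle.enable_static()
    fleet.init(is_collective=True)
    opt = paddle.optimizer.Momentum(
        learning_rate=0.001, momentum=0.9, multi_precision=True)

    # Path 1 (new): let the fleet AMP meta optimizer handle mixed precision.
    strategy = fleet.DistributedStrategy()
    strategy.amp = True
    strategy.amp_configs = {"use_pure_fp16": True}
    dist_opt = fleet.distributed_optimizer(opt, strategy)

    # Path 2 (previous fallback): decorate the optimizer yourself, then hand it to fleet.
    #   opt = paddle.static.amp.decorate(opt, use_pure_fp16=True)
    #   dist_opt = fleet.distributed_optimizer(opt)

    # After dist_opt.minimize(loss), the same call reaches whichever wrapper
    # actually owns amp_init:
    #   dist_opt.amp_init(paddle.CUDAPlace(0))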
python/paddle/distributed/fleet/base/strategy_compiler.py

@@ -129,6 +129,9 @@ class StrategyCompiler(StrategyCompilerBase):
         self._meta_optimizer_candidates = []
         self._graph_optimizer_candidates = []

+    def _get_applied_meta_optimizer(self):
+        return self._meta_optimizers
+
     def _get_applied_meta_list(self):
         return [type(opt).__name__ for opt in self._meta_optimizers]
python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py

@@ -50,7 +50,8 @@ class AMPOptimizer(MetaOptimizerBase):
             self.inner_opt, amp_lists, config['init_loss_scaling'],
             config['incr_every_n_steps'], config['decr_every_n_nan_or_inf'],
             config['incr_ratio'], config['decr_ratio'],
-            config['use_dynamic_loss_scaling'])
+            config['use_dynamic_loss_scaling'], config['use_pure_fp16'],
+            config['use_fp16_guard'])

         # if worker_num > 1, all cards will communication with each other,
         # add is_distributed to optimize amp, overlap communication and

@@ -112,3 +113,11 @@ class AMPOptimizer(MetaOptimizerBase):
             self.wrapped_opt.minimize(loss, startup_program,
                                       parameter_list, no_grad_set)
         return optimize_ops, params_grads
+
+    def amp_init(self,
+                 place,
+                 scope=None,
+                 test_program=None,
+                 use_fp16_test=False):
+        return self.wrapped_opt.amp_init(place, scope, test_program,
+                                         use_fp16_test)
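The two extra arguments forwarded above correspond to the use_pure_fp16 and use_fp16_guard options of the static-graph AMP decorator that this meta optimizer wraps. Outside of fleet, roughly the same effect can be had by decorating directly; a hedged sketch of that call, with keyword names as I understand the decorator's signature at this point in time:

    import paddle

    paddle.enable_static()
    opt = paddle.optimizer.Momentum(
        learning_rate=0.001, momentum=0.9, multi_precision=True)
    opt = paddle.static.amp.decorate(
        opt,
        init_loss_scaling=32768,
        use_dynamic_loss_scaling=True,
        use_pure_fp16=True,    # cast the whole program to FP16 where safe
        use_fp16_guard=False)  # True would limit casting to fp16_guard regions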
python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py

@@ -165,7 +165,9 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
         main_program._hierarchical_allreduce_inter_nranks = \
             local_build_strategy.hierarchical_allreduce_inter_nranks

         # TODO(guru4elephant): should be an independent optimizer
-        self._setup_nccl_op(startup_program, main_program, local_build_strategy)
+        if worker_num > 1:
+            self._setup_nccl_op(startup_program, main_program,
+                                local_build_strategy)

         local_build_strategy.num_trainers = self.role_maker._worker_num()
         local_build_strategy.trainer_id = self.role_maker._worker_index()
python/paddle/fluid/tests/unittests/CMakeLists.txt

@@ -48,6 +48,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_3)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_recompute_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_init)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_sharding_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer)

@@ -506,6 +507,7 @@ if(WITH_DISTRIBUTE)
     py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS})
     py_test_modules(test_fleet_sharding_meta_optimizer MODULES test_fleet_sharding_meta_optimizer ENVS ${dist_ENVS})
     py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS})
+    py_test_modules(test_fleet_amp_init MODULES test_fleet_amp_init ENVS ${dist_ENVS})
     py_test_modules(test_fleet_fp16_allreduce_meta_optimizer MODULES test_fleet_fp16_allreduce_meta_optimizer ENVS ${dist_ENVS})
     py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS})
     py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS})
python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py

@@ -88,6 +88,21 @@ class TestFleetMetaOptimizer(unittest.TestCase):
                 "custom_white_list": ['softmax'],
                 "custom_black_list": ['tanh'],
             }
+        elif name == 'pure_fp16':
+            strategy.amp = True
+            strategy.amp_configs = {
+                "init_loss_scaling": 32768,
+                "decr_every_n_nan_or_inf": 2,
+                "incr_every_n_steps": 1000,
+                "incr_ratio": 2.0,
+                "use_dynamic_loss_scaling": True,
+                "decr_ratio": 0.5,
+                "custom_white_list": ['softmax'],
+                "custom_black_list": ['tanh'],
+                "use_pure_fp16": True,
+                "use_fp16_guard": False,
+            }
         elif name == 'dgc':
             strategy.dgc = True
             strategy.dgc_configs = {
python/paddle/fluid/tests/unittests/test_fleet_amp_init.py

@@ -46,34 +46,88 @@ class TestFleetAMPInit(unittest.TestCase):
     def test_fleet_amp_init(self):
         if not fluid.core.is_compiled_with_cuda():
             return

-        input_x = paddle.static.data(name="x", shape=[None, 32], dtype='float32')
-        input_y = paddle.static.data(name="y", shape=[None, 1], dtype='int64')
-
-        cost = mlp(input_x, input_y)
-        optimizer = paddle.optimizer.Momentum(
-            learning_rate=0.001,
-            momentum=0.9,
-            weight_decay=fluid.regularizer.L2Decay(1e-4),
-            multi_precision=True)
+        main_program = paddle.static.Program()
+        startup_program = paddle.static.Program()

         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)

-        optimizer = paddle.static.amp.decorate(optimizer)
-        optimizer = fleet.distributed_optimizer(optimizer)
-        optimizer.minimize(cost)
+        with paddle.static.program_guard(main_program, startup_program):
+            input_x = paddle.static.data(name="x", shape=[None, 32], dtype='float32')
+            input_y = paddle.static.data(name="y", shape=[None, 1], dtype='int64')
+
+            cost = mlp(input_x, input_y)
+            optimizer = paddle.optimizer.Momentum(
+                learning_rate=0.001,
+                momentum=0.9,
+                weight_decay=fluid.regularizer.L2Decay(1e-4),
+                multi_precision=True)
+
+            optimizer = paddle.static.amp.decorate(optimizer)
+            optimizer = fleet.distributed_optimizer(optimizer)
+            optimizer.minimize(cost)

         place = paddle.CUDAPlace(0)

         exe = paddle.static.Executor(place)
-        exe.run(paddle.static.default_startup_program())
+        exe.run(startup_program)
         optimizer.amp_init(place)

         step = 1
         for i in range(step):
-            cost_val = exe.run(program=paddle.static.default_main_program(),
+            cost_val = exe.run(program=main_program,
                                feed=gen_data(),
                                fetch_list=[cost.name])

+    def test_fleet_amp_meta_optimizer_init(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+
+        main_program = paddle.static.Program()
+        startup_program = paddle.static.Program()
+
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+
+        with paddle.static.program_guard(main_program, startup_program):
+            input_x = paddle.static.data(name="x", shape=[None, 32], dtype='float32')
+            input_y = paddle.static.data(name="y", shape=[None, 1], dtype='int64')
+
+            cost = mlp(input_x, input_y)
+            optimizer = paddle.optimizer.Momentum(
+                learning_rate=0.001,
+                momentum=0.9,
+                weight_decay=fluid.regularizer.L2Decay(1e-4),
+                multi_precision=True)
+
+            strategy = paddle.distributed.fleet.DistributedStrategy()
+            strategy.amp = True
+            strategy.amp_configs = {'use_pure_fp16': True}
+            strategy.gradient_merge = True
+            strategy.gradient_merge_configs = {"k_steps": 2}
+
+            optimizer = fleet.distributed_optimizer(optimizer, strategy)
+            optimizer.minimize(cost)
+
+        print(fleet._get_applied_meta_list())
+
+        place = paddle.CUDAPlace(0)
+
+        exe = paddle.static.Executor(place)
+        exe.run(startup_program)
+        optimizer.amp_init(place)
+
+        step = 3
+        for i in range(step):
+            cost_val = exe.run(program=main_program,
+                               feed=gen_data(),
+                               fetch_list=[cost.name])
+            print(cost_val)


 if __name__ == '__main__':
python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py

@@ -93,6 +93,21 @@ class TestFleetAMPOptimizer(TestFleetMetaOptimizer):
         self.assertIn('cast', ops)
         self.assertIn('check_finite_and_unscale', ops)

+    def test_pure_fp16_optimizer(self):
+        """ test pure fp16 """
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'pure_fp16')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
+
+        params = train_prog.all_parameters()
+        for param in train_prog.all_parameters():
+            self.assertEqual(param.dtype, fluid.core.VarDesc.VarType.FP16)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        self.assertIn('cast', ops)
+        self.assertIn('check_finite_and_unscale', ops)
+
     def test_amp_distributed_optimizer(self):
         """ test amp when distributed """
         train_prog, startup_prog = fluid.Program(), fluid.Program()
python/paddle/fluid/tests/unittests/test_fleet_base_single.py

@@ -78,6 +78,7 @@ class TestFleetBaseSingleRunCollective(unittest.TestCase):
         }

     def test_single_run_collective_minimize(self):
+        paddle.enable_static()
         input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32')
         input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')

@@ -114,6 +115,7 @@ class TestFleetBaseSingleRunPS(unittest.TestCase):
         }

     def test_single_run_ps_minimize(self):
+        paddle.enable_static()
         input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32')
         input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py

@@ -53,8 +53,25 @@ class TestFleetGradientMergeMetaOptimizer(TestFleetMetaOptimizer):
         self.set_strategy(strategy, 'gradient_merge')
         self.set_strategy(strategy, 'amp')
         self.optimizer(avg_cost, strategy, train_prog, startup_prog)

         vars = [x.name for x in train_prog.list_vars()]
         self.assertIn('@GradientMerge', ''.join(vars))
         self.assertIn('cast', ''.join(vars))

+    def test_gm_pure_fp16_optimizer(self):
+        train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program(
+        )
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'gradient_merge')
+        self.set_strategy(strategy, 'pure_fp16')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
+        print(train_prog)
+
+        params = train_prog.all_parameters()
+        for param in train_prog.all_parameters():
+            self.assertEqual(param.dtype,
+                             paddle.fluid.core.VarDesc.VarType.FP16)
+
+        vars = [x.name for x in train_prog.list_vars()]
+        self.assertIn('@GradientMerge', ''.join(vars))
+        self.assertIn('cast', ''.join(vars))
python/paddle/optimizer/adam.py

@@ -244,7 +244,7 @@ class Adam(Optimizer):
                 if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision:
                     warnings.warn(
                         "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
-                        "Consider using multi_precision=True option of the Momentum optimizer."
+                        "Consider using multi_precision=True option of the Adam optimizer."
                     )
                     self._add_moments_pows(p)
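The reworded warning points users at Adam's own multi_precision switch, which keeps FP32 master weights alongside FP16 parameters. A one-line sketch of the suggested fix:

    import paddle

    # Keeps FP32 master copies of FP16 parameters, avoiding the warning above.
    opt = paddle.optimizer.Adam(learning_rate=1e-3, multi_precision=True)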