Unverified commit 03babe17
Authored on Feb 26, 2021 by WangXi; committed via GitHub on Feb 26, 2021
Fleet distributed strategy support pure fp16 (#30754) (#31238)
Parent: 188bcbb7

Showing 13 changed files with 178 additions and 20 deletions (+178 −20)
paddle/fluid/framework/distributed_strategy.proto  +2 −0
python/paddle/distributed/fleet/base/distributed_strategy.py  +24 −1
python/paddle/distributed/fleet/base/fleet_base.py  +16 −2
python/paddle/distributed/fleet/base/strategy_compiler.py  +3 −0
python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py  +10 −1
python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py  +3 −1
python/paddle/fluid/tests/unittests/CMakeLists.txt  +2 −0
python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py  +15 −0
python/paddle/fluid/tests/unittests/test_fleet_amp_init.py  +68 −14
python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py  +15 −0
python/paddle/fluid/tests/unittests/test_fleet_base_single.py  +2 −0
python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py  +17 −0
python/paddle/optimizer/adam.py  +1 −1
paddle/fluid/framework/distributed_strategy.proto
@@ -44,6 +44,8 @@ message AMPConfig {
   repeated string custom_white_list = 7;
   repeated string custom_black_list = 8;
   repeated string custom_black_varnames = 9;
+  optional bool use_pure_fp16 = 10 [ default = false ];
+  optional bool use_fp16_guard = 11 [ default = true ];
 }

 message LocalSGDConfig {
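On the Python side these two new AMPConfig fields surface through DistributedStrategy.amp_configs, whose getter serializes the message into a dict (see the next file). A minimal sketch of reading and overriding the new defaults, assuming a Paddle build where paddle.distributed.fleet is importable:

# Minimal sketch (not part of this commit): the new proto fields appear as
# plain dict keys on the Python DistributedStrategy object.
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.amp = True

configs = strategy.amp_configs           # dict built from the AMPConfig message
print(configs["use_pure_fp16"])          # expected: False (proto default)
print(configs["use_fp16_guard"])         # expected: True  (proto default)

# Opting in to pure fp16 only requires the new keys:
strategy.amp_configs = {"use_pure_fp16": True, "use_fp16_guard": False}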
python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -49,6 +49,9 @@ def assign_configs_value(msg, config):
     for key in config:
         for f in fields:
             if key == f.name:
+                # LABEL_OPTIONAL = 1
+                # LABEL_REPEATED = 3
+                # LABEL_REQUIRED = 2
                 if f.label == 3:
                     getattr(msg, f.name).extend(config[f.name])
                 elif f.label == 1 or f.label == 2:

@@ -366,7 +369,14 @@ class DistributedStrategy(object):
             custom_black_list(list[str]): Users' custom black list which forbidden execution fp16.
-        Examples:
+            custom_black_varnames(list[str]): Users' custom black varibles' names.
+            use_pure_fp16(bool): Whether to use the pure fp16 training. Default False.
+            use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
+                   Default True. Only takes effect when `use_pure_fp16` is turned on.
+
+        Examples 1:
+
             .. code-block:: python

@@ -376,6 +386,19 @@ class DistributedStrategy(object):
                 strategy.amp_configs = {
                     "init_loss_scaling": 32768,
                     "custom_white_list": ['conv2d']}
+
+        Examples 2:
+
+            .. code-block:: python
+
+                import paddle.distributed.fleet as fleet
+                strategy = fleet.DistributedStrategy()
+                strategy.amp = True
+                # pure fp16
+                strategy.amp_configs = {
+                    "init_loss_scaling": 32768,
+                    "use_pure_fp16": True
+                }
         """
         return get_msg_dict(self.strategy.amp_configs)
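The comments added to assign_configs_value record the protobuf label codes the function dispatches on, which is what lets a scalar key such as use_pure_fp16 and a repeated key such as custom_white_list share one assignment path. An illustrative sketch of that dispatch (the function and argument names here are stand-ins, not the commit's code):

# Illustrative sketch of the repeated-vs-scalar dispatch documented by the new
# comments; `msg` is any protobuf message instance, `config` a plain dict.
def assign_configs_value_sketch(msg, config):
    for f in msg.DESCRIPTOR.fields:
        if f.name not in config:
            continue
        # LABEL_OPTIONAL = 1, LABEL_REQUIRED = 2, LABEL_REPEATED = 3
        if f.label == 3:
            # repeated fields (e.g. custom_white_list) are extended element-wise
            getattr(msg, f.name).extend(config[f.name])
        elif f.label in (1, 2):
            # optional/required scalars (e.g. use_pure_fp16) are set directly
            setattr(msg, f.name, config[f.name])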
python/paddle/distributed/fleet/base/fleet_base.py
@@ -196,6 +196,7 @@ class Fleet(object):
         else:
             if isinstance(role_maker, RoleMakerBase):
                 self._role_maker = role_maker
+                self._is_collective = role_maker._is_collective
             else:
                 raise ValueError(
                     "`role_maker` should be subclass of `RoleMakerBase`, but got {}".

@@ -1022,9 +1023,22 @@ class Fleet(object):
                 if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
                     run_example_code()
         """
         # imitate target optimizer retrieval
-        return self.user_defined_optimizer.amp_init(place, scope, test_program,
-                                                    use_fp16_test)
+        amp_optimizer = None
+        for optimizer in self.strategy_compiler._get_applied_meta_optimizer():
+            if hasattr(optimizer, 'amp_init'):
+                amp_optimizer = optimizer
+                break
+
+        if amp_optimizer is None:
+            if hasattr(self.user_defined_optimizer, 'amp_init'):
+                amp_optimizer = self.user_defined_optimizer
+
+        assert amp_optimizer is not None, \
+            "amp_init can only be used when the amp(auto mixed precision) strategy is turned on."
+
+        return amp_optimizer.amp_init(place, scope, test_program, use_fp16_test)

     def _final_strategy(self):
         if "valid_strategy" not in self._context:
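With this retrieval in place, a fleet user calls amp_init on the distributed optimizer after the startup program has run, and the call is forwarded to whichever applied meta optimizer (or, failing that, the user-defined optimizer) implements it. A condensed sketch of the intended flow on a CUDA build; build_model() is a hypothetical helper that builds a network and returns its loss:

# Condensed sketch of the amp_init flow; build_model() is a hypothetical helper.
import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=True)

loss = build_model()

strategy = fleet.DistributedStrategy()
strategy.amp = True
strategy.amp_configs = {"use_pure_fp16": True}

optimizer = paddle.optimizer.Momentum(learning_rate=0.001, multi_precision=True)
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(loss)

place = paddle.CUDAPlace(0)
exe = paddle.static.Executor(place)
exe.run(paddle.static.default_startup_program())
# Cast/initialize fp16 parameters only after the startup program has run;
# this asserts if the amp strategy was not turned on.
optimizer.amp_init(place)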
python/paddle/distributed/fleet/base/strategy_compiler.py
@@ -129,6 +129,9 @@ class StrategyCompiler(StrategyCompilerBase):
         self._meta_optimizer_candidates = []
         self._graph_optimizer_candidates = []

+    def _get_applied_meta_optimizer(self):
+        return self._meta_optimizers
+
     def _get_applied_meta_list(self):
         return [type(opt).__name__ for opt in self._meta_optimizers]
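_get_applied_meta_optimizer hands Fleet.amp_init the applied meta-optimizer instances, whereas the existing _get_applied_meta_list only reports their class names. A rough sketch of the distinction, assuming fleet.distributed_optimizer(...).minimize(loss) has already populated the compiler (both accessors are internal, underscore-prefixed APIs):

# Rough sketch; assumes minimize() has already been called so the strategy
# compiler holds the applied meta optimizers (internal API).
import paddle.distributed.fleet as fleet

print(fleet._get_applied_meta_list())
# e.g. ['AMPOptimizer', 'GradientMergeOptimizer'] when both strategies are on

applied = fleet.fleet.strategy_compiler._get_applied_meta_optimizer()
amp_capable = [opt for opt in applied if hasattr(opt, 'amp_init')]
# Fleet.amp_init uses the first such instance and falls back to the
# user-defined optimizer when no applied meta optimizer exposes amp_init.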
python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
@@ -50,7 +50,8 @@ class AMPOptimizer(MetaOptimizerBase):
                 self.inner_opt, amp_lists, config['init_loss_scaling'],
                 config['incr_every_n_steps'], config['decr_every_n_nan_or_inf'],
                 config['incr_ratio'], config['decr_ratio'],
-                config['use_dynamic_loss_scaling'])
+                config['use_dynamic_loss_scaling'], config['use_pure_fp16'],
+                config['use_fp16_guard'])

             # if worker_num > 1, all cards will communication with each other,
             # add is_distributed to optimize amp, overlap communication and

@@ -112,3 +113,11 @@ class AMPOptimizer(MetaOptimizerBase):
             self.wrapped_opt.minimize(loss, startup_program,
                                       parameter_list, no_grad_set)
         return optimize_ops, params_grads
+
+    def amp_init(self,
+                 place,
+                 scope=None,
+                 test_program=None,
+                 use_fp16_test=False):
+        return self.wrapped_opt.amp_init(place, scope, test_program,
+                                         use_fp16_test)
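These are the same two flags the wrapped decorate call receives above, so pure fp16 can also be used without fleet by decorating an optimizer directly. A sketch of that path, assuming paddle.static.amp.decorate accepts matching use_pure_fp16 / use_fp16_guard keywords (as the positional arguments above suggest) and that build_model() is a hypothetical helper returning a loss variable:

# Sketch of the single-card path: decorate an optimizer for pure fp16 directly.
# The keyword names below are assumed to mirror the flags AMPOptimizer forwards
# positionally above; build_model() is a hypothetical helper.
import paddle

paddle.enable_static()
loss = build_model()

opt = paddle.optimizer.Momentum(learning_rate=0.001, multi_precision=True)
opt = paddle.static.amp.decorate(
    opt,
    init_loss_scaling=32768,
    use_dynamic_loss_scaling=True,
    use_pure_fp16=True,
    use_fp16_guard=False)
opt.minimize(loss)

place = paddle.CUDAPlace(0)
exe = paddle.static.Executor(place)
exe.run(paddle.static.default_startup_program())
opt.amp_init(place)          # cast/initialize fp16 parameters before training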
python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py
@@ -165,7 +165,9 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
         main_program._hierarchical_allreduce_inter_nranks = \
             local_build_strategy.hierarchical_allreduce_inter_nranks

         # TODO(guru4elephant): should be an independent optimizer
-        self._setup_nccl_op(startup_program, main_program, local_build_strategy)
+        if worker_num > 1:
+            self._setup_nccl_op(startup_program, main_program,
+                                local_build_strategy)

         local_build_strategy.num_trainers = self.role_maker._worker_num()
         local_build_strategy.trainer_id = self.role_maker._worker_index()
python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -48,6 +48,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_base_3)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_recompute_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_init)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_sharding_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer)

@@ -506,6 +507,7 @@ if(WITH_DISTRIBUTE)
     py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS})
     py_test_modules(test_fleet_sharding_meta_optimizer MODULES test_fleet_sharding_meta_optimizer ENVS ${dist_ENVS})
     py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS})
+    py_test_modules(test_fleet_amp_init MODULES test_fleet_amp_init ENVS ${dist_ENVS})
     py_test_modules(test_fleet_fp16_allreduce_meta_optimizer MODULES test_fleet_fp16_allreduce_meta_optimizer ENVS ${dist_ENVS})
     py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS})
     py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS})
python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py
@@ -88,6 +88,21 @@ class TestFleetMetaOptimizer(unittest.TestCase):
                 "custom_white_list": ['softmax'],
                 "custom_black_list": ['tanh'],
             }
+        elif name == 'pure_fp16':
+            strategy.amp = True
+            strategy.amp_configs = {
+                "init_loss_scaling": 32768,
+                "decr_every_n_nan_or_inf": 2,
+                "incr_every_n_steps": 1000,
+                "incr_ratio": 2.0,
+                "use_dynamic_loss_scaling": True,
+                "decr_ratio": 0.5,
+                "custom_white_list": ['softmax'],
+                "custom_black_list": ['tanh'],
+                "use_pure_fp16": True,
+                "use_fp16_guard": False,
+            }
         elif name == 'dgc':
             strategy.dgc = True
             strategy.dgc_configs = {
python/paddle/fluid/tests/unittests/test_fleet_amp_init.py
@@ -46,9 +46,18 @@ class TestFleetAMPInit(unittest.TestCase):
     def test_fleet_amp_init(self):
         if not fluid.core.is_compiled_with_cuda():
             return
-        input_x = paddle.static.data(name="x", shape=[None, 32], dtype='float32')
-        input_y = paddle.static.data(name="y", shape=[None, 1], dtype='int64')
+        main_program = paddle.static.Program()
+        startup_program = paddle.static.Program()
+
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+
+        with paddle.static.program_guard(main_program, startup_program):
+            input_x = paddle.static.data(
+                name="x", shape=[None, 32], dtype='float32')
+            input_y = paddle.static.data(name="y", shape=[None, 1], dtype='int64')

             cost = mlp(input_x, input_y)
             optimizer = paddle.optimizer.Momentum(

@@ -57,23 +66,68 @@ class TestFleetAMPInit(unittest.TestCase):
                 weight_decay=fluid.regularizer.L2Decay(1e-4),
                 multi_precision=True)
-        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
-        fleet.init(role)
-
-        optimizer = paddle.static.amp.decorate(optimizer)
-        optimizer = fleet.distributed_optimizer(optimizer)
-        optimizer.minimize(cost)
+
+            optimizer = paddle.static.amp.decorate(optimizer)
+            optimizer = fleet.distributed_optimizer(optimizer)
+            optimizer.minimize(cost)
+
         place = paddle.CUDAPlace(0)

         exe = paddle.static.Executor(place)
-        exe.run(paddle.static.default_startup_program())
+        exe.run(startup_program)
         optimizer.amp_init(place)

         step = 1
         for i in range(step):
-            cost_val = exe.run(program=paddle.static.default_main_program(),
+            cost_val = exe.run(program=main_program,
                                feed=gen_data(),
                                fetch_list=[cost.name])

+    def test_fleet_amp_meta_optimizer_init(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+
+        main_program = paddle.static.Program()
+        startup_program = paddle.static.Program()
+
+        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
+        fleet.init(role)
+
+        with paddle.static.program_guard(main_program, startup_program):
+            input_x = paddle.static.data(
+                name="x", shape=[None, 32], dtype='float32')
+            input_y = paddle.static.data(name="y", shape=[None, 1], dtype='int64')
+
+            cost = mlp(input_x, input_y)
+            optimizer = paddle.optimizer.Momentum(
+                learning_rate=0.001,
+                momentum=0.9,
+                weight_decay=fluid.regularizer.L2Decay(1e-4),
+                multi_precision=True)
+
+            strategy = paddle.distributed.fleet.DistributedStrategy()
+            strategy.amp = True
+            strategy.amp_configs = {'use_pure_fp16': True}
+            strategy.gradient_merge = True
+            strategy.gradient_merge_configs = {"k_steps": 2}
+
+            optimizer = fleet.distributed_optimizer(optimizer, strategy)
+            optimizer.minimize(cost)
+
+        print(fleet._get_applied_meta_list())
+
+        place = paddle.CUDAPlace(0)
+        exe = paddle.static.Executor(place)
+        exe.run(startup_program)
+        optimizer.amp_init(place)
+
+        step = 3
+        for i in range(step):
+            cost_val = exe.run(program=main_program,
+                               feed=gen_data(),
+                               fetch_list=[cost.name])
+            print(cost_val)

 if __name__ == '__main__':
python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py
@@ -93,6 +93,21 @@ class TestFleetAMPOptimizer(TestFleetMetaOptimizer):
         self.assertIn('cast', ops)
         self.assertIn('check_finite_and_unscale', ops)

+    def test_pure_fp16_optimizer(self):
+        """ test pure fp16 """
+        train_prog, startup_prog = fluid.Program(), fluid.Program()
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'pure_fp16')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
+
+        params = train_prog.all_parameters()
+        for param in train_prog.all_parameters():
+            self.assertEqual(param.dtype, fluid.core.VarDesc.VarType.FP16)
+
+        ops = [op.type for op in avg_cost.block.ops]
+        self.assertIn('cast', ops)
+        self.assertIn('check_finite_and_unscale', ops)
+
     def test_amp_distributed_optimizer(self):
         """ test amp when distributed """
         train_prog, startup_prog = fluid.Program(), fluid.Program()
python/paddle/fluid/tests/unittests/test_fleet_base_single.py
@@ -78,6 +78,7 @@ class TestFleetBaseSingleRunCollective(unittest.TestCase):
         }

     def test_single_run_collective_minimize(self):
+        paddle.enable_static()
         input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32')
         input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')

@@ -114,6 +115,7 @@ class TestFleetBaseSingleRunPS(unittest.TestCase):
         }

     def test_single_run_ps_minimize(self):
+        paddle.enable_static()
         input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32')
         input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
@@ -53,8 +53,25 @@ class TestFleetGradientMergeMetaOptimizer(TestFleetMetaOptimizer):
         self.set_strategy(strategy, 'gradient_merge')
         self.set_strategy(strategy, 'amp')
         self.optimizer(avg_cost, strategy, train_prog, startup_prog)

         vars = [x.name for x in train_prog.list_vars()]
         self.assertIn('@GradientMerge', ''.join(vars))
         self.assertIn('cast', ''.join(vars))

+    def test_gm_pure_fp16_optimizer(self):
+        train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program(
+        )
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'gradient_merge')
+        self.set_strategy(strategy, 'pure_fp16')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
+        print(train_prog)
+
+        params = train_prog.all_parameters()
+        for param in train_prog.all_parameters():
+            self.assertEqual(param.dtype, paddle.fluid.core.VarDesc.VarType.FP16)
+
+        vars = [x.name for x in train_prog.list_vars()]
+        self.assertIn('@GradientMerge', ''.join(vars))
+        self.assertIn('cast', ''.join(vars))
python/paddle/optimizer/adam.py
@@ -244,7 +244,7 @@ class Adam(Optimizer):
             if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision:
                 warnings.warn(
                     "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
-                    "Consider using multi_precision=True option of the Momentum optimizer."
+                    "Consider using multi_precision=True option of the Adam optimizer."
                 )
                 self._add_moments_pows(p)
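The corrected warning now names the right class. A small sketch of the recommended setting, assuming paddle.optimizer.Adam exposes a multi_precision constructor flag like Momentum does (the self._multi_precision check above suggests it does):

# Sketch: keep fp32 master weights while the parameters themselves are fp16,
# which is what the (now correctly worded) warning recommends.
# multi_precision= is assumed to be a constructor flag, mirroring Momentum.
import paddle

paddle.enable_static()
adam = paddle.optimizer.Adam(learning_rate=0.001, multi_precision=True)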