Unverified commit 6c2bc29c authored by gongweibao, committed by GitHub

Fix float16 optimizer. (#19682)

Fix float16 optimizer
Parent 713c05dd
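In short, the decorated optimizer's minimize() now takes the same arguments as the base Optimizer.minimize() and returns (optimize_ops, param_grads) instead of (scaled_loss, optimize_ops, param_grads); the scaled loss is retrieved separately through get_loss_scaling(), as the updated docstring and test below show. A minimal sketch of the new call pattern, assuming a fluid 1.x install; the helper name build_and_optimize, the SGD optimizer choice, and the loss argument are illustrative and not part of this commit:

import paddle.fluid as fluid

def build_and_optimize(loss):
    # `loss` stands for any fp32 loss Variable built from an fp16 network.
    optimizer = fluid.optimizer.SGD(learning_rate=0.01)
    mp_optimizer = fluid.contrib.mixed_precision.decorate(
        optimizer=optimizer, init_loss_scaling=8.0)

    # Before this commit: scaled_loss, _, _ = mp_optimizer.minimize(loss)
    # After this commit, minimize() mirrors the base Optimizer API:
    optimize_ops, param_grads = mp_optimizer.minimize(loss)
    scaled_loss = mp_optimizer.get_loss_scaling()
    return optimize_ops, param_grads, scaled_loss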
@@ -505,7 +505,7 @@ paddle.fluid.contrib.HDFSClient.upload (ArgSpec(args=['self', 'hdfs_path', 'loca
 paddle.fluid.contrib.multi_download (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)), ('document', '100927be598ed8f9eaa1f3ef1b23568a'))
 paddle.fluid.contrib.multi_upload (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)), ('document', '183f34c83d30dbe16e09e8716c41958a'))
 paddle.fluid.contrib.extend_with_decoupled_weight_decay (ArgSpec(args=['base_optimizer'], varargs=None, keywords=None, defaults=None), ('document', 'a1095dfd4ec725747f662d69cd7659d4'))
-paddle.fluid.contrib.mixed_precision.decorate (ArgSpec(args=['optimizer', 'amp_lists', 'init_loss_scaling', 'incr_every_n_steps', 'decr_every_n_nan_or_inf', 'incr_ratio', 'decr_ratio', 'use_dynamic_loss_scaling'], varargs=None, keywords=None, defaults=(None, 1.0, 1000, 2, 2.0, 0.8, False)), ('document', 'd05e71f5b0bd6d92bb94e70e00b3f9cf'))
+paddle.fluid.contrib.mixed_precision.decorate (ArgSpec(args=['optimizer', 'amp_lists', 'init_loss_scaling', 'incr_every_n_steps', 'decr_every_n_nan_or_inf', 'incr_ratio', 'decr_ratio', 'use_dynamic_loss_scaling'], varargs=None, keywords=None, defaults=(None, 1.0, 1000, 2, 2.0, 0.8, False)), ('document', '5f118631fc8632afb981b3a26daae731'))
 paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists ('paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists', ('document', 'c116ec6bb5d30998792daea8db21ee40'))
 paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists.__init__ (ArgSpec(args=['self', 'custom_white_list', 'custom_black_list'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.fused_elemwise_activation (ArgSpec(args=['x', 'y', 'functor_list', 'axis', 'scale', 'save_intermediate_out'], varargs=None, keywords=None, defaults=(-1, 0.0, True)), ('document', '1c4b247a2858cea8d9d8750693688270'))
......
@@ -172,21 +172,34 @@ class OptimizerWithMixedPrecison(object):
         return optimize_ops
 
-    def minimize(self, loss):
+    def minimize(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None):
         """
         Perform optimization by minimizing the given loss.
 
         Args:
             loss (Variable): The loss Variable.
+            startup_program (Program): startup_program for initializing parameters
+                in `parameter_list`.
+            parameter_list (list): list of Variables to update.
+            no_grad_set (set|None): set of Variables should be ignored.
 
         Returns:
             The scaled loss by scaling factor, the list of optimize ops, and a
             list of scaled parameters and gradients.
         """
-        scaled_params_grads, scaled_loss = self.backward(loss)
+        scaled_params_grads, scaled_loss = self.backward(
+            loss,
+            startup_program=startup_program,
+            parameter_list=parameter_list,
+            no_grad_set=no_grad_set)
 
         optimize_ops = self.apply_gradients(scaled_params_grads)
 
-        return scaled_loss, optimize_ops, scaled_params_grads
+        return optimize_ops, scaled_params_grads
 
 
 def decorate(optimizer,
@@ -228,7 +241,8 @@ def decorate(optimizer,
             mp_optimizer = fluid.contrib.mixed_precision.decorate(
                       optimizer=optimizer, init_loss_scaling=8.0)
 
-            scaled_loss, _, _ = mp_optimizer.minimize(loss)
+            ops, param_grads = mp_optimizer.minimize(loss)
+            scaled_loss = mp_optimizer.get_loss_scaling()
     """
     if amp_lists is None:
         amp_lists = AutoMixedPrecisionLists()
......
@@ -138,7 +138,8 @@ def train(net_type, use_cuda, save_dirname, is_local):
             init_loss_scaling=8.0,
             use_dynamic_loss_scaling=True)
 
-        scaled_loss, _, _ = mp_optimizer.minimize(avg_cost)
+        mp_optimizer.minimize(avg_cost)
+        scaled_loss = mp_optimizer.get_loss_scaling()
 
     BATCH_SIZE = 128
     PASS_NUM = 1
......
@@ -23,6 +23,7 @@ from paddle.fluid.optimizer import SGD
 from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
 from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase
 from paddle.fluid.incubate.fleet.base.role_maker import UserDefinedRoleMaker
+from paddle.fluid.contrib.mixed_precision.decorator import OptimizerWithMixedPrecison
 
 
 class Mode:
@@ -257,7 +258,8 @@ class DistributedOptimizer(object):
     __metaclass__ = abc.ABCMeta
 
     def __init__(self, optimizer, strategy=None):
-        if not isinstance(optimizer, SGD.__bases__):
+        if not isinstance(optimizer, SGD.__bases__) \
+                and not isinstance(optimizer, OptimizerWithMixedPrecison):
             raise TypeError("optimizer must be an instance of Optimizer")
 
         self._optimizer = optimizer
......
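The type check above relies on SGD.__bases__ being a tuple containing the base Optimizer class, and isinstance() accepts such a tuple directly; OptimizerWithMixedPrecison derives from object rather than Optimizer (see the hunk header earlier), so it needs its own clause. A pure-Python sketch with stand-in classes, not the real fluid ones:

class Optimizer(object):
    pass

class SGD(Optimizer):
    pass

class OptimizerWithMixedPrecison(object):
    # Wraps an Optimizer instead of inheriting from it, as in the fluid contrib class.
    pass

print(SGD.__bases__)                                            # a tuple containing Optimizer
print(isinstance(SGD(), SGD.__bases__))                         # True: real Optimizer subclasses pass
print(isinstance(OptimizerWithMixedPrecison(), SGD.__bases__))  # False: hence the added clause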
@@ -347,7 +347,10 @@ class CollectiveOptimizer(DistributedOptimizer):
                                     self._strategy)
 
         optimize_ops, param_grads = self._optimizer.minimize(
-            loss, startup_program, parameter_list, no_grad_set)
+            loss,
+            startup_program=startup_program,
+            parameter_list=parameter_list,
+            no_grad_set=no_grad_set)
 
         fleet._origin_program = main_program
         fleet.main_program = self._try_to_compile(startup_program, main_program)
......
@@ -464,6 +464,8 @@ class Optimizer(object):
         Examples:
             See examples in `apply_gradients`.
         """
+        no_grad_set = self._get_no_grad_set(loss, no_grad_set)
+
         self._dtype = loss.dtype
         if framework.in_dygraph_mode():
             if parameter_list is not None:
@@ -563,6 +565,23 @@ class Optimizer(object):
         optimize_ops = self.apply_gradients(params_grads)
         return optimize_ops
 
+    def _get_no_grad_set(self, loss, no_grad_set=None):
+        if no_grad_set is None:
+            no_grad_set = set()
+        elif isinstance(no_grad_set, set) or isinstance(
+                no_grad_set, list) or isinstance(no_grad_set, tuple):
+            no_grad_set = set(no_grad_set)
+        else:
+            assert "no_grad_set should be a set, but the passed type is {}".format(
+                type(no_grad_set))
+        parameters = loss.block.program.global_block().all_parameters()
+        param_no_trainable = set(
+            [param.name for param in parameters if param.trainable is False])
+        # If the parameter is no trainable, it should not have a gradient.
+        no_grad_set.update(param_no_trainable)
+
+        return no_grad_set
+
     @imperative_base.no_grad
     def minimize(self,
                  loss,
@@ -589,19 +608,6 @@ class Optimizer(object):
             and list of (param, grad) Variables pair for optimization.
         """
         assert isinstance(loss, Variable), "The loss should be an Variable."
-        if no_grad_set is None:
-            no_grad_set = set()
-        elif isinstance(no_grad_set, set) or isinstance(
-                no_grad_set, list) or isinstance(no_grad_set, tuple):
-            no_grad_set = set(no_grad_set)
-        else:
-            assert "no_grad_set should be a set, but the passed type is {}".format(
-                type(no_grad_set))
-        parameters = loss.block.program.global_block().all_parameters()
-        param_no_trainable = set(
-            [param.name for param in parameters if param.trainable is False])
-        # If the parameter is no trainable, it should not have a gradient.
-        no_grad_set.update(param_no_trainable)
         params_grads = self.backward(
             loss,
             startup_program=startup_program,
......
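The inline no_grad_set handling was removed from Optimizer.minimize() and factored into the _get_no_grad_set() helper called in the first optimizer.py hunk above. Note that the `assert "..."` in its unsupported-type branch always passes, since a non-empty string is truthy, so an unsupported type is accepted silently rather than rejected. Below is a standalone sketch of the same normalization that raises a TypeError instead; it is an illustrative variant, not the library method, and non_trainable_names stands in for the names gathered from loss.block.program.global_block().all_parameters():

def normalize_no_grad_set(no_grad_set, non_trainable_names):
    # Accept None, set, list, or tuple, as _get_no_grad_set does.
    if no_grad_set is None:
        no_grad_set = set()
    elif isinstance(no_grad_set, (set, list, tuple)):
        no_grad_set = set(no_grad_set)
    else:
        # Stricter than the diff's always-true assert: reject unsupported types.
        raise TypeError("no_grad_set should be a set, but the passed type is {}"
                        .format(type(no_grad_set)))
    # Parameters that are not trainable should not receive gradients.
    no_grad_set.update(non_trainable_names)
    return no_grad_set

print(normalize_no_grad_set(["fc_0.b_0"], {"conv2d_0.w_0"}))
# -> {'fc_0.b_0', 'conv2d_0.w_0'} (set order may vary)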