Unverified commit 9f7c66b4, authored by huangxu96 and committed by GitHub

[Cherry-pick] amp related PR cherry pick into Release/2.0 (#30212)

* Optimizer trans momentum (#29597)

* Merge AMP-related functionality of Momentum from paddle.fluid.contrib.optimizer into paddle.optimizer.

* Add unittest for 2.0 Momentum API.

* fix some bugs in weight_decay.

* add alias for fluid.contrib.mixed_precision (#29562)

* add alias for fluid.contrib.mixed_precision

* add static.amp into setup.py.in (#29621)

* add static.amp into setup.py.in

* add unittest for api

* fix a bug in multi_precision_fp16 unittest. (#29756)
Parent 5fe3da39
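For context, the unit test updated in this cherry-pick constructs the 2.0-style Momentum optimizer with the newly exposed AMP options (weight_decay, multi_precision, rescale_grad). Below is a minimal static-graph sketch of that usage; the toy network, variable names, and the batch size of 128 are illustrative assumptions, not part of the commit:

import paddle
import paddle.fluid as fluid

paddle.enable_static()

# A placeholder network so that minimize() has parameters to optimize.
x = paddle.static.data(name='x', shape=[None, 10], dtype='float32')
loss = paddle.mean(paddle.static.nn.fc(x, size=1))

# paddle.optimizer.Momentum now accepts the options moved over from
# paddle.fluid.contrib.optimizer.Momentum.
optimizer = paddle.optimizer.Momentum(
    learning_rate=0.001,
    momentum=0.9,
    weight_decay=fluid.regularizer.L2Decay(1e-4),
    multi_precision=True,    # keep FP32 master weights for FP16 parameters
    rescale_grad=1.0 / 128)  # assumed batch size; the test uses 1.0 / BATCH_SIZE
optimizer.minimize(loss)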
@@ -13,9 +13,14 @@
 # limitations under the License.
 from __future__ import print_function
 from . import decorator
 from .decorator import *
-from .fp16_lists import AutoMixedPrecisionLists
+from . import fp16_lists
+from .fp16_lists import *
+from . import fp16_utils
+from .fp16_utils import *
 __all__ = decorator.__all__
 __all__ += fp16_lists.__all__
+__all__ += fp16_utils.__all__
@@ -20,6 +20,9 @@ from ... import global_scope
 from ...log_helper import get_logger
 import logging
 import numpy as np
+__all__ = ["cast_model_to_fp16", "cast_parameters_to_fp16"]
 _logger = get_logger(
     __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
...
@@ -24,6 +24,7 @@ import unittest
 import os
 import copy
 import numpy as np
+from paddle.static.amp import decorate
 paddle.enable_static()
@@ -138,7 +139,7 @@ def train(net_type, use_cuda, save_dirname, is_local):
     amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
         custom_black_varnames={"loss", "conv2d_0.w_0"})
-    mp_optimizer = fluid.contrib.mixed_precision.decorate(
+    mp_optimizer = decorate(
         optimizer=optimizer,
         amp_lists=amp_lists,
         init_loss_scaling=8.0,
@@ -442,7 +443,7 @@ class TestAmpWithNonIterableDataLoader(unittest.TestCase):
         optimizer = fluid.optimizer.Lamb(learning_rate=0.001)
         amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
             custom_black_varnames={"loss", "conv2d_0.w_0"})
-        mp_optimizer = fluid.contrib.mixed_precision.decorate(
+        mp_optimizer = decorate(
             optimizer=optimizer,
             amp_lists=amp_lists,
             init_loss_scaling=8.0,
...
@@ -19,8 +19,8 @@ import paddle.fluid as fluid
 import contextlib
 import unittest
 import numpy as np
-from paddle.fluid.contrib.mixed_precision.fp16_utils import cast_model_to_fp16
-from paddle.fluid.contrib.mixed_precision.fp16_utils import cast_parameters_to_fp16
+from paddle.static.amp import cast_model_to_fp16
+from paddle.static.amp import cast_parameters_to_fp16
 paddle.enable_static()
@@ -122,11 +122,11 @@ def train(use_pure_fp16=True, use_nesterov=False):
     # Test program
     test_program = train_program.clone(for_test=True)
-    optimizer = fluid.contrib.optimizer.Momentum(
+    optimizer = paddle.optimizer.Momentum(
         learning_rate=0.001,
         momentum=0.9,
         use_nesterov=use_nesterov,
-        regularization=fluid.regularizer.L2Decay(1e-4),
+        weight_decay=fluid.regularizer.L2Decay(1e-4),
         multi_precision=use_pure_fp16,
         rescale_grad=1.0 / BATCH_SIZE)
@@ -155,9 +155,10 @@ def train(use_pure_fp16=True, use_nesterov=False):
             loss, = exe.run(compiled_program,
                             feed=feeder.feed(data),
                             fetch_list=[sum_cost])
+            loss_v = loss[0] if isinstance(loss, np.ndarray) else loss
             print('PassID {0:1}, Train Batch ID {1:04}, train loss {2:2.4}'.
-                  format(pass_id, batch_id + 1, float(loss)))
-            train_loss_list.append(float(loss))
+                  format(pass_id, batch_id + 1, float(loss_v)))
+            train_loss_list.append(float(loss_v))
             if batch_id >= 4: # For speeding up CI
                 test_loss_list = []
...
@@ -17,8 +17,10 @@ from ..fluid import core
 from ..fluid import framework
 from ..fluid.framework import Variable, name_scope
 from ..fluid.layer_helper import LayerHelper
+from ..fluid import unique_name
+from ..fluid import layers
 import paddle.fluid as fluid
+from paddle.fluid.regularizer import L2DecayRegularizer
 __all__ = ["Momentum"]
@@ -62,6 +64,9 @@ class Momentum(Optimizer):
             some derived class of ``GradientClipBase`` . There are three cliping strategies
             ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
             :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false.
+        rescale_grad (float, optional): Multiply the gradient with `rescale_grad` before updating. \
+            Often choose to be ``1.0/batch_size``.
         name (str, optional): The default value is None. Normally there is no need for user
                 to set this property. For more information, please refer to
                 :ref:`api_guide_Name` .
@@ -93,20 +98,33 @@ class Momentum(Optimizer):
                  use_nesterov=False,
                  weight_decay=None,
                  grad_clip=None,
+                 multi_precision=False,
+                 rescale_grad=1.0,
                  name=None):
         if learning_rate is None:
             raise ValueError("learning_rate is not set")
         if momentum is None:
             raise ValueError("momentum is not set")
+        predicate = lambda regular: isinstance(regular, L2DecayRegularizer)
+        py_regular = None if predicate(weight_decay) else weight_decay
         super(Momentum, self).__init__(
             learning_rate=learning_rate,
             parameters=parameters,
-            weight_decay=weight_decay,
+            weight_decay=py_regular,
             grad_clip=grad_clip,
             name=name)
         self.type = "momentum"
         self._momentum = momentum
         self._use_nesterov = bool(use_nesterov)
+        self._regularization_method = ""
+        self._regularization_coeff = 0
+        if (isinstance(weight_decay, L2DecayRegularizer)):
+            self._regularization_method = "l2_decay"
+            self._regularization_coeff = weight_decay._regularization_coeff
+        self._multi_precision = multi_precision
+        self._rescale_grad = rescale_grad
+        self._master_weights = {}
         if framework.in_dygraph_mode():
             self.helper = LayerHelper(self.__class__.__name__)
             for p in parameters:
@@ -116,8 +134,62 @@ class Momentum(Optimizer):
             ).all_parameters()
             self.helper = LayerHelper(self.__class__.__name__)
             for p in all_parameters:
+                if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
+                    master_p = self._create_master_weight(p)
+                    self._add_accumulator(self._velocity_acc_str, master_p)
+                    continue
+                if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision:
+                    warnings.warn(
+                        "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
+                        "Consider using multi_precision=True option of the Momentum optimizer."
+                    )
                 self._add_accumulator(self._velocity_acc_str, p)
+    def _create_master_weight(self, param):
+        assert isinstance(self.helper, LayerHelper)
+        var_name = param.name + "_fp32_master"
+        var_name = unique_name.generate(var_name)
+        var = layers.create_global_var(
+            name=var_name,
+            shape=param.shape,
+            value=0,
+            dtype='float32',
+            persistable=True)
+        block = self.helper.startup_program.global_block()
+        block.append_op(
+            type="cast",
+            inputs={"X": [param]},
+            outputs={"Out": [var]},
+            attrs={
+                "in_dtype": param.dtype,
+                "out_dtype": core.VarDesc.VarType.FP32
+            })
+        self._master_weights[param.name] = var
+        return var
+    def _get_accumulator(self, name, param):
+        """Utility function to fetch an accumulator for a parameter
+        Args:
+            name: name of the accumulator
+            param: parameter variable for which accumulator is to be fetched
+        Returns:
+            accumulator variable for the parameter
+        """
+        if self._name is not None:
+            name = self._name + "_" + name
+        find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
+        target_param = self._master_weights[
+            param.name] if find_master else param
+        target_name = target_param.name
+        if (name not in self._accumulators or
+                target_name not in self._accumulators[name]):
+            raise Exception("Accumulator {} does not exist for parameter {}".
+                            format(name, target_name))
+        return self._accumulators[name][target_name]
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
         # create accumulator in init func, so no implementation here
@@ -127,16 +199,30 @@ class Momentum(Optimizer):
         velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                              param_and_grad[0])
+        find_master = self._multi_precision and param_and_grad[
+            0].dtype == core.VarDesc.VarType.FP16
+        master_weight = (self._master_weights[param_and_grad[0].name]
+                         if find_master else None)
         lr = self._create_param_lr(param_and_grad)
         if framework.in_dygraph_mode():
-            _, _ = core.ops.momentum(param_and_grad[0], param_and_grad[1],
-                                     velocity_acc, lr, param_and_grad[0],
-                                     velocity_acc, 'mu', self._momentum,
-                                     'use_nesterov', self._use_nesterov)
+            _, _ = core.ops.momentum(
+                param_and_grad[0], param_and_grad[1], velocity_acc, lr,
+                param_and_grad[0], velocity_acc, 'mu', self._momentum,
+                'use_nesterov', self._use_nesterov, 'regularization_method',
+                self._regularization_method, 'regularization_coeff',
+                self._regularization_coeff)
             return None
-        attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov}
+        attrs = {
+            "mu": self._momentum,
+            "use_nesterov": self._use_nesterov,
+            "regularization_method": self._regularization_method,
+            "regularization_coeff": self._regularization_coeff,
+            "multi_precision": find_master,
+            "rescale_grad": self._rescale_grad
+        }
         inputs = {
             "Param": [param_and_grad[0]],
             "Grad": [param_and_grad[1]],
@@ -148,6 +234,11 @@ class Momentum(Optimizer):
             "ParamOut": [param_and_grad[0]],
             "VelocityOut": [velocity_acc]
         }
+        if find_master:
+            inputs["MasterParam"] = master_weight
+            outputs["MasterParamOut"] = master_weight
         # create the momentum optimize op
         momentum_op = block.append_op(
             type=self.type,
...
@@ -24,6 +24,7 @@ __all__ = [
 ]
 from . import nn
+from . import amp
 from .io import save_inference_model #DEFINE_ALIAS
 from .io import load_inference_model #DEFINE_ALIAS
 from .io import deserialize_persistables #DEFINE_ALIAS
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...fluid.contrib import mixed_precision
from ...fluid.contrib.mixed_precision import *
__all__ = mixed_precision.__all__
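Since paddle.static.amp simply re-exports fluid.contrib.mixed_precision, the helpers used by the updated tests (decorate, AutoMixedPrecisionLists, cast_model_to_fp16, cast_parameters_to_fp16) become importable from the shorter path. The following is a minimal sketch of the aliased decorate entry point, assuming a fluid optimizer and the loss-scaling value used in the tests above; it is illustrative, not code from this commit:

import paddle
import paddle.fluid as fluid
from paddle.static.amp import decorate, AutoMixedPrecisionLists

paddle.enable_static()

optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)

# Same objects as fluid.contrib.mixed_precision.*, reached via the new alias.
amp_lists = AutoMixedPrecisionLists(custom_black_varnames={"loss"})
mp_optimizer = decorate(
    optimizer=optimizer,
    amp_lists=amp_lists,
    init_loss_scaling=8.0,
    use_dynamic_loss_scaling=True)
# mp_optimizer.minimize(loss) would then be called once a loss variable exists.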
@@ -211,6 +211,7 @@ packages=['paddle',
           'paddle.metric',
           'paddle.static',
           'paddle.static.nn',
+          'paddle.static.amp',
           'paddle.tensor',
           'paddle.onnx',
          ]
...