Unverified commit af2c31a6, authored by niuliling123, committed by GitHub

Delete duplicate code in optimizer.py and support master_param for bf16 in optimizer (#51367)

Parent: 3c7cde95
(This diff is collapsed.)
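For context, the user-facing effect of this change is that the `multi_precision` path now also covers bfloat16 parameters: the optimizer keeps an FP32 master copy of each FP16/BF16 parameter and applies updates to that copy. Below is a minimal, hedged sketch of how this is typically exercised; it is not part of the PR, and the `paddle.amp.decorate` / `paddle.amp.auto_cast` arguments (notably `dtype='bfloat16'`) are assumed from Paddle 2.5-era AMP usage and may differ in other versions.

```python
import paddle

# A small model whose parameters will be cast to bfloat16 under AMP level O2.
model = paddle.nn.Linear(8, 8)
opt = paddle.optimizer.Momentum(
    learning_rate=0.01,
    parameters=model.parameters(),
    multi_precision=True,  # keep FP32 master weights for FP16/BF16 params
)

# Assumption: paddle.amp.decorate accepts dtype='bfloat16' (Paddle >= 2.5).
model, opt = paddle.amp.decorate(
    models=model, optimizers=opt, level='O2', dtype='bfloat16'
)

x = paddle.randn([4, 8])
with paddle.amp.auto_cast(level='O2', dtype='bfloat16'):
    loss = model(x).mean()
loss.backward()
opt.step()        # the update runs against the FP32 master weights
opt.clear_grad()
```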
@@ -14,12 +14,10 @@
import warnings
-import paddle
from paddle import _C_ops
-from ..fluid import core, framework, unique_name
+from ..fluid import framework
from ..fluid.dygraph import no_grad
-from ..fluid.layer_helper import LayerHelper
from ..framework import in_dygraph_mode
from .optimizer import Optimizer
@@ -144,62 +142,6 @@ class Adadelta(Optimizer):
'rho': rho,
}
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = paddle.static.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
find_master = (
self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name
)
)
return self._accumulators[name][target_name]
def _create_accumulators(self, block, parameters):
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
@@ -207,7 +149,7 @@ class Adadelta(Optimizer):
parameters = parameters.get('params')
for p in parameters:
-if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
+if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
master_p = self._create_master_weight(p)
self._add_accumulator(self._avg_squared_grad_acc_str, master_p)
self._add_accumulator(
@@ -215,11 +157,11 @@ class Adadelta(Optimizer):
)
continue
if (
-p.dtype == core.VarDesc.VarType.FP16
+self._is_dtype_fp16_or_bf16(p.dtype)
and not self._multi_precision
):
warnings.warn(
-"Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
+"Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Lars optimizer."
)
self._add_accumulator(self._avg_squared_grad_acc_str, p)
@@ -229,15 +171,14 @@ class Adadelta(Optimizer):
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
-avg_squared_grad_acc = self._get_accumulator(
+avg_squared_grad_acc = self._get_accumulator_master(
self._avg_squared_grad_acc_str, param_and_grad[0]
)
-avg_squared_update_acc = self._get_accumulator(
+avg_squared_update_acc = self._get_accumulator_master(
self._avg_squared_update_acc_str, param_and_grad[0]
)
-find_master = (
-self._multi_precision
-and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
+find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
+param_and_grad[0].dtype
)
master_weight = (
self._master_weights[param_and_grad[0].name]
......
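The hunks above all follow the same selection rule, which recurs in every optimizer touched by this PR: when `multi_precision` is on and the parameter is FP16 or BF16, the optimizer fetches the FP32 master copy and hands it to the update kernel. A hypothetical standalone restatement of that rule is sketched below for readability; `select_master_weight` does not exist in Paddle, and the comment about kernel behaviour is an interpretation of the diff, not repository code.

```python
def select_master_weight(opt, param):
    # Mirrors the find_master / master_weight logic in each _append_optimize_op:
    # return the FP32 master copy when one should be used, otherwise None.
    find_master = opt._multi_precision and opt._is_dtype_fp16_or_bf16(param.dtype)
    return opt._master_weights[param.name] if find_master else None
```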
@@ -13,10 +13,7 @@
# limitations under the License.
import warnings
-import paddle
-from ..fluid import core, framework, unique_name
-from ..fluid.layer_helper import LayerHelper
+from ..fluid import framework
from .optimizer import Optimizer
__all__ = []
@@ -138,64 +135,6 @@ class Adagrad(Optimizer):
'initial_accumulator_value': initial_accumulator_value,
}
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = paddle.static.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
find_master = (
self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name
)
)
return self._accumulators[name][target_name]
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
@@ -203,16 +142,16 @@ class Adagrad(Optimizer):
parameters = self._update_param_group(parameters)
for p in parameters:
-if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
+if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
master_p = self._create_master_weight(p)
self._add_accumulator(self._moment_acc_str, master_p)
continue
if (
-p.dtype == core.VarDesc.VarType.FP16
+self._is_dtype_fp16_or_bf16(p.dtype)
and not self._multi_precision
):
warnings.warn(
-"Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
+"Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Momentum optimizer."
)
self._add_accumulator(
@@ -227,13 +166,12 @@ class Adagrad(Optimizer):
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
-moment_acc = self._get_accumulator(
+moment_acc = self._get_accumulator_master(
self._moment_acc_str, param_and_grad[0]
)
-find_master = (
-self._multi_precision
-and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
+find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
+param_and_grad[0].dtype
)
master_weight = (
......
@@ -18,10 +18,9 @@ from collections import defaultdict
import paddle
from paddle import _C_ops
-from ..fluid import core, framework, unique_name
+from ..fluid import core, framework
from ..fluid.dygraph import base as imperative_base
from ..fluid.framework import Variable, in_dygraph_mode
-from ..fluid.layer_helper import LayerHelper
from .optimizer import Optimizer
__all__ = []
@@ -225,62 +224,6 @@ class Adam(Optimizer):
self._master_weight_dict = self._create_multi_tensor_dict()
self._master_weight_dict['FP32_LODTensor'] = None
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = paddle.static.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
param.dtype
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name
)
)
return self._accumulators[name][target_name]
def _add_moments_pows(self, p):
acc_dtype = p.dtype
if self._is_dtype_fp16_or_bf16(acc_dtype):
@@ -336,16 +279,16 @@ class Adam(Optimizer):
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
-moment1 = self._get_accumulator(
+moment1 = self._get_accumulator_master(
self._moment1_acc_str, param_and_grad[0]
)
-moment2 = self._get_accumulator(
+moment2 = self._get_accumulator_master(
self._moment2_acc_str, param_and_grad[0]
)
-beta1_pow_acc = self._get_accumulator(
+beta1_pow_acc = self._get_accumulator_master(
self._beta1_pow_acc_str, param_and_grad[0]
)
-beta2_pow_acc = self._get_accumulator(
+beta2_pow_acc = self._get_accumulator_master(
self._beta2_pow_acc_str, param_and_grad[0]
)
find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
@@ -530,12 +473,12 @@ class Adam(Optimizer):
"""
self._create_accumulators(target_block, parameters)
for param in parameters:
-moment1 = self._get_accumulator(self._moment1_acc_str, param)
+moment1 = self._get_accumulator_master(self._moment1_acc_str, param)
-moment2 = self._get_accumulator(self._moment2_acc_str, param)
+moment2 = self._get_accumulator_master(self._moment2_acc_str, param)
-beta1_pow_acc = self._get_accumulator(
+beta1_pow_acc = self._get_accumulator_master(
self._beta1_pow_acc_str, param
)
-beta2_pow_acc = self._get_accumulator(
+beta2_pow_acc = self._get_accumulator_master(
self._beta2_pow_acc_str, param
)
......
@@ -14,13 +14,11 @@
import warnings
-import paddle
from paddle import _C_ops
-from ..fluid import core, framework, unique_name
+from ..fluid import core, framework
from ..fluid.dygraph import no_grad
from ..fluid.framework import name_scope
-from ..fluid.layer_helper import LayerHelper
from .optimizer import Optimizer
__all__ = []
@@ -191,95 +189,40 @@ class Adamax(Optimizer):
shape=[1],
)
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = paddle.static.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
def _create_accumulators(self, block, parameters):
if isinstance(parameters, dict):
parameters = self._update_param_group(parameters)
# Create accumulator tensors for first moment and infinity norm
for p in parameters:
-if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
+if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
master_p = self._create_master_weight(p)
self._add_moments_pows(master_p)
continue
if (
-p.dtype == core.VarDesc.VarType.FP16
+self._is_dtype_fp16_or_bf16(p.dtype)
and not self._multi_precision
):
warnings.warn(
-"Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
+"Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Adam optimizer."
)
self._add_moments_pows(p)
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
param.dtype
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name
)
)
return self._accumulators[name][target_name]
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
-moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
-inf_norm = self._get_accumulator(
+moment = self._get_accumulator_master(
+self._moment_acc_str, param_and_grad[0]
+)
+inf_norm = self._get_accumulator_master(
self._inf_norm_acc_str, param_and_grad[0]
)
-find_master = (
-self._multi_precision
-and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
+find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
+param_and_grad[0].dtype
)
master_weight = (
self._master_weights[param_and_grad[0].name]
@@ -287,7 +230,7 @@ class Adamax(Optimizer):
else None
)
-beta1_pow_acc = self._get_accumulator(
+beta1_pow_acc = self._get_accumulator_master(
self._beta1_pow_acc_str, param_and_grad[0]
)
if framework.in_dygraph_mode():
@@ -347,7 +290,7 @@ class Adamax(Optimizer):
if grad is None or param.stop_gradient is True:
continue
if framework.in_dygraph_mode():
-beta1_pow_acc = self._get_accumulator(
+beta1_pow_acc = self._get_accumulator_master(
self._beta1_pow_acc_str, param
)
with no_grad():
@@ -359,7 +302,7 @@ class Adamax(Optimizer):
with param.block.program._optimized_guard(
[param, grad]
), name_scope('adamax'):
-beta1_pow_acc = self._get_accumulator(
+beta1_pow_acc = self._get_accumulator_master(
self._beta1_pow_acc_str, param
)
block.append_op(
@@ -374,7 +317,7 @@ class Adamax(Optimizer):
if grad is None or param.stop_gradient is True:
continue
if framework.in_dygraph_mode():
-beta1_pow_acc = self._get_accumulator(
+beta1_pow_acc = self._get_accumulator_master(
self._beta1_pow_acc_str, param
)
self._beta1 = parameters_and_grads.get(
@@ -389,7 +332,7 @@ class Adamax(Optimizer):
with param.block.program._optimized_guard(
[param, grad]
), name_scope('adamax'):
-beta1_pow_acc = self._get_accumulator(
+beta1_pow_acc = self._get_accumulator_master(
self._beta1_pow_acc_str, param
)
self._beta1 = parameters_and_grads.get(
......
@@ -19,10 +19,9 @@ from collections.abc import Callable
import paddle
from .. import _C_ops
-from ..fluid import core, framework, unique_name
+from ..fluid import core, framework
from ..fluid.dygraph import base as imperative_base
from ..fluid.framework import Parameter, Variable
-from ..fluid.layer_helper import LayerHelper
from ..nn.clip import GradientClipBase
from .lr import LRScheduler
from .optimizer import Optimizer
@@ -333,62 +332,6 @@ class AdamW(Optimizer):
self._param_groups.append(param_group)
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = paddle.static.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
param.dtype
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name
)
)
return self._accumulators[name][target_name]
def _add_moments_pows(self, p):
acc_dtype = p.dtype
if self._is_dtype_fp16_or_bf16(acc_dtype):
@@ -453,16 +396,16 @@ class AdamW(Optimizer):
):
with_decay = False
-moment1 = self._get_accumulator(
+moment1 = self._get_accumulator_master(
self._moment1_acc_str, param_and_grad[0]
)
-moment2 = self._get_accumulator(
+moment2 = self._get_accumulator_master(
self._moment2_acc_str, param_and_grad[0]
)
-beta1_pow_acc = self._get_accumulator(
+beta1_pow_acc = self._get_accumulator_master(
self._beta1_pow_acc_str, param_and_grad[0]
)
-beta2_pow_acc = self._get_accumulator(
+beta2_pow_acc = self._get_accumulator_master(
self._beta2_pow_acc_str, param_and_grad[0]
)
find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
......
@@ -12,13 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import paddle
from paddle import _C_ops
from paddle.fluid.executor import global_scope
-from ..fluid import core, framework, unique_name
+from ..fluid import core, framework
from ..fluid.framework import Variable
-from ..fluid.layer_helper import LayerHelper
from .optimizer import Optimizer
__all__ = []
@@ -154,35 +152,6 @@ class Lamb(Optimizer):
master_p_t = None
return p_t, master_p_t
def _create_master_weight(self, param):
assert self._multi_precision
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = paddle.static.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
if isinstance(parameters, dict):
@@ -190,43 +159,15 @@ class Lamb(Optimizer):
# Create accumulator tensors for first and second moments
for p in parameters:
-if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
+if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
master_p = self._create_master_weight(p)
self._add_moments_pows(master_p)
else:
self._add_moments_pows(p)
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
find_master = (
self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name
)
)
return self._accumulators[name][target_name]
def _add_moments_pows(self, p):
acc_dtype = p.dtype
-if acc_dtype == core.VarDesc.VarType.FP16:
+if self._is_dtype_fp16_or_bf16(acc_dtype):
acc_dtype = core.VarDesc.VarType.FP32
self._add_accumulator(self._moment1_acc_str, p, dtype=acc_dtype)
@@ -261,16 +202,16 @@ class Lamb(Optimizer):
block.program._use_lamb = True
-moment1 = self._get_accumulator(
+moment1 = self._get_accumulator_master(
self._moment1_acc_str, param_and_grad[0]
)
-moment2 = self._get_accumulator(
+moment2 = self._get_accumulator_master(
self._moment2_acc_str, param_and_grad[0]
)
-beta1_pow_acc = self._get_accumulator(
+beta1_pow_acc = self._get_accumulator_master(
self._beta1_pow_acc_str, param_and_grad[0]
)
-beta2_pow_acc = self._get_accumulator(
+beta2_pow_acc = self._get_accumulator_master(
self._beta2_pow_acc_str, param_and_grad[0]
)
@@ -283,9 +224,8 @@ class Lamb(Optimizer):
weight_decay = self._lamb_weight_decay
lr = self._create_param_lr(param_and_grad)
-find_master = (
-self._multi_precision
-and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
+find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
+param_and_grad[0].dtype
)
p_name = param_and_grad[0].name
if find_master:
......
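A detail visible in `_add_moments_pows` above: even when `multi_precision` is off, the moment accumulators of FP16/BF16 parameters are created in FP32. As a plain restatement (this helper is illustrative only, not a Paddle API):

```python
def accumulator_dtype(param_dtype: str) -> str:
    # FP16/BF16 parameters still get FP32 moment accumulators;
    # other dtypes keep their own precision.
    return 'float32' if param_dtype in ('float16', 'bfloat16') else param_dtype
```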
@@ -19,8 +19,7 @@ from paddle import _C_ops
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.regularizer import L2DecayRegularizer
-from ..fluid import core, framework, unique_name
+from ..fluid import core, framework
-from ..fluid.layer_helper import LayerHelper
from .optimizer import Optimizer
__all__ = []
@@ -201,64 +200,6 @@ class Momentum(Optimizer):
reg_coeff = weight_decay
return reg_method, reg_coeff
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = paddle.static.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
find_master = (
self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name
)
)
return self._accumulators[name][target_name]
def _create_accumulators(self, block, parameters):
'''
if framework._non_static_mode():
@@ -270,16 +211,16 @@ class Momentum(Optimizer):
parameters = self._update_param_group(parameters)
for p in parameters:
-if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
+if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
master_p = self._create_master_weight(p)
self._add_accumulator(self._velocity_acc_str, master_p)
continue
if (
-p.dtype == core.VarDesc.VarType.FP16
+self._is_dtype_fp16_or_bf16(p.dtype)
and not self._multi_precision
):
warnings.warn(
-"Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
+"Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Momentum optimizer."
)
self._add_accumulator(self._velocity_acc_str, p)
@@ -304,7 +245,7 @@ class Momentum(Optimizer):
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
-velocity_acc = self._get_accumulator(
+velocity_acc = self._get_accumulator_master(
self._velocity_acc_str, param_and_grad[0]
)
lr = self._create_param_lr(param_and_grad)
@@ -323,9 +264,8 @@ class Momentum(Optimizer):
regularization_method = ""
regularization_coeff = 0.0
-find_master = (
-self._multi_precision
-and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
+find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
+param_and_grad[0].dtype
)
master_weight = (
self._master_weights[param_and_grad[0].name]
@@ -388,7 +328,7 @@ class Momentum(Optimizer):
def _multi_tensor_init(self, target_block, parameters, param_group_idx):
"""
-All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32).
+All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, bf16, float32).
This function will be overridden in the corresponding optimizer file.
Args:
@@ -397,7 +337,9 @@ class Momentum(Optimizer):
"""
self._create_accumulators(target_block, parameters)
for param in parameters:
-velocity_acc = self._get_accumulator(self._velocity_acc_str, param)
+velocity_acc = self._get_accumulator_master(
+self._velocity_acc_str, param
+)
regularization_method = self._regularization_method
regularization_coeff = self._regularization_coeff
if hasattr(param, 'regularizer'):
@@ -424,7 +366,7 @@ class Momentum(Optimizer):
self._regularization_coeff_dict['FP32_LODTensor'][
param_group_idx
].append(regularization_coeff)
-elif param.dtype == paddle.float16:
+elif self._is_dtype_fp16_or_bf16(param.dtype):
self._param_dict['FP16_LODTensor'][param_group_idx].append(
param
)
@@ -447,7 +389,7 @@ class Momentum(Optimizer):
].append(regularization_coeff)
else:
raise ValueError(
-"Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR."
+"Now multi_tensor_momentum only support fp32, fp16 or bf16 parameters and grad is LOD_TENSOR."
)
def _append_optimize_multi_tensor_op(
@@ -478,7 +420,7 @@ class Momentum(Optimizer):
lr = self._create_param_lr(param_and_grad)
lr_dict['FP32_LODTensor'].append(lr)
elif (
-param_and_grad[0].dtype == paddle.float16
+self._is_dtype_fp16_or_bf16(param_and_grad[0].dtype)
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
@@ -509,7 +451,7 @@ class Momentum(Optimizer):
lr = self._create_param_lr(param_and_grad)
lr_dict['FP32_LODTensor'].append(lr)
elif (
-param_and_grad[0].dtype == paddle.float16
+self._is_dtype_fp16_or_bf16(param_and_grad[0].dtype)
and param_and_grad[1].type
== core.VarDesc.VarType.LOD_TENSOR
):
......
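In the multi-tensor path above, parameters are bucketed by dtype, and after this change bf16 parameters share the 'FP16_LODTensor' bucket with fp16 ones. A standalone sketch of that bucketing rule follows; the function name is illustrative, and it assumes `paddle.bfloat16` is available as a dtype, which is the case in recent Paddle releases.

```python
import paddle

def momentum_bucket(dtype):
    # Mirrors the dispatch in _multi_tensor_init / _append_optimize_multi_tensor_op:
    # fp32 params go to one group, fp16/bf16 params (with their fp32 master
    # weights) to the other; anything else is rejected.
    if dtype == paddle.float32:
        return 'FP32_LODTensor'
    if dtype in (paddle.float16, paddle.bfloat16):
        return 'FP16_LODTensor'
    raise ValueError(
        "multi_tensor momentum only supports fp32, fp16 or bf16 parameters."
    )

print(momentum_bucket(paddle.bfloat16))  # FP16_LODTensor
```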
@@ -636,6 +636,34 @@ class Optimizer:
else:
return self._global_learning_rate()
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = paddle.static.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
def _create_accumulators(self, block, parameters):
"""Create all accumulators needed by the parameters
@@ -767,6 +795,34 @@ class Optimizer:
)
return self._accumulators[name][param.name]
def _get_accumulator_master(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
param.dtype
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name
)
)
return self._accumulators[name][target_name]
def _update_param_device_map(self, parameters_and_grads, target_block):
for param_and_grad in parameters_and_grads:
if param_and_grad[0].stop_gradient is False:
......
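Both helpers hoisted into `Optimizer` above rely on `self._is_dtype_fp16_or_bf16`, which is already available on the base class and is not shown in this diff. A minimal sketch of what such a check plausibly looks like is given below; the real implementation in Paddle may differ.

```python
from paddle.fluid import core

def _is_dtype_fp16_or_bf16(self, dtype):
    # True for float16 / bfloat16 variables, i.e. the dtypes that receive an
    # FP32 master copy when multi_precision=True.
    return dtype in (core.VarDesc.VarType.FP16, core.VarDesc.VarType.BF16)
```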
@@ -14,12 +14,10 @@
import warnings
-import paddle
from paddle import _C_ops
-from ..fluid import core, framework, unique_name
+from ..fluid import framework
from ..fluid.framework import in_dygraph_mode
-from ..fluid.layer_helper import LayerHelper
from .optimizer import Optimizer
__all__ = []
@@ -197,62 +195,6 @@ class RMSProp(Optimizer):
'centered': centered,
}
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = paddle.static.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
find_master = (
self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
)
target_param = (
self._master_weights[param.name] if find_master else param
)
target_name = target_param.name
if (
name not in self._accumulators
or target_name not in self._accumulators[name]
):
raise Exception(
"Accumulator {} does not exist for parameter {}".format(
name, target_name
)
)
return self._accumulators[name][target_name]
def _create_accumulators(self, block, parameters):
if not isinstance(block, framework.Block):
raise TypeError("block is not instance of framework.Block.")
@@ -261,14 +203,14 @@ class RMSProp(Optimizer):
parameters = parameters.get('params')
for p in parameters:
-if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
+if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
master_p = self._create_master_weight(p)
self._add_accumulator(self._momentum_acc_str, master_p)
self._add_accumulator(self._mean_square_acc_str, master_p)
self._add_accumulator(self._mean_grad_acc_str, master_p)
continue
if (
-p.dtype == core.VarDesc.VarType.FP16
+self._is_dtype_fp16_or_bf16(p.dtype)
and not self._multi_precision
):
warnings.warn(
@@ -286,18 +228,17 @@ class RMSProp(Optimizer):
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
-momentum_acc = self._get_accumulator(
+momentum_acc = self._get_accumulator_master(
self._momentum_acc_str, param_and_grad[0]
)
-mean_square_acc = self._get_accumulator(
+mean_square_acc = self._get_accumulator_master(
self._mean_square_acc_str, param_and_grad[0]
)
-mean_grad_acc = self._get_accumulator(
+mean_grad_acc = self._get_accumulator_master(
self._mean_grad_acc_str, param_and_grad[0]
)
-find_master = (
-self._multi_precision
-and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
+find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
+param_and_grad[0].dtype
)
master_weight = (
self._master_weights[param_and_grad[0].name]
......
@@ -14,13 +14,11 @@
import warnings
-import paddle
from paddle import _C_ops
-from ..fluid import core, framework, unique_name
+from ..fluid import framework
from ..fluid.dygraph import no_grad
from ..fluid.framework import in_dygraph_mode
-from ..fluid.layer_helper import LayerHelper
from .optimizer import Optimizer
__all__ = []
@@ -94,34 +92,6 @@ class SGD(Optimizer):
self._multi_precision = multi_precision
self._master_weights = {}
def _create_master_weight(self, param):
if param.name in self._master_weights:
var = self._master_weights[param.name]
else:
assert isinstance(self.helper, LayerHelper)
var_name = param.name + "_fp32_master"
var_name = unique_name.generate(var_name)
var = paddle.static.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True,
)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32,
},
)
self._master_weights[param.name] = var
return var
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
if isinstance(parameters, dict):
@@ -129,15 +99,15 @@ class SGD(Optimizer):
# Create accumulator tensors for first and second moments
for p in parameters:
-if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
+if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
master_p = self._create_master_weight(p)
continue
if (
-p.dtype == core.VarDesc.VarType.FP16
+self._is_dtype_fp16_or_bf16(p.dtype)
and not self._multi_precision
):
warnings.warn(
-"Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
+"Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Adam optimizer."
)
@@ -146,9 +116,8 @@ class SGD(Optimizer):
if isinstance(param_and_grad, dict):
param_and_grad = self._update_param_group(param_and_grad)
-find_master = (
-self._multi_precision
-and param_and_grad[0].dtype == core.VarDesc.VarType.FP16
+find_master = self._multi_precision and self._is_dtype_fp16_or_bf16(
+param_and_grad[0].dtype
)
master_weight = (
self._master_weights[param_and_grad[0].name]
......