Unverified commit 8a503522, authored by Weilong Wu and committed by GitHub

Revert grad scale optimization pr (#50839)

* Revert "fixoptminizer _set_auxiliary_var bug (#50335)"

This reverts commit c44005f0.

* Revert "refine optimizer create accumulators (#50188)"

This reverts commit 244e7546.

* Revert "fix found_inf bug for custom optimizer (#50158)"

This reverts commit 64573f9f.

* Revert "refine amp scaler found_inf (#49864)"

This reverts commit 382e9a06.

* fix code format

* fix conflict
Parent 09694f82
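
For context, the net effect of this revert on the scalers is that minimize()/step() no longer publishes found_inf to the optimizer through _set_auxiliary_var; the scaler checks its own flag and skips the update itself. A minimal sketch of the restored control flow (attribute and variable names are taken from the hunks below; this is an illustration, not the full method):

    # Restored behavior (sketch): the scaler decides whether to run the update,
    # based on the overflow flag computed while unscaling the gradients.
    if self._found_inf:
        # inf/nan gradients were found -> skip this optimizer step entirely
        self._cache_founf_inf = True
    else:
        optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
        self._cache_founf_inf = False
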
......@@ -18,7 +18,7 @@ from enum import Enum
import numpy as np
from paddle import _C_ops, _legacy_C_ops
from paddle import _legacy_C_ops
from paddle.fluid import core, in_dygraph_mode
from paddle.fluid.data_feeder import check_type
from paddle.fluid.dygraph import to_variable
......@@ -228,16 +228,11 @@ class AmpScaler:
optimize_ops, params_grads = (None, None)
if hasattr(optimizer, "_set_auxiliary_var"):
optimizer._set_auxiliary_var('found_inf', self._found_inf)
optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf')
if self._found_inf:
self._cache_founf_inf = True
else:
if self._found_inf:
self._cache_founf_inf = True
else:
optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
self._cache_founf_inf = False
optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
self._cache_founf_inf = False
if self._use_dynamic_loss_scaling:
# update the scale
......@@ -335,9 +330,6 @@ class AmpScaler:
param_grads_fp16,
self._temp_found_inf_fp16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_fp16
)
if len(param_grads_bf16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_bf16,
......@@ -346,9 +338,6 @@ class AmpScaler:
param_grads_bf16,
self._temp_found_inf_bf16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_bf16
)
if len(param_grads_fp32):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp32,
......@@ -357,9 +346,6 @@ class AmpScaler:
param_grads_fp32,
self._temp_found_inf_fp32,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_fp32
)
else:
if len(param_grads_fp16):
_legacy_C_ops.check_finite_and_unscale(
......@@ -368,9 +354,6 @@ class AmpScaler:
param_grads_fp16,
self._temp_found_inf_fp16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_fp16
)
if len(param_grads_bf16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_bf16,
......@@ -378,9 +361,6 @@ class AmpScaler:
param_grads_bf16,
self._temp_found_inf_bf16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_bf16
)
if len(param_grads_fp32):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp32,
......@@ -388,9 +368,12 @@ class AmpScaler:
param_grads_fp32,
self._temp_found_inf_fp32,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_fp32
)
self._found_inf = (
self._temp_found_inf_fp16
or self._temp_found_inf_bf16
or self._temp_found_inf_fp32
)
optimizer_state["state"] = OptimizerState.UNSCALED
......@@ -778,16 +761,11 @@ class GradScaler(AmpScaler):
if optimizer_state["state"] is OptimizerState.INIT:
self._unscale(optimizer)
if hasattr(optimizer, "_set_auxiliary_var"):
optimizer._set_auxiliary_var('found_inf', self._found_inf)
optimizer.step()
self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf')
if self._found_inf:
self._cache_founf_inf = True
else:
if self._found_inf:
self._cache_founf_inf = True
else:
optimizer.step()
self._cache_founf_inf = False
optimizer.step()
self._cache_founf_inf = False
optimizer_state["state"] = OptimizerState.STEPPED
......
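
After the revert, _unscale() no longer folds each per-dtype flag into self._found_inf with _C_ops.bitwise_or after every check_finite_and_unscale call; the three temporary flags are combined once with Python `or`, as in the last hunk above. A standalone sketch of that aggregation (the flag values are made up; assumes Paddle's default dygraph mode):

    import paddle

    # Each dtype bucket would normally get its flag from check_finite_and_unscale;
    # here the 1-element bool tensors are created directly for illustration.
    temp_found_inf_fp16 = paddle.to_tensor([False])
    temp_found_inf_bf16 = paddle.to_tensor([False])
    temp_found_inf_fp32 = paddle.to_tensor([True])

    # Python `or` calls bool() on each 1-element tensor and short-circuits on
    # the first truthy one, so found_inf is truthy iff any bucket saw inf/nan.
    found_inf = (
        temp_found_inf_fp16
        or temp_found_inf_bf16
        or temp_found_inf_fp32
    )
    print(bool(found_inf))  # True
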
......@@ -236,10 +236,6 @@ class AscendOptimizer(Optimizer):
ret_list.append(var)
return ret_list
def _set_auxiliary_var(self, key, val):
super()._set_auxiliary_var(key, val)
self.inner_opt._set_auxiliary_var(key, val)
def minimize(
self,
loss,
......
......@@ -41,16 +41,11 @@ class HybridParallelGradScaler:
optimize_ops, params_grads = (None, None)
if hasattr(optimizer, "_set_auxiliary_var"):
optimizer._set_auxiliary_var('found_inf', self._found_inf)
optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf')
if self._found_inf:
self._cache_founf_inf = True
else:
if self._found_inf:
self._cache_founf_inf = True
else:
optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
self._cache_founf_inf = False
optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
self._cache_founf_inf = False
if self._use_dynamic_loss_scaling:
self._update()
......
......@@ -25,10 +25,6 @@ class MetaOptimizerBase(Optimizer):
self.meta_optimizers_white_list = []
self.meta_optimizers_black_list = []
def _set_auxiliary_var(self, key, val):
super()._set_auxiliary_var(key, val)
self.inner_opt._set_auxiliary_var(key, val)
def _set_basic_info(
self, loss, role_maker, user_defined_optimizer, user_defined_strategy
):
......
......@@ -203,10 +203,6 @@ class GroupShardedOptimizerStage2(Optimizer):
# Update optimizer parameters and adjust parameter storage and use according to rank.
self._update_opt_status()
def _set_auxiliary_var(self, key, val):
super()._set_auxiliary_var(key, val)
self._optim._set_auxiliary_var(key, val)
@paddle.autograd.no_grad()
def _sync_params_and_buffers(self):
"""
......
......@@ -19,10 +19,10 @@ from types import MethodType
import numpy as np
import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle import _legacy_C_ops
from paddle.common_ops_import import dygraph_only
from paddle.fluid import core
from paddle.fluid.dygraph import to_variable
from paddle.framework import core
from paddle.nn import clip
......@@ -270,9 +270,6 @@ def GroupShardedScaler(scaler):
param_grads_bfp16,
temp_found_inf_bfp16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, temp_found_inf_bfp16
)
if len(param_grads_fp16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp16,
......@@ -280,9 +277,6 @@ def GroupShardedScaler(scaler):
param_grads_fp16,
temp_found_inf_fp16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, temp_found_inf_fp16
)
if len(param_grads_fp32):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp32,
......@@ -290,17 +284,21 @@ def GroupShardedScaler(scaler):
param_grads_fp32,
temp_found_inf_fp32,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, temp_found_inf_fp32
)
self._found_inf = self._found_inf.cast("int32")
self._found_inf = (
1
if temp_found_inf_bfp16
or temp_found_inf_fp16
or temp_found_inf_fp32
else 0
)
is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
paddle.distributed.all_reduce(
self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
is_found_inf, op=paddle.distributed.ReduceOp.SUM, group=None
)
self._found_inf = self._found_inf.cast("bool")
self._found_inf = is_found_inf.numpy()[0]
scaler._unscale = MethodType(unscale_method, scaler)
return scaler
......
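
The sharded and distributed scaler hunks restore the older cross-rank synchronization of the overflow flag: pack the local result into an int32 tensor, all-reduce it (SUM here, MAX in distributed_scaler below), and read the value back on every rank. A hedged sketch of that pattern as a standalone helper (the function name is illustrative; it assumes the process group is already initialized, e.g. via paddle.distributed.init_parallel_env()):

    import paddle
    import paddle.distributed as dist

    def sync_found_inf(local_found_inf: bool) -> bool:
        # Pack the local flag into an int32 tensor so it can be all-reduced.
        flag = paddle.to_tensor([1 if local_found_inf else 0], dtype="int32")
        # SUM (or MAX) across ranks is nonzero iff any rank saw inf/nan grads.
        dist.all_reduce(flag, op=dist.ReduceOp.SUM, group=None)
        return bool(flag.numpy()[0])
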
......@@ -17,7 +17,7 @@ from types import MethodType
import numpy as np
import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle import _legacy_C_ops
from paddle.distributed import fleet
from paddle.fluid.dygraph import to_variable
from paddle.framework import core
......@@ -73,9 +73,6 @@ def distributed_scaler(scaler):
param_grads_fp16,
temp_found_inf_fp16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, temp_found_inf_fp16
)
if len(param_grads_fp32):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp32,
......@@ -83,19 +80,17 @@ def distributed_scaler(scaler):
param_grads_fp32,
temp_found_inf_fp32,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, temp_found_inf_fp32
)
self._found_inf = self._found_inf.cast("int32")
self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
# TODO(shenliang03) Since dp allreduce in the optimizer is
# after the gradscaler, check_finite needs to synchronize global
# information. In the future, we should use check_group to speed.
paddle.distributed.all_reduce(
self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
)
self._found_inf = self._found_inf.cast("bool")
self._found_inf = is_found_inf.numpy()[0]
# Only data_parallel doesn't need to modify scaler
fleet_env = fleet.fleet
......
......@@ -19,6 +19,7 @@ from collections import defaultdict
import paddle
from paddle.fluid.framework import (
Program,
Variable,
......@@ -899,18 +900,11 @@ class Optimizer:
self._create_global_learning_rate()
if in_dygraph_mode():
found_inf = self._get_auxiliary_var('found_inf')
if found_inf:
if isinstance(found_inf, core.eager.Tensor):
self._set_auxiliary_var('found_inf', True)
else:
if isinstance(found_inf, core.eager.Tensor):
self._set_auxiliary_var('found_inf', False)
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None:
continue
if param_and_grad[0].trainable is True:
self._append_optimize_op(target_block, param_and_grad)
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None:
continue
if param_and_grad[0].trainable is True:
self._append_optimize_op(target_block, param_and_grad)
else:
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None:
......
......@@ -144,10 +144,6 @@ class LookAhead(Optimizer):
self._global_step_var = None
self._k_var = None
def _set_auxiliary_var(self, key, val):
super()._set_auxiliary_var(key, val)
self.inner_optimizer._set_auxiliary_var(key, val)
@framework.dygraph_only
@imperative_base.no_grad
def step(self):
......
......@@ -145,11 +145,8 @@ class Adadelta(Optimizer):
parameters = parameters.get('params')
for p in parameters:
if p.name in self._already_create_accumulater:
continue
self._add_accumulator(self._avg_squared_grad_acc_str, p)
self._add_accumulator(self._avg_squared_update_acc_str, p)
self._already_create_accumulater.add(p.name)
def _append_optimize_op(self, block, param_and_grad):
if isinstance(param_and_grad, dict):
......
......@@ -139,14 +139,11 @@ class Adagrad(Optimizer):
parameters = self._update_param_group(parameters)
for p in parameters:
if p.name in self._already_create_accumulater:
continue
self._add_accumulator(
self._moment_acc_str,
p,
fill_value=self.initial_accumulator_value,
)
self._already_create_accumulater.add(p.name)
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
......
......@@ -317,12 +317,9 @@ class Adam(Optimizer):
# Create accumulator tensors for first and second moments
for p in parameters:
if p.name in self._already_create_accumulater:
continue
if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
master_p = self._create_master_weight(p)
self._add_moments_pows(master_p)
self._already_create_accumulater.add(p.name)
continue
if (
self._is_dtype_fp16_or_bf16(p.dtype)
......@@ -333,7 +330,6 @@ class Adam(Optimizer):
"Consider using multi_precision=True option of the Adam optimizer."
)
self._add_moments_pows(p)
self._already_create_accumulater.add(p.name)
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
......@@ -364,6 +360,8 @@ class Adam(Optimizer):
# create the adam optimize op
if framework.in_dygraph_mode():
found_inf = self._get_auxiliary_var('found_inf')
_beta1 = (
self._beta1
if not isinstance(self._beta1, Variable)
......@@ -384,7 +382,7 @@ class Adam(Optimizer):
beta1_pow_acc,
beta2_pow_acc,
master_weight,
None,
found_inf,
_beta1,
_beta2,
self._epsilon,
......@@ -695,28 +693,21 @@ class Adam(Optimizer):
if master_weight is not None
else None
)
found_inf = self._get_auxiliary_var('found_inf')
if found_inf:
if isinstance(found_inf, core.eager.Tensor):
self._set_auxiliary_var('found_inf', True)
else:
if isinstance(found_inf, core.eager.Tensor):
self._set_auxiliary_var('found_inf', False)
_, _, _, _, _, _ = _C_ops.merged_adam_(
self._param_dict[key][param_group_idx],
grad_dict[key],
lr_dict[key],
self._moment1_dict[key][param_group_idx],
self._moment2_dict[key][param_group_idx],
self._beta1_pow_acc_dict[key][param_group_idx],
self._beta2_pow_acc_dict[key][param_group_idx],
master_weight,
_beta1,
_beta2,
self._epsilon,
find_master,
False,
)
_, _, _, _, _, _ = _C_ops.merged_adam_(
self._param_dict[key][param_group_idx],
grad_dict[key],
lr_dict[key],
self._moment1_dict[key][param_group_idx],
self._moment2_dict[key][param_group_idx],
self._beta1_pow_acc_dict[key][param_group_idx],
self._beta2_pow_acc_dict[key][param_group_idx],
master_weight,
_beta1,
_beta2,
self._epsilon,
find_master,
False,
)
else:
inputs = {
"Param": self._param_dict[key][param_group_idx],
......
......@@ -176,8 +176,6 @@ class Adamax(Optimizer):
# Create accumulator tensors for first moment and infinity norm
for p in parameters:
if p.name in self._already_create_accumulater:
continue
self._add_accumulator(self._moment_acc_str, p)
self._add_accumulator(self._inf_norm_acc_str, p)
self._add_accumulator(
......@@ -186,7 +184,6 @@ class Adamax(Optimizer):
fill_value=self._beta1,
shape=[1],
)
self._already_create_accumulater.add(p.name)
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
......
......@@ -281,7 +281,6 @@ class AdamW(Optimizer):
self._use_multi_tensor = None
self.regularization = None
self._auxiliary_vars = {}
self._already_create_accumulater = set()
def _set_auxiliary_var(self, key, val):
self._auxiliary_vars[key] = val
......@@ -423,12 +422,9 @@ class AdamW(Optimizer):
# Create accumulator tensors for first and second moments
for p in parameters:
if p.name in self._already_create_accumulater:
continue
if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype):
master_p = self._create_master_weight(p)
self._add_moments_pows(master_p)
self._already_create_accumulater.add(p.name)
continue
if (
self._is_dtype_fp16_or_bf16(p.dtype)
......@@ -439,7 +435,6 @@ class AdamW(Optimizer):
"Consider using multi_precision=True option of the Adam optimizer."
)
self._add_moments_pows(p)
self._already_create_accumulater.add(p.name)
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
......@@ -496,6 +491,7 @@ class AdamW(Optimizer):
else self._beta2.numpy().item(0)
)
found_inf = self._get_auxiliary_var('found_inf')
_, _, _, _, _, _ = _C_ops.adamw_(
param_and_grad[0],
param_and_grad[1],
......@@ -505,7 +501,7 @@ class AdamW(Optimizer):
beta1_pow_acc,
beta2_pow_acc,
master_weight,
None,
found_inf,
_beta1,
_beta2,
self._epsilon,
......
......@@ -190,15 +190,11 @@ class Lamb(Optimizer):
# Create accumulator tensors for first and second moments
for p in parameters:
if p.name in self._already_create_accumulater:
continue
if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
master_p = self._create_master_weight(p)
self._add_moments_pows(master_p)
self._already_create_accumulater.add(p.name)
else:
self._add_moments_pows(p)
self._already_create_accumulater.add(p.name)
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
......@@ -297,6 +293,7 @@ class Lamb(Optimizer):
self._used_master_weights[p_name] = master_weight.name
else:
master_weight = None
found_inf = self._get_auxiliary_var('found_inf')
if framework.in_dygraph_mode():
_C_ops.lamb_(
......@@ -308,7 +305,7 @@ class Lamb(Optimizer):
beta1_pow_acc,
beta2_pow_acc,
master_weight,
None,
found_inf,
weight_decay,
self._beta1,
self._beta2,
......@@ -346,7 +343,6 @@ class Lamb(Optimizer):
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
found_inf = self._get_auxiliary_var('found_inf')
if found_inf:
inputs["SkipUpdate"] = found_inf
......
......@@ -270,12 +270,9 @@ class Momentum(Optimizer):
parameters = self._update_param_group(parameters)
for p in parameters:
if p.name in self._already_create_accumulater:
continue
if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
master_p = self._create_master_weight(p)
self._add_accumulator(self._velocity_acc_str, master_p)
self._already_create_accumulater.add(p.name)
continue
if (
p.dtype == core.VarDesc.VarType.FP16
......@@ -286,7 +283,6 @@ class Momentum(Optimizer):
"Consider using multi_precision=True option of the Momentum optimizer."
)
self._add_accumulator(self._velocity_acc_str, p)
self._already_create_accumulater.add(p.name)
def _create_regularization_of_grad(self, param, grad, regularization=None):
"""Create and add backward regularization Operators
......@@ -534,30 +530,19 @@ class Momentum(Optimizer):
)
if in_dygraph_mode():
found_inf = self._get_auxiliary_var('found_inf')
if found_inf:
if isinstance(found_inf, core.eager.Tensor):
self._set_auxiliary_var('found_inf', True)
else:
if isinstance(found_inf, core.eager.Tensor):
self._set_auxiliary_var('found_inf', False)
_, _, _ = _C_ops.merged_momentum_(
self._param_dict[key][param_group_idx],
grad_dict[key],
self._velocity_dict[key][param_group_idx],
lr_dict[key],
master_weight,
self._momentum,
self._use_nesterov,
self._regularization_method_dict[key][
param_group_idx
],
self._regularization_coeff_dict[key][
param_group_idx
],
find_master,
self._rescale_grad,
)
_, _, _ = _C_ops.merged_momentum_(
self._param_dict[key][param_group_idx],
grad_dict[key],
self._velocity_dict[key][param_group_idx],
lr_dict[key],
master_weight,
self._momentum,
self._use_nesterov,
self._regularization_method_dict[key][param_group_idx],
self._regularization_coeff_dict[key][param_group_idx],
find_master,
self._rescale_grad,
)
else:
inputs = {
"Param": self._param_dict[key][param_group_idx],
......
......@@ -275,7 +275,6 @@ class Optimizer:
self._param_dict = self._create_multi_tensor_dict()
self._auxiliary_vars = {}
self._already_create_accumulater = set()
def _set_auxiliary_var(self, key, val):
self._auxiliary_vars[key] = val
......@@ -924,38 +923,31 @@ class Optimizer:
self._create_accumulators(target_block, params_acc_dict)
if framework._non_static_mode():
found_inf = self._get_auxiliary_var('found_inf')
if found_inf:
if isinstance(found_inf, core.eager.Tensor):
self._set_auxiliary_var('found_inf', True)
if isinstance(parameters_and_grads, list):
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None:
continue
if param_and_grad[0].stop_gradient is False:
self._append_optimize_op(
target_block, param_and_grad
)
else:
if isinstance(found_inf, core.eager.Tensor):
self._set_auxiliary_var('found_inf', False)
if isinstance(parameters_and_grads, list):
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None:
continue
if param_and_grad[0].stop_gradient is False:
self._append_optimize_op(
target_block, param_and_grad
)
else:
for param_and_grad in parameters_and_grads['params']:
if param_and_grad[1] is None:
continue
if param_and_grad[0].stop_gradient is False:
param_grad_dict = dict()
param_grad_dict['params'] = param_and_grad
param_grad_dict.update(
{
k: v
for k, v in parameters_and_grads.items()
if k != 'params'
}
)
self._append_optimize_op(
target_block, param_grad_dict
)
for param_and_grad in parameters_and_grads['params']:
if param_and_grad[1] is None:
continue
if param_and_grad[0].stop_gradient is False:
param_grad_dict = dict()
param_grad_dict['params'] = param_and_grad
param_grad_dict.update(
{
k: v
for k, v in parameters_and_grads.items()
if k != 'params'
}
)
self._append_optimize_op(
target_block, param_grad_dict
)
else:
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is None:
......
......@@ -199,12 +199,9 @@ class RMSProp(Optimizer):
parameters = parameters.get('params')
for p in parameters:
if p.name in self._already_create_accumulater:
continue
self._add_accumulator(self._momentum_acc_str, p)
self._add_accumulator(self._mean_square_acc_str, p)
self._add_accumulator(self._mean_grad_acc_str, p)
self._already_create_accumulater.add(p.name)
def _append_optimize_op(self, block, param_and_grad):
if not isinstance(block, framework.Block):
......
......@@ -129,11 +129,8 @@ class SGD(Optimizer):
# Create accumulator tensors for first and second moments
for p in parameters:
if p.name in self._already_create_accumulater:
continue
if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
master_p = self._create_master_weight(p)
self._already_create_accumulater.add(p.name)
continue
if (
p.dtype == core.VarDesc.VarType.FP16
......