Unverified commit 6eaed2da, authored by LoneRanger, committed by GitHub

remove the optimizer base and learning rate base (#56099)

* remove the optimizer base and learning rate base

* fix bug

* fix bug
Parent f60c698f
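For call sites touched by this PR, the replacement for the removed `paddle.fluid.optimizer.Optimizer` base is `paddle.optimizer.Optimizer`, with `parameters`/`weight_decay` in place of the old `parameter_list`/`regularization` keywords. Below is a minimal sketch of the new-style construction in dygraph mode, assuming the Paddle 2.x API; the model and data are placeholders, not part of this diff.

import paddle

# Placeholder model; any paddle.nn.Layer works here.
model = paddle.nn.Linear(10, 1)

# paddle.optimizer.* replaces paddle.fluid.optimizer.*; note the renamed
# keywords: parameters (was parameter_list) and weight_decay (was regularization).
opt = paddle.optimizer.Momentum(
    learning_rate=0.1,
    momentum=0.9,
    parameters=model.parameters(),
    weight_decay=paddle.regularizer.L2Decay(1e-4),
    grad_clip=paddle.nn.ClipGradByNorm(clip_norm=1.0),
)

x = paddle.randn([4, 10])
loss = paddle.mean(model(x))
loss.backward()
opt.step()
opt.clear_grad()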
......@@ -250,7 +250,6 @@ def _is_valid_optimizer(optimizer):
optimizer,
(
paddle.optimizer.Optimizer,
paddle.fluid.optimizer.Optimizer,
DygraphShardingOptimizer,
),
)
......@@ -260,7 +259,7 @@ def check_optimizers(optimizers):
for optimizer in optimizers:
if not _is_valid_optimizer(optimizer):
raise RuntimeError(
"Current train mode is pure fp16, optimizers should be paddle.optimizer.Optimizer or paddle.fluid.optimizer.Optimizer or DygraphShardingOptimizer, but receive {}.".format(
"Current train mode is pure fp16, optimizers should be paddle.optimizer.Optimizer or DygraphShardingOptimizer, but receive {}.".format(
type(optimizer)
)
)
......
......@@ -146,11 +146,10 @@ class Engine:
if optimizer and not isinstance(
optimizer,
(paddle.optimizer.Optimizer, paddle.static.Optimizer),
(paddle.optimizer.Optimizer),
):
raise TypeError(
"'optimizer' must be object of class `paddle.optimizer.Optimizer`"
" or `paddle.static.Optimizer`."
)
self._optimizer = auto_utils.validate_opt(optimizer)
......
......@@ -20,11 +20,11 @@ __all__ = []
import paddle
from paddle.common_ops_import import LayerHelper
from paddle.fluid import framework
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid.optimizer import Optimizer
from paddle.framework import core, in_dynamic_mode
from paddle.nn.clip import ClipGradByNorm, append_gradient_clip_ops
from paddle.optimizer import Momentum
from paddle.optimizer import Momentum, Optimizer
from paddle.regularizer import L1Decay, L2Decay
from paddle.static import create_global_var
......@@ -58,8 +58,8 @@ class DGCMomentumOptimizer(Optimizer):
assert momentum is not None
super().__init__(
learning_rate=learning_rate,
parameter_list=parameter_list,
regularization=regularization,
parameters=parameter_list,
weight_decay=regularization,
grad_clip=grad_clip,
name=name,
)
......@@ -396,6 +396,55 @@ class DGCMomentumOptimizer(Optimizer):
op_maker.kOpRoleVarAttrName(), [param_var.name, grad_var.name]
)
def _process_distribute_lookuptable(self, param_grads):
"""
Because the distributed lookup table currently only supports the SGD optimizer
(and no regularization), find the table parameter, skip regularization and
other ops for it, and append a separate sgd optimize op for it independently.
:param param_grads(list((Var, Var))): list of (param, grad) pairs.
"""
from paddle.distributed.distribute_lookup_table import (
find_distributed_lookup_table,
)
program = framework.default_main_program()
global_block = framework.default_main_program().global_block()
table_name = find_distributed_lookup_table(program)
table_param = None
table_grad = None
new_param_grads = []
for p, g in param_grads:
if p.name == table_name:
if table_param is not None:
raise RuntimeError(
"multi dist table var found, only support one now!"
)
table_param = p
table_grad = g
else:
new_param_grads.append((p, g))
sgd_op = None
if table_param is not None:
param_and_grad = [table_param, table_grad]
with table_param.block.program._optimized_guard(
param_and_grad
), framework.name_scope("optimizer"):
self._create_global_learning_rate()
# create the optimize op
sgd_op = global_block.append_op(
type='sgd',
inputs={
"Param": table_param,
"Grad": table_grad,
"LearningRate": self._create_param_lr(param_and_grad),
},
outputs={"ParamOut": param_and_grad[0]},
)
return new_param_grads, (table_param, table_grad), sgd_op
@imperative_base.no_grad()
def apply_gradients(self, params_grads):
# Note: since we can't use all_reduce_op now,
......@@ -532,7 +581,7 @@ class DGCOptimizer(MetaOptimizerBase):
def apply_optimize(self, loss, startup_program, params_grads):
self._init_dgc_opt()
return self.dgc_opt.apply_optimize(
return self.dgc_opt._apply_optimize(
loss, startup_program=startup_program, params_grads=params_grads
)
......
......@@ -53,7 +53,6 @@ from . import initializer
from .initializer import set_global_initializer
from . import layers
from . import dygraph
from . import optimizer
from . import backward
from .backward import gradients
from . import incubate
......@@ -109,7 +108,6 @@ __all__ = (
'disable_dygraph',
'enable_imperative',
'disable_imperative',
'optimizer',
'backward',
'LoDTensor',
'LoDTensorArray',
......
......@@ -18,9 +18,6 @@ from .base import *
from . import tracer
from .tracer import *
from . import learning_rate_scheduler
from .learning_rate_scheduler import *
__all__ = []
__all__ += base.__all__
__all__ += learning_rate_scheduler.__all__
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import warnings
import numpy as np
import paddle
from .. import unique_name
from ..framework import Variable
from ..data_feeder import check_type
__all__ = []
class LearningRateDecay:
"""
Base class of learning rate decay.
Defines the common interface of a LearningRateDecay.
Users should not use this class directly,
but should use one of its implementations.
"""
def __init__(self, begin=0, step=1, dtype='float32'):
self.step_num = begin
self.step_size = step
self.dtype = dtype
def __call__(self):
lr = self.step()
if isinstance(lr, float):
lr = self.create_lr_var(lr)
self.step_num += self.step_size
return lr
def create_lr_var(self, lr):
"""
Convert lr from a Python float to a Variable.
Args:
lr: learning rate
Returns:
learning rate variable
"""
from .. import layers
lr = paddle.static.create_global_var(
name=unique_name.generate("learning_rate"),
shape=[1],
value=float(lr),
dtype=self.dtype,
persistable=False,
)
return lr
# Note: If you want to change what optimizer.state_dict stores, just override this function;
# "self.step_num" will be stored by default.
def state_dict(self):
"""
Returns the state of the scheduler as a :class:`dict`.
It is a subset of self.__dict__ .
"""
self._state_keys()
state_dict = {}
for key in self.keys:
if key not in self.__dict__:
continue
value = self.__dict__[key]
if isinstance(value, Variable):
assert (
value.size == 1
), "the size of Variable in state_dict must be 1, but its size is {} with shape {}".format(
value.size, value.shape
)
value = value.item()
state_dict[key] = value
return state_dict
def _state_keys(self):
"""
Set the keys in self.__dict__ that need to be saved.
"""
self.keys = ['step_num']
def set_state_dict(self, state_dict):
"""
Loads the scheduler's state.
"""
self._state_keys()
for key in self.keys:
if key in state_dict:
self.__dict__[key] = state_dict[key]
else:
raise RuntimeError(
"Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict".format(
key
)
)
if len(state_dict) > len(self.keys):
warnings.warn(
"There are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict"
)
# [aliases] Compatible with old method names
set_dict = set_state_dict
def step(self):
raise NotImplementedError()
class _LearningRateEpochDecay(LearningRateDecay):
"""
:api_attr: imperative
Base class of learning rate decay, which is updated each epoch.
Defines the common interface of a _LearningRateEpochDecay.
Users should not use this class directly,
but should use one of its implementations and invoke its `epoch()` method once per epoch.
"""
def __init__(self, learning_rate, dtype=None):
if not isinstance(learning_rate, (float, int)):
raise TypeError(
"The type of 'learning_rate' must be 'float, int', but received %s."
% type(learning_rate)
)
if learning_rate < 0:
raise ValueError("Invalid learning rate: {}".format(learning_rate))
self.base_lr = float(learning_rate)
self.epoch_num = -1
self.dtype = dtype
if dtype is None:
self.dtype = "float32"
self.learning_rate = self.create_lr_var(self.base_lr)
self.epoch()
# For those subclass who overload _LearningRateEpochDecay, "self.epoch_num/learning_rate" will be stored by default.
# you can change it for your subclass.
def _state_keys(self):
self.keys = ['epoch_num', 'learning_rate']
def __call__(self):
"""
Return the last computed learning rate for the current epoch.
"""
if not isinstance(self.learning_rate, Variable):
self.learning_rate = self.create_lr_var(self.learning_rate)
return self.learning_rate
def epoch(self, epoch=None):
"""
Compute learning_rate and update it when invoked.
"""
if epoch is None:
self.epoch_num += 1
else:
self.epoch_num = epoch
self.learning_rate = self.get_lr()
def get_lr(self):
raise NotImplementedError
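With `LearningRateDecay` and `_LearningRateEpochDecay` removed, `paddle.optimizer.lr.LRScheduler` is the remaining base class for learning-rate schedules, and it keeps an equivalent `state_dict`/`set_state_dict` contract. The following is a minimal custom schedule under that base, as a sketch of the migration path assuming the Paddle 2.x API; the inverse-time decay rule is only illustrative.

import paddle
from paddle.optimizer.lr import LRScheduler

class InverseTimeLR(LRScheduler):
    # Illustrative rule: lr = base_lr / (1 + gamma * epoch).
    def __init__(self, learning_rate, gamma=0.1, last_epoch=-1, verbose=False):
        self.gamma = gamma  # set custom attributes before calling the base __init__
        super().__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        return self.base_lr / (1.0 + self.gamma * self.last_epoch)

scheduler = InverseTimeLR(learning_rate=0.5, gamma=0.1)
linear = paddle.nn.Linear(10, 1)
adam = paddle.optimizer.Adam(learning_rate=scheduler, parameters=linear.parameters())

# The scheduler now owns the step counter; its state round-trips much like the
# removed class's state_dict/set_state_dict above.
state = scheduler.state_dict()   # e.g. {'last_epoch': 0, 'last_lr': 0.5}
scheduler.step()                 # advance one epoch and recompute last_lr
scheduler.set_state_dict(state)  # restore the saved counter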
This diff is collapsed.
......@@ -290,10 +290,8 @@ class DistributedOptimizer(metaclass=abc.ABCMeta):
"""
def __init__(self, optimizer, strategy=None):
if (
not isinstance(optimizer, SGD.__bases__)
and not isinstance(optimizer, fluid.optimizer.Optimizer)
and not isinstance(optimizer, OptimizerWithMixedPrecision)
if not isinstance(optimizer, SGD.__bases__) and not isinstance(
optimizer, OptimizerWithMixedPrecision
):
raise TypeError("optimizer must be an instance of Optimizer")
......
......@@ -28,7 +28,6 @@ from paddle.static import (
Executor,
)
from paddle.fluid.compiler import CompiledProgram
from paddle.fluid.optimizer import Optimizer
from paddle.distributed.transpiler.distribute_transpiler import (
DistributeTranspilerConfig,
......
......@@ -19,8 +19,8 @@ from paddle.fluid import core, unique_name
from paddle.fluid.executor import global_scope
from paddle.fluid.framework import Variable, name_scope
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.optimizer import Optimizer
from paddle.nn import ClipGradByGlobalNorm
from paddle.optimizer import Optimizer
def init_communicator(block, rank, ranks, ring_id):
......
......@@ -27,7 +27,6 @@ from paddle.fluid.framework import (
default_startup_program,
in_dygraph_mode,
)
from paddle.fluid.optimizer import Optimizer
__all__ = []
......@@ -99,7 +98,6 @@ class PipelineOptimizer:
if in_dygraph_mode():
raise Exception("In dygraph, don't support PipelineOptimizer.")
valid_optimizers = (
Optimizer,
paddle.optimizer.Optimizer,
paddle.static.amp.decorator.OptimizerWithMixedPrecision,
)
......
......@@ -1114,9 +1114,6 @@ class Optimizer:
end = len(target_block.ops)
return target_block._slice_ops(start, end)
def _append_dgc_ops(self, param_and_grad):
pass
def backward(
self,
loss,
......@@ -1205,9 +1202,6 @@ class Optimizer:
params_grads = append_backward(
loss, parameter_list, act_no_grad_set, callbacks
)
# Note: since we can't use all_reduce_op now,
# dgc_op should be the last op of one grad.
self._append_dgc_ops(params_grads)
return params_grads
def apply_gradients(self, params_grads):
......
......@@ -70,7 +70,6 @@ from ..fluid.framework import ipu_shard_guard # noqa: F401
from ..fluid.framework import set_ipu_shard # noqa: F401
from .nn.control_flow import Print # noqa: F401
from ..fluid.param_attr import WeightNormParamAttr # noqa: F401
from ..fluid.optimizer import Optimizer # noqa: F401
from .nn.metric import auc # noqa: F401
......
......@@ -37,10 +37,10 @@ from .function_overload import FunctionType, overload
def _set_multi_precision(optimizer, multi_precision):
if not isinstance(
optimizer,
(paddle.optimizer.Optimizer, paddle.fluid.optimizer.Optimizer),
(paddle.optimizer.Optimizer),
):
raise RuntimeError(
"Current AMP training level is O2, optimizer is expected to be paddle.optimizer.Optimizer or paddle.fluid.optimizer.Optimizer, but receive {}.".format(
"Current AMP training level is O2, optimizer is expected to be paddle.optimizer.Optimizer, but receive {}.".format(
type(optimizer)
)
)
......
......@@ -1126,7 +1126,7 @@ class TestLoadSliceVar(TranspilerTest):
y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32')
cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y)
avg_cost = paddle.mean(cost)
optimizer = fluid.optimizer.RMSProp(learning_rate=0.1)
optimizer = paddle.optimizer.RMSProp(learning_rate=0.1)
optimizer.minimize(avg_cost)
def transpiler_test_impl(self):
......
......@@ -22,9 +22,9 @@ import paddle
from paddle import fluid
from paddle.fluid import core
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
from paddle.nn import Embedding
from paddle.optimizer import Adam
from paddle.optimizer.lr import LRScheduler
class SimpleLSTMRNN(paddle.nn.Layer):
......@@ -552,7 +552,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
if isinstance(adam._learning_rate, LearningRateDecay):
if isinstance(adam._learning_rate, LRScheduler):
adam._learning_rate.step_num = 0
adam.set_state_dict(self.opti_dict)
......@@ -673,7 +673,7 @@ class TestDygraphPtbRnn(unittest.TestCase):
else:
np_opti_dict[k] = v
if isinstance(adam._learning_rate, LearningRateDecay):
if isinstance(adam._learning_rate, LRScheduler):
adam._learning_rate.step_num = 0
adam.set_state_dict(np_opti_dict)
......