Unverified commit a1b99fae, authored by Jiawei Wang, committed by GitHub

Adadelta Optimizer (#26590)

* add doc; notest

* fix doc; notest

* update doc; notest

* refine optimizer && adam

* refine optimizer; notest

* add adam

* fix doc

* fix doc && add adamw; notest

* add error message

* bug fix

* refine rmsprop && adamax

* fix ci

* bug fix

* update comment

* unify arguments place; notest

* fix ut, test=develop

* bug fix

* fix conflicts, test=develop

* add examples code

* bug fix

* fix comments

* fix sample code

* add sample code for Optimizer

* add adamax ut, test=develop

* fix rmsprop ut, test=develop

* add ut for optimizer.py and adamw.py

* first commit of adadelta optimizer

* fix learning rate

* fix adadelta doc and add sgd momentum

* remove unused fluid

* fix codestyle

* Update test_adam_op.py

* Update test_adam_op.py

* fix SGD in 2 unittests

* fix SGD in 2 unittests

* fix ci

* fix ut
Co-authored-by: MRXLT <xlt2024@gmail.com>
Co-authored-by: mapingshuo <mps2012@yeah.net>
Parent: 346689c6
@@ -24,49 +24,69 @@ class AdadeltaOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;

   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("AvgSquaredGrad"),
-                   "Input(AvgSquaredGrad) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"),
-                   "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Param").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Grad").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("AvgSquaredGradOut"),
-        "Output(AvgSquaredGradOut) of AdadeltaOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("AvgSquaredUpdateOut"),
-        "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(Param) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(Grad) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("AvgSquaredGrad"), true,
+        platform::errors::InvalidArgument(
+            "Input(AvgSquaredGrad) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("AvgSquaredUpdate"), true,
+        platform::errors::InvalidArgument(
+            "Input(AvgSquaredUpdate) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputsVarType("Param").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        true,
+        platform::errors::InvalidArgument(
+            "The input var's type should be LoDTensor, but the received is %s",
+            ctx->Inputs("Param").front(),
+            ctx->GetInputsVarType("Param").front()));
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputsVarType("Grad").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        true,
+        platform::errors::InvalidArgument(
+            "The input var's type should be LoDTensor, but the received is %s",
+            ctx->Inputs("Grad").front(),
+            ctx->GetInputsVarType("Grad").front()));
+
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("ParamOut"), true,
+        platform::errors::InvalidArgument(
+            "Output(ParamOut) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("AvgSquaredGradOut"), true,
+        platform::errors::InvalidArgument(
+            "Output(AvgSquaredGradOut) of AdadeltaOp should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("AvgSquaredUpdateOut"), true,
+        platform::errors::InvalidArgument(
+            "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null."));

     auto param_dim = ctx->GetInputDim("Param");
     PADDLE_ENFORCE_EQ(
         param_dim, ctx->GetInputDim("Grad"),
         "param and grad input of AdadeltaOp should have same dimension");
-    PADDLE_ENFORCE_NE(framework::product(ctx->GetInputDim("AvgSquaredGrad")), 0,
-                      "Maybe the Input variable AvgSquaredGrad has not "
-                      "been initialized. You may need to confirm if you put "
-                      "exe.run(startup_program) after optimizer.minimize "
-                      "function.");
+    PADDLE_ENFORCE_NE(
+        framework::product(ctx->GetInputDim("AvgSquaredGrad")), 0,
+        platform::errors::InvalidArgument(
+            "Maybe the Input variable AvgSquaredGrad has not "
+            "been initialized. You may need to confirm if you put "
+            "exe.run(startup_program) after optimizer.minimize "
+            "function."));
     PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"),
-                      "Param and AvgSquaredGrad input of AdadeltaOp "
-                      "should have same dimension");
+                      platform::errors::InvalidArgument(
+                          "Param and AvgSquaredGrad input of AdadeltaOp "
+                          "should have same dimension"));
     PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"),
-                      "Param and AvgSquaredUpdate input of AdadeltaOp "
-                      "should have same dimension");
+                      platform::errors::InvalidArgument(
+                          "Param and AvgSquaredUpdate input of AdadeltaOp "
+                          "should have same dimension"));

     ctx->SetOutputDim("ParamOut", param_dim);
     ctx->SetOutputDim("AvgSquaredGradOut", param_dim);
...
@@ -24,17 +24,19 @@ class AdadeltaOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE(param_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.InputNames("Param").front(),
-                   framework::ToTypeName(param_var->Type()));
+    PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Param").front(),
+                          framework::ToTypeName(param_var->Type())));
     const auto* grad_var = ctx.InputVar("Grad");
-    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>(),
-                   "The Var(%s)'s type should be LoDTensor, "
-                   "but the received is %s",
-                   ctx.InputNames("Grad").front(),
-                   framework::ToTypeName(grad_var->Type()));
+    PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Grad").front(),
+                          framework::ToTypeName(grad_var->Type())));

     auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
     auto avg_squared_grad_out_tensor =
...
@@ -23,22 +23,27 @@ class TopkOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;

   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of TopkOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of TopkOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Indices"),
-                   "Output(Indices) of TopkOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(X) of TopkOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
+                      platform::errors::InvalidArgument(
+                          "Output(Out) of TopkOp should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Indices"), true,
+                      platform::errors::InvalidArgument(
+                          "Output(Indices) of TopkOp should not be null."));

     auto input_dims = ctx->GetInputDim("X");
     const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));

     PADDLE_ENFORCE_GE(k, 1, "k must >= 1");
-    PADDLE_ENFORCE_GE(input_dims.size(), 1, "input must have >= 1d shape");
+    PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument(
+                                                "input must have >= 1d shape"));

     if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k,
-                        "input must have >= k columns");
+      PADDLE_ENFORCE_GE(
+          input_dims[input_dims.size() - 1], k,
+          platform::errors::InvalidArgument("input must have >= k columns"));
     }

     framework::DDim dims = input_dims;
...
@@ -43,8 +43,9 @@ template <typename DeviceContext, typename T>
 class TopkOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::InvalidArgument("It must use CUDAPlace."));
     auto* input = ctx.Input<Tensor>("X");
     auto* output = ctx.Output<Tensor>("Out");
     auto* indices = ctx.Output<Tensor>("Indices");
...
@@ -17,6 +17,8 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle
+import paddle.fluid as fluid


 class TestAdadeltaOp1(OpTest):
@@ -108,5 +110,54 @@ class TestAdadeltaOp2(OpTest):
         self.check_output()

class TestAdadeltaV2(unittest.TestCase):
    def test_adadelta_dygraph(self):
        paddle.disable_static(paddle.CPUPlace())
        value = np.arange(26).reshape(2, 13).astype("float32")
        a = paddle.to_tensor(value)
        linear = paddle.nn.Linear(13, 5)
        # This can be any optimizer supported by dygraph.
        adadelta = paddle.optimizer.Adadelta(
            learning_rate=0.01,
            parameters=linear.parameters(),
            weight_decay=0.01)
        out = linear(a)
        out.backward()
        adadelta.step()
        adadelta.clear_gradients()

    def test_adadelta(self):
        place = fluid.CPUPlace()
        main = fluid.Program()
        with fluid.program_guard(main):
            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
            y_predict = fluid.layers.fc(input=x, size=1, act=None)
            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
            avg_cost = fluid.layers.mean(cost)

            adadelta_optimizer = paddle.optimizer.Adadelta(learning_rate=0.1)
            adadelta_optimizer.minimize(avg_cost)

            fetch_list = [avg_cost]
            train_reader = paddle.batch(
                paddle.dataset.uci_housing.train(), batch_size=1)
            feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())
            for data in train_reader():
                exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)

    def test_raise_error(self):
        self.assertRaises(ValueError, paddle.optimizer.Adadelta, None)
        self.assertRaises(
            ValueError, paddle.optimizer.Adadelta, learning_rate=0.1, rho=None)
        self.assertRaises(
            ValueError,
            paddle.optimizer.Adadelta,
            learning_rate=0.1,
            epsilon=None)

if __name__ == "__main__":
    unittest.main()
@@ -43,7 +43,7 @@ class TestFleetBase(unittest.TestCase):
         role = role_maker.PaddleCloudRoleMaker(is_collective=True)
         fleet.init(role)
         strategy = fleet.DistributedStrategy()
-        optimizer = paddle.optimizer.SGD(learning_rate=0.001)
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001)
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)
...
@@ -658,7 +658,7 @@ class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase):
 class TestImperativePipelineOptimizer(TestImperativeOptimizerBase):
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(learning_rate=0.5,
-                                         parameter_list=parameter_list)
+                                         parameters=parameter_list)
         optimizer = PipelineOptimizer(optimizer)
         return optimizer
@@ -670,7 +670,7 @@ class TestImperativePipelineOptimizer(TestImperativeOptimizerBase):
 class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase):
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(learning_rate=0.5,
-                                         parameter_list=parameter_list)
+                                         parameters=parameter_list)
         optimizer = LookaheadOptimizer(optimizer, alpha=0.5, k=5)
         return optimizer
@@ -682,7 +682,7 @@ class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase):
 class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase):
     def get_optimizer_dygraph(self, parameter_list):
         optimizer = paddle.optimizer.SGD(learning_rate=0.5,
-                                         parameter_list=parameter_list)
+                                         parameters=parameter_list)
         optimizer = RecomputeOptimizer(optimizer)
         return optimizer
...
@@ -19,6 +19,8 @@ import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 from op_test import OpTest
+import paddle
+import paddle.fluid as fluid


 class TestMomentumOp1(OpTest):
@@ -234,5 +236,48 @@ class TestSparseMomentumOp2(TestSparseMomentumOp):
         self.use_nesterov = True

class TestMomentumV2(unittest.TestCase):
    def test_momentum_dygraph(self):
        paddle.disable_static()
        value = np.arange(26).reshape(2, 13).astype("float32")
        a = paddle.to_tensor(value)
        linear = paddle.nn.Linear(13, 5)
        # This can be any optimizer supported by dygraph.
        momentum = paddle.optimizer.Momentum(
            learning_rate=0.01, momentum=0.9, parameters=linear.parameters())
        out = linear(a)
        out.backward()
        momentum.step()
        momentum.clear_gradients()

    def test_momentum(self):
        place = fluid.CPUPlace()
        main = fluid.Program()
        with fluid.program_guard(main):
            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
            y_predict = fluid.layers.fc(input=x, size=1, act=None)
            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
            avg_cost = fluid.layers.mean(cost)

            momentum_optimizer = paddle.optimizer.Momentum(
                learning_rate=0.1, momentum=0.9)
            momentum_optimizer.minimize(avg_cost)

            fetch_list = [avg_cost]
            train_reader = paddle.batch(
                paddle.dataset.uci_housing.train(), batch_size=1)
            feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())
            for data in train_reader():
                exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)

    def test_raise_error(self):
        self.assertRaises(
            ValueError, paddle.optimizer.Momentum, learning_rate=None)
        self.assertRaises(ValueError, paddle.optimizer.Momentum, momentum=None)

if __name__ == "__main__":
    unittest.main()
@@ -20,6 +20,7 @@ import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 from op_test import OpTest
+import paddle


 class TestSGDOp(OpTest):
@@ -208,5 +209,46 @@ class TestSGDOpWithLargeInput(unittest.TestCase):
         result = exe.run(compiled_prog, fetch_list=[avg_cost])

class TestSGDV2(unittest.TestCase):
    def test_sgd_dygraph(self):
        paddle.disable_static()
        value = np.arange(26).reshape(2, 13).astype("float32")
        a = paddle.to_tensor(value)
        linear = paddle.nn.Linear(13, 5)
        # This can be any optimizer supported by dygraph.
        sgd = paddle.optimizer.SGD(learning_rate=0.01,
                                   parameters=linear.parameters(),
                                   weight_decay=0.01)
        out = linear(a)
        out.backward()
        sgd.step()
        sgd.clear_gradients()

    def test_sgd(self):
        place = fluid.CPUPlace()
        main = fluid.Program()
        with fluid.program_guard(main):
            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
            y_predict = fluid.layers.fc(input=x, size=1, act=None)
            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
            avg_cost = fluid.layers.mean(cost)

            sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.1)
            sgd_optimizer.minimize(avg_cost)

            fetch_list = [avg_cost]
            train_reader = paddle.batch(
                paddle.dataset.uci_housing.train(), batch_size=1)
            feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())
            for data in train_reader():
                exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)

    def test_raise_error(self):
        self.assertRaises(ValueError, paddle.optimizer.SGD, learning_rate=None)

if __name__ == "__main__":
    unittest.main()
@@ -26,9 +26,8 @@ __all__ = [
 ]

-from ..fluid.optimizer import SGD, Momentum, Adagrad, Dpsgd, DecayedAdagrad, \
-    Ftrl, Adadelta, \
-    SGDOptimizer, MomentumOptimizer, AdagradOptimizer,DpsgdOptimizer,\
+from ..fluid.optimizer import Momentum, Adagrad, Dpsgd, DecayedAdagrad, Ftrl,\
+    AdagradOptimizer,DpsgdOptimizer,\
     DecayedAdagradOptimizer,FtrlOptimizer,AdadeltaOptimizer, \
     ModelAverage, LarsMomentum, DGCMomentumOptimizer, LambOptimizer,\
     ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, \
@@ -39,6 +38,9 @@ from .adam import Adam
 from .adamw import AdamW
 from .adamax import Adamax
 from .rmsprop import RMSProp
+from .adadelta import Adadelta
+from .sgd import SGD
+from .momentum import Momentum
 from . import lr_scheduler
 from .lr_scheduler import _LRScheduler, NoamLR, PiecewiseLR, NaturalExpLR, InverseTimeLR, PolynomialLR, \
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable, name_scope
__all__ = ["Adadelta"]
class Adadelta(Optimizer):
"""
**Notes: This API does not support sparse parameter optimization.**
Adadelta Optimizer. Please refer to this for details:
`ADADELTA: AN ADAPTIVE LEARNING RATE METHOD <https://arxiv.org/abs/1212.5701>`_.
The update is done as follows:
.. math::
E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2
learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \\epsilon ) / ( E(g_t^2) + \\epsilon ) }
E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\_rate)^2
    Args:
        learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
            It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
        epsilon (float): a small float number for numeric stability. Default 1.0e-6.
        rho (float): a floating point value indicating the decay rate. Default 0.95.
        parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, in which case all parameters will be updated.
        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
            It can be a float value as the coefficient of L2 regularization or \
            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
            If a parameter has already set a regularizer using :ref:`api_fluid_ParamAttr`, \
            the regularization setting here in the optimizer will be ignored for this parameter. \
            Otherwise, the regularization setting here in the optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): The default value is None. Normally there is no need for the user
            to set this property. For more information, please refer to
            :ref:`api_guide_Name`.
    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            paddle.disable_static()
            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
            linear = paddle.nn.Linear(10, 10)
            inp = paddle.to_tensor(inp)
            out = linear(inp)
            loss = paddle.mean(out)
            adadelta = paddle.optimizer.Adadelta(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
            out.backward()
            adadelta.step()
            adadelta.clear_grad()

    """
    _avg_squared_grad_acc_str = "_avg_squared_grad"
    _avg_squared_update_acc_str = "_avg_squared_update"

    def __init__(self,
                 learning_rate=0.001,
                 epsilon=1.0e-6,
                 rho=0.95,
                 parameters=None,
                 weight_decay=None,
                 grad_clip=None,
                 name=None):
        if learning_rate is None:
            raise ValueError("learning_rate is not set.")
        if epsilon is None:
            raise ValueError("epsilon is not set.")
        if rho is None:
            raise ValueError("rho is not set.")
        super(Adadelta, self).__init__(
            learning_rate=learning_rate,
            parameters=parameters,
            weight_decay=weight_decay,
            grad_clip=grad_clip,
            name=name)
        self.type = "adadelta"
        self._epsilon = epsilon
        self._rho = rho

    def _create_accumulators(self, block, parameters):
        if not isinstance(block, framework.Block):
            raise TypeError("block is not instance of framework.Block.")

        for p in parameters:
            self._add_accumulator(self._avg_squared_grad_acc_str, p)
            self._add_accumulator(self._avg_squared_update_acc_str, p)

    def _append_optimize_op(self, block, param_and_grad):
        if not isinstance(block, framework.Block):
            raise TypeError("block is not instance of framework.Block.")

        avg_squared_grad_acc = self._get_accumulator(
            self._avg_squared_grad_acc_str, param_and_grad[0])
        avg_squared_update_acc = self._get_accumulator(
            self._avg_squared_update_acc_str, param_and_grad[0])

        # Create the adadelta optimizer op
        adadelta_op = block.append_op(
            type=self.type,
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "AvgSquaredGrad": avg_squared_grad_acc,
                "AvgSquaredUpdate": avg_squared_update_acc
            },
            outputs={
                "ParamOut": param_and_grad[0],
                "AvgSquaredGradOut": avg_squared_grad_acc,
                "AvgSquaredUpdateOut": avg_squared_update_acc
            },
            attrs={"epsilon": self._epsilon,
                   "rho": self._rho},
            stop_gradient=True)

        return adadelta_op
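
For intuition, the update documented in the Adadelta docstring above can be modeled in a few lines of NumPy. This is a minimal sketch of the math only (the helper name adadelta_update is hypothetical); the real computation is done by the C++ adadelta op that the optimizer appends to the program:

import numpy as np

def adadelta_update(param, grad, avg_sq_grad, avg_sq_update,
                    rho=0.95, epsilon=1.0e-6):
    # E(g_t^2) = rho * E(g_{t-1}^2) + (1 - rho) * g^2
    avg_sq_grad = rho * avg_sq_grad + (1 - rho) * grad ** 2
    # learning_rate = sqrt((E(dx_{t-1}^2) + eps) / (E(g_t^2) + eps))
    effective_lr = np.sqrt((avg_sq_update + epsilon) / (avg_sq_grad + epsilon))
    delta = -effective_lr * grad
    # E(dx_t^2) = rho * E(dx_{t-1}^2) + (1 - rho) * dx_t^2
    avg_sq_update = rho * avg_sq_update + (1 - rho) * delta ** 2
    return param + delta, avg_sq_grad, avg_sq_update

# A few steps on f(w) = 0.5 * ||w||^2, whose gradient is w itself.
w = np.ones(3, dtype="float32")
g_acc, dx_acc = np.zeros_like(w), np.zeros_like(w)
for _ in range(3):
    w, g_acc, dx_acc = adadelta_update(w, w, g_acc, dx_acc)

Both accumulators start at zero, which is why the op's InferShape checks that AvgSquaredGrad has actually been initialized before the first step.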
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable, name_scope
__all__ = ["Momentum"]
class Momentum(Optimizer):
"""
Simple Momentum optimizer with velocity state
This optimizer has a flag for Nestrov Momentum.
The update equations are as follows:
.. math::
& velocity = mu * velocity + gradient
& if (use\_nesterov):
&\quad param = param - (gradient + mu * velocity) * learning\_rate
& else:
&\quad param = param - learning\_rate * velocity
    Parameters:
        learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
            It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
        momentum (float): Momentum factor. The default value is 0.9.
        parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, in which case all parameters will be updated.
        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
            It can be a float value as the coefficient of L2 regularization or \
            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
            If a parameter has already set a regularizer using :ref:`api_fluid_ParamAttr`, \
            the regularization setting here in the optimizer will be ignored for this parameter. \
            Otherwise, the regularization setting here in the optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): The default value is None. Normally there is no need for the user
            to set this property. For more information, please refer to
            :ref:`api_guide_Name`.
    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            paddle.disable_static()
            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
            linear = paddle.nn.Linear(10, 10)
            inp = paddle.to_tensor(inp)
            out = linear(inp)
            loss = paddle.mean(out)
            momentum = paddle.optimizer.Momentum(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
            out.backward()
            momentum.step()
            momentum.clear_grad()

    """
    _velocity_acc_str = "velocity"

    def __init__(self,
                 learning_rate=0.001,
                 momentum=0.9,
                 parameters=None,
                 use_nesterov=False,
                 weight_decay=None,
                 grad_clip=None,
                 name=None):
        if learning_rate is None:
            raise ValueError("learning_rate is not set")
        if momentum is None:
            raise ValueError("momentum is not set")
        super(Momentum, self).__init__(
            learning_rate=learning_rate,
            parameters=parameters,
            weight_decay=weight_decay,
            grad_clip=grad_clip,
            name=name)
        self.type = "momentum"
        self._momentum = momentum
        self._use_nesterov = bool(use_nesterov)

    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

        for p in parameters:
            self._add_accumulator(self._velocity_acc_str, p)

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)

        velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                             param_and_grad[0])
        lr = self._create_param_lr(param_and_grad)

        if framework.in_dygraph_mode():
            _, _ = core.ops.momentum(param_and_grad[0], param_and_grad[1],
                                     velocity_acc, lr, param_and_grad[0],
                                     velocity_acc, 'mu', self._momentum,
                                     'use_nesterov', self._use_nesterov)
            return None

        attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov}
        inputs = {
            "Param": [param_and_grad[0]],
            "Grad": [param_and_grad[1]],
            "Velocity": [velocity_acc],
            "LearningRate": [lr]
        }

        outputs = {
            "ParamOut": [param_and_grad[0]],
            "VelocityOut": [velocity_acc]
        }
        # create the momentum optimize op
        momentum_op = block.append_op(
            type=self.type,
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
            stop_gradient=True)

        return momentum_op
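
The momentum docstring equations above translate directly into NumPy. A minimal sketch, with the hypothetical helper name momentum_update standing in for the C++ momentum op:

import numpy as np

def momentum_update(param, grad, velocity, learning_rate,
                    mu=0.9, use_nesterov=False):
    # velocity = mu * velocity + gradient
    velocity = mu * velocity + grad
    if use_nesterov:
        # param = param - (gradient + mu * velocity) * learning_rate
        param = param - (grad + mu * velocity) * learning_rate
    else:
        # param = param - learning_rate * velocity
        param = param - learning_rate * velocity
    return param, velocity

The velocity array here corresponds to the per-parameter "velocity" accumulator registered in _create_accumulators above.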
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable, name_scope
from ..fluid.dygraph import no_grad
__all__ = ["SGD"]
class SGD(Optimizer):
"""
Optimizer of the stochastic gradient descent algorithm.
.. math::
param\_out = param - learning\_rate * grad
    Parameters:
        learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
            It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
        parameters (list, optional): List of ``Tensor`` to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, in which case all parameters will be updated.
        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
            It can be a float value as the coefficient of L2 regularization or \
            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
            If a parameter has already set a regularizer using :ref:`api_fluid_ParamAttr`, \
            the regularization setting here in the optimizer will be ignored for this parameter. \
            Otherwise, the regularization setting here in the optimizer will take effect. \
            Default None, meaning there is no regularization.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str, optional): The default value is None. Normally there is no need for the user
            to set this property. For more information, please refer to
            :ref:`api_guide_Name`.
    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            paddle.disable_static()
            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
            linear = paddle.nn.Linear(10, 10)
            inp = paddle.to_tensor(inp)
            out = linear(inp)
            loss = paddle.mean(out)
            sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
            out.backward()
            sgd.step()
            sgd.clear_grad()

    """
    def __init__(self,
                 learning_rate=0.001,
                 parameters=None,
                 weight_decay=None,
                 grad_clip=None,
                 name=None):
        if learning_rate is None:
            raise ValueError("learning_rate is not set")
        super(SGD, self).__init__(
            learning_rate=learning_rate,
            parameters=parameters,
            weight_decay=weight_decay,
            grad_clip=grad_clip,
            name=name)
        self.type = "sgd"

    @no_grad()
    def _append_optimize_op(self, block, param_and_grad):
        lr = self._create_param_lr(param_and_grad)
        if framework.in_dygraph_mode():
            core.ops.sgd(param_and_grad[0], lr, param_and_grad[1],
                         param_and_grad[0])
            return None

        assert isinstance(block, framework.Block)
        # create the optimize op
        sgd_op = block.append_op(
            type=self.type,
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "LearningRate": lr
            },
            outputs={"ParamOut": param_and_grad[0]},
            stop_gradient=True)

        return sgd_op
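
SGD's update is the degenerate case of the optimizers above: no accumulator state at all. A one-line NumPy sketch of the documented formula (the helper name sgd_update is illustrative only):

import numpy as np

def sgd_update(param, grad, learning_rate):
    # param_out = param - learning_rate * grad
    return param - learning_rate * grad

w = np.ones(3, dtype="float32")
w = sgd_update(w, grad=w, learning_rate=0.1)  # each entry becomes 0.9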