Unverified Commit f2151330 authored by J Jiawei Wang, committed by GitHub

add lamb optimizer and unittest (#28772)

* add lamb optimizer and unittest

* fix lamb

* fix lamb v2 op

* fix sampling id

* fix lamb sample code

* Update lamb.py

* fix doc

* fix doc

* Update lamb.py
Parent 3815d7aa
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
from paddle.fluid import core
from paddle.fluid.op import Operator
import paddle.fluid as fluid
import paddle
class TestLambOpV2(unittest.TestCase):
    def test_lamb_op(self):
        paddle.enable_static()
        place = fluid.CPUPlace()
        shape = [2, 3, 8, 8]
        exe = fluid.Executor(place)
        train_prog = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(train_prog, startup):
            with fluid.unique_name.guard():
                data = fluid.data(name="data", shape=shape)
                conv = fluid.layers.conv2d(data, 8, 3)
                loss = fluid.layers.reduce_mean(conv)

                beta1 = 0.85
                beta2 = 0.95
                betas = [beta1, beta2]
                opt = paddle.optimizer.Lamb(
                    learning_rate=1e-5, beta1=beta1, beta2=beta2, epsilon=1e-8)
                opt.minimize(loss)

        exe.run(startup)
        data_np = np.random.random(shape).astype('float32')
        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
        assert rets[0] is not None


if __name__ == "__main__":
    unittest.main()
@@ -19,47 +19,12 @@ from op_test import OpTest
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle.fluid.op import Operator
import paddle


class TestSamplingIdOp(OpTest):
    def setUp(self):
        self.op_type = "sampling_id"
        self.use_mkldnn = False
        self.init_kernel_type()
        self.X = np.random.random((100, 10)).astype('float32')
        self.inputs = {"X": self.X}
        self.Y = np.random.random(100).astype('int64')
        self.outputs = {'Out': self.Y}
        self.attrs = {'max': 1.0, 'min': 0.0, 'seed': 1}

    def test_check_output(self):
        self.check_output_customized(self.verify_output)
        y1 = self.out
        self.check_output_customized(self.verify_output)
        y2 = self.out

        # check dtype
        assert y1.dtype == np.int64
        assert y2.dtype == np.int64

        # check output is index ids of inputs
        inputs_ids = np.arange(self.X.shape[1])
        assert np.isin(y1, inputs_ids).all()
        assert np.isin(y2, inputs_ids).all()

        self.assertTrue(np.array_equal(y1, y2))
        self.assertEqual(len(y1), len(self.Y))

    def verify_output(self, outs):
        out = np.array(outs[0])
        self.out = out

    def init_kernel_type(self):
        pass


class TestSamplingIdShape(unittest.TestCase):
    def test_shape(self):
        paddle.enable_static()
        x = fluid.layers.data(name='x', shape=[3], dtype='float32')
        output = fluid.layers.sampling_id(x)
...
@@ -14,7 +14,7 @@
__all__ = [
    'Optimizer', 'Adagrad', 'Adam', 'AdamW', 'Adamax', 'RMSProp', 'Adadelta',
    'SGD', 'Momentum', 'Lamb', 'lr'
]

from .optimizer import Optimizer
@@ -26,4 +26,5 @@ from .rmsprop import RMSProp
from .adadelta import Adadelta
from .sgd import SGD
from .momentum import Momentum
from .lamb import Lamb
from . import lr
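With this export in place, the optimizer is reachable directly from the `paddle.optimizer` namespace. A minimal usage sketch (the layer and hyperparameter values here are illustrative, not part of the diff):

import paddle

# 'Lamb' is now listed in paddle.optimizer.__all__ and imported from .lamb,
# so it can be constructed like the other optimizers in this package.
linear = paddle.nn.Linear(10, 10)
opt = paddle.optimizer.Lamb(learning_rate=0.002,
                            parameters=linear.parameters(),
                            lamb_weight_decay=0.01)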
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable
__all__ = ["Lamb"]
class Lamb(Optimizer):
    """
    LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer.

    LAMB Optimizer is designed to scale up the batch size of training without losing
    accuracy, which supports adaptive element-wise updating and accurate layer-wise
    correction. For more information, please refer to `Large Batch Optimization for
    Deep Learning: Training BERT in 76 minutes <https://arxiv.org/abs/1904.00962>`_ .

    The updating of parameters follows:

    .. math::

        m_t &= \\beta_1 m_{t - 1} + (1 - \\beta_1)g_t

        v_t &= \\beta_2 v_{t - 1} + (1 - \\beta_2)g_t^2

        r_t &= \\frac{m_t}{\\sqrt{v_t}+\\epsilon}

        w_t &= w_{t-1} - \\eta_t \\frac{\\left \| w_{t-1}\\right \|}{\\left \| r_t + \\lambda w_{t-1}\\right \|} (r_t + \\lambda w_{t-1})

    where :math:`m` is the 1st moment, :math:`v` the 2nd moment, :math:`\\eta` the
    learning rate, and :math:`\\lambda` the LAMB weight decay rate.

    Args:
        learning_rate (float|Variable, optional): The learning rate used to update parameters. \
            Can be a float value or a Variable with data type float32. Default 0.001.
        lamb_weight_decay (float, optional): The LAMB weight decay rate. Default 0.01. Note that ``weight_decay`` must be None.
        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
            Default 0.9.
        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
            Default 0.999.
        epsilon (float, optional): A small float value for numerical stability. Default 1e-6.
        parameters (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
            This parameter is required in dygraph mode. \
            The default value is None in static mode, in which case all parameters will be updated.
        grad_clip (GradientClipBase, optional): Gradient clipping strategy, an instance of
            some derived class of ``GradientClipBase`` . There are three clipping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
        name (str|None): For detailed information, please refer to
            :ref:`api_guide_Name` . Usually there is no need to set it; it is None by default.

    Examples:
        .. code-block:: python

            import paddle

            inp = paddle.uniform(min=-0.1, max=0.1, shape=[10, 10], dtype='float32')
            linear = paddle.nn.Linear(10, 10)
            out = linear(inp)
            loss = paddle.mean(out)
            beta1 = paddle.to_tensor([0.9], dtype="float32")
            beta2 = paddle.to_tensor([0.85], dtype="float32")
            lamb = paddle.optimizer.Lamb(learning_rate=0.002, parameters=linear.parameters(), lamb_weight_decay=0.01)
            loss.backward()
            lamb.step()
            lamb.clear_grad()

    """
    _moment1_acc_str = "moment1"
    _moment2_acc_str = "moment2"
    # these two are not used by the op for now
    _beta1_pow_acc_str = "beta1_pow_acc"
    _beta2_pow_acc_str = "beta2_pow_acc"
    def __init__(self,
                 learning_rate=0.001,
                 lamb_weight_decay=0.01,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-6,
                 parameters=None,
                 grad_clip=None,
                 name=None):
        assert learning_rate is not None
        assert beta1 is not None
        assert beta2 is not None
        assert epsilon is not None
        super(Lamb, self).__init__(
            learning_rate=learning_rate,
            parameters=parameters,
            weight_decay=None,
            grad_clip=grad_clip,
            name=name)
        self.type = "lamb"
        self._beta1 = beta1
        self._beta2 = beta2
        self._epsilon = epsilon
        self._lamb_weight_decay = lamb_weight_decay
    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

        # Create accumulator tensors for first and second moments
        for p in parameters:
            self._add_accumulator(self._moment1_acc_str, p)
            self._add_accumulator(self._moment2_acc_str, p)
            self._add_accumulator(
                name=self._beta1_pow_acc_str,
                param=p,
                fill_value=0.9 if isinstance(self._beta1, Variable) \
                    else self._beta1,
                shape=[1],
                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
            self._add_accumulator(
                name=self._beta2_pow_acc_str,
                param=p,
                fill_value=0.999 if isinstance(self._beta2, Variable) \
                    else self._beta2,
                shape=[1],
                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)
        block.program._use_lamb = True

        moment1 = self._get_accumulator(self._moment1_acc_str,
                                        param_and_grad[0])
        moment2 = self._get_accumulator(self._moment2_acc_str,
                                        param_and_grad[0])
        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                              param_and_grad[0])
        beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
                                              param_and_grad[0])

        if param_and_grad[0].need_clip:
            weight_decay = 0.0
        else:
            weight_decay = self._lamb_weight_decay

        # create the lamb optimize op
        lamb_op = block.append_op(
            type=self.type,
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "LearningRate": self._create_param_lr(param_and_grad),
                "Moment1": moment1,
                "Moment2": moment2,
                "Beta1Pow": beta1_pow_acc,
                "Beta2Pow": beta2_pow_acc
            },
            outputs={
                "ParamOut": param_and_grad[0],
                "Moment1Out": moment1,
                "Moment2Out": moment2
            },
            attrs={
                "beta1": self._beta1,
                "beta2": self._beta2,
                "epsilon": self._epsilon,
                "weight_decay": weight_decay
            },
            stop_gradient=True)

        return lamb_op
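For readers who want to follow the update rule documented in the class docstring, here is a small NumPy sketch of one LAMB step. It mirrors the documented math only; the real `lamb` op runs in C++ and also consumes `Beta1Pow`/`Beta2Pow` for bias correction, so treat this as an illustration rather than the kernel.

import numpy as np


def lamb_step(w, g, m, v, lr=0.001, beta1=0.9, beta2=0.999,
              epsilon=1e-6, lamb_weight_decay=0.01):
    # moment estimates m_t and v_t from the docstring
    m = beta1 * m + (1 - beta1) * g
    v = beta2 * v + (1 - beta2) * g * g
    # adaptive element-wise step: r_t = m_t / (sqrt(v_t) + epsilon)
    r = m / (np.sqrt(v) + epsilon)
    update = r + lamb_weight_decay * w
    # layer-wise trust ratio: ||w_{t-1}|| / ||r_t + lambda * w_{t-1}||
    trust_ratio = np.linalg.norm(w) / np.linalg.norm(update)
    w = w - lr * trust_ratio * update
    return w, m, v


w = np.random.rand(10).astype('float32')
g = np.random.rand(10).astype('float32')
m = np.zeros_like(w)
v = np.zeros_like(w)
w, m, v = lamb_step(w, g, m, v)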