diff --git a/python/paddle/fluid/tests/unittests/test_lambv2_op.py b/python/paddle/fluid/tests/unittests/test_lambv2_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbd723db2fa0c68b8b756788fdf98cdccd6d49c9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_lambv2_op.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+from paddle.fluid import core
+from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+import paddle
+
+
+class TestLambOpV2(unittest.TestCase):
+    def test_lamb_op(self):
+        paddle.enable_static()
+        place = fluid.CPUPlace()
+        shape = [2, 3, 8, 8]
+        exe = fluid.Executor(place)
+        train_prog = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(train_prog, startup):
+            with fluid.unique_name.guard():
+                data = fluid.data(name="data", shape=shape)
+                conv = fluid.layers.conv2d(data, 8, 3)
+                loss = fluid.layers.reduce_mean(conv)
+                beta1 = 0.85
+                beta2 = 0.95
+                betas = [beta1, beta2]
+                opt = paddle.optimizer.Lamb(
+                    learning_rate=1e-5, beta1=beta1, beta2=beta2, epsilon=1e-8)
+                opt.minimize(loss)
+
+        exe.run(startup)
+        data_np = np.random.random(shape).astype('float32')
+        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
+        assert rets[0] is not None
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
index 0c784d3e49d85f0b5750c5e6d7307be754b43ab2..521cd3ae238c6f4b28313c4864bba354b5bc091a 100644
--- a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
@@ -19,47 +19,12 @@ from op_test import OpTest
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid.op import Operator
-
-
-class TestSamplingIdOp(OpTest):
-    def setUp(self):
-        self.op_type = "sampling_id"
-        self.use_mkldnn = False
-        self.init_kernel_type()
-        self.X = np.random.random((100, 10)).astype('float32')
-        self.inputs = {"X": self.X}
-        self.Y = np.random.random(100).astype('int64')
-        self.outputs = {'Out': self.Y}
-        self.attrs = {'max': 1.0, 'min': 0.0, 'seed': 1}
-
-    def test_check_output(self):
-        self.check_output_customized(self.verify_output)
-        y1 = self.out
-        self.check_output_customized(self.verify_output)
-        y2 = self.out
-
-        # check dtype
-        assert y1.dtype == np.int64
-        assert y2.dtype == np.int64
-
-        # check output is index ids of inputs
-        inputs_ids = np.arange(self.X.shape[1])
-        assert np.isin(y1, inputs_ids).all()
-        assert np.isin(y2, inputs_ids).all()
-
-        self.assertTrue(np.array_equal(y1, y2))
-        self.assertEqual(len(y1), len(self.Y))
-
-    def verify_output(self, outs):
-        out = np.array(outs[0])
-        self.out = out
-
-    def init_kernel_type(self):
-        pass
+import paddle
 
 
 class TestSamplingIdShape(unittest.TestCase):
     def test_shape(self):
+        paddle.enable_static()
         x = fluid.layers.data(name='x', shape=[3], dtype='float32')
         output = fluid.layers.sampling_id(x)
diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py
index 756bf35486bf8987dc1f52c132b9a3d1b8c4023f..edebfdfcf3710049f851945632c1c09d443a7709 100644
--- a/python/paddle/optimizer/__init__.py
+++ b/python/paddle/optimizer/__init__.py
@@ -14,7 +14,7 @@
 
 __all__ = [
     'Optimizer', 'Adagrad', 'Adam', 'AdamW', 'Adamax', 'RMSProp', 'Adadelta',
-    'SGD', 'Momentum', 'lr'
+    'SGD', 'Momentum', 'Lamb', 'lr'
 ]
 
 from .optimizer import Optimizer
@@ -26,4 +26,5 @@ from .rmsprop import RMSProp
 from .adadelta import Adadelta
 from .sgd import SGD
 from .momentum import Momentum
+from .lamb import Lamb
 from . import lr
diff --git a/python/paddle/optimizer/lamb.py b/python/paddle/optimizer/lamb.py
new file mode 100644
index 0000000000000000000000000000000000000000..de62257588eaa798b7747641d2be5ac5bd73e543
--- /dev/null
+++ b/python/paddle/optimizer/lamb.py
@@ -0,0 +1,177 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .optimizer import Optimizer
+from ..fluid import core
+from ..fluid import framework
+from ..fluid.framework import Variable
+
+__all__ = ["Lamb"]
+
+
+class Lamb(Optimizer):
+    """
+    LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer.
+
+    The LAMB optimizer is designed to scale up the batch size of training without losing
+    accuracy; it supports adaptive element-wise updating and accurate layer-wise
+    correction. For more information, please refer to `Large Batch Optimization for
+    Deep Learning: Training BERT in 76 minutes <https://arxiv.org/abs/1904.00962>`_ .
+
+    The updating of parameters follows:
+
+    .. math::
+
+        m_t &= \\beta_1 m_{t - 1} + (1 - \\beta_1) g_t
+
+        v_t &= \\beta_2 v_{t - 1} + (1 - \\beta_2) g_t^2
+
+        r_t &= \\frac{m_t}{\\sqrt{v_t} + \\epsilon}
+
+        w_t &= w_{t-1} - \\eta_t \\frac{\\left \| w_{t-1} \\right \|}{\\left \| r_t + \\lambda w_{t-1} \\right \|} (r_t + \\lambda w_{t-1})
+
+    where :math:`m` is the 1st moment, :math:`v` the 2nd moment, :math:`\\eta` the
+    learning rate, and :math:`\\lambda` the LAMB weight decay rate.
+
+    Args:
+        learning_rate (float|Variable, optional): the learning rate used to update parameters. \
+            Can be a float value or a Variable with data type float32. Default 0.001.
+        lamb_weight_decay (float, optional): The LAMB weight decay rate. Default 0.01. Note that
+            the regular ``weight_decay`` of the base ``Optimizer`` is not used here and is kept
+            as None; LAMB applies its own decay through this argument.
+        beta1 (float, optional): The exponential decay rate for the 1st moment estimates.
+            Default 0.9.
+        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
+            Default 0.999.
+        epsilon (float, optional): A small float value for numerical stability. Default 1e-6.
+        parameters (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. \
+            The default value is None in static mode, in which case all parameters will be updated.
+        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three clipping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+        name (str|None, optional): For detailed information, please refer to
+            :ref:`api_guide_Name` . Normally there is no need to set it. Default None.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            inp = paddle.uniform(min=-0.1, max=0.1, shape=[10, 10], dtype='float32')
+            linear = paddle.nn.Linear(10, 10)
+            out = linear(inp)
+            loss = paddle.mean(out)
+            beta1 = paddle.to_tensor([0.9], dtype="float32")
+            beta2 = paddle.to_tensor([0.85], dtype="float32")
+            lamb = paddle.optimizer.Lamb(learning_rate=0.002, parameters=linear.parameters(), lamb_weight_decay=0.01)
+            loss.backward()
+            lamb.step()
+            lamb.clear_grad()
+    """
+    _moment1_acc_str = "moment1"
+    _moment2_acc_str = "moment2"
+    # the following two accumulators are not used by the lamb op for now
+    _beta1_pow_acc_str = "beta1_pow_acc"
+    _beta2_pow_acc_str = "beta2_pow_acc"
+
+    def __init__(self,
+                 learning_rate=0.001,
+                 lamb_weight_decay=0.01,
+                 beta1=0.9,
+                 beta2=0.999,
+                 epsilon=1e-6,
+                 parameters=None,
+                 grad_clip=None,
+                 name=None):
+        assert learning_rate is not None
+        assert beta1 is not None
+        assert beta2 is not None
+        assert epsilon is not None
+        super(Lamb, self).__init__(
+            learning_rate=learning_rate,
+            parameters=parameters,
+            weight_decay=None,
+            grad_clip=grad_clip,
+            name=name)
+        self.type = "lamb"
+        self._beta1 = beta1
+        self._beta2 = beta2
+        self._epsilon = epsilon
+        self._lamb_weight_decay = lamb_weight_decay
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        # Create accumulator tensors for first and second moments
+        for p in parameters:
+            self._add_accumulator(self._moment1_acc_str, p)
+            self._add_accumulator(self._moment2_acc_str, p)
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                fill_value=0.9 if isinstance(self._beta1, Variable) \
+                    else self._beta1,
+                shape=[1],
+                type=core.VarDesc.VarType.LOD_TENSOR,
+                device='cpu')
+            self._add_accumulator(
+                name=self._beta2_pow_acc_str,
+                param=p,
+                fill_value=0.999 if isinstance(self._beta2, Variable) \
+                    else self._beta2,
+                shape=[1],
+                type=core.VarDesc.VarType.LOD_TENSOR,
+                device='cpu')
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+        block.program._use_lamb = True
+
+        moment1 = self._get_accumulator(self._moment1_acc_str,
+                                        param_and_grad[0])
+        moment2 = self._get_accumulator(self._moment2_acc_str,
+                                        param_and_grad[0])
+        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                              param_and_grad[0])
+        beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                              param_and_grad[0])
+
+        # parameters flagged with need_clip skip the LAMB weight decay
+        if param_and_grad[0].need_clip:
+            weight_decay = 0.0
+        else:
+            weight_decay = self._lamb_weight_decay
+
+        # create the lamb optimize op
+        lamb_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "LearningRate": self._create_param_lr(param_and_grad),
+                "Moment1": moment1,
+                "Moment2": moment2,
+                "Beta1Pow": beta1_pow_acc,
+                "Beta2Pow": beta2_pow_acc
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "Moment1Out": moment1,
+                "Moment2Out": moment2
+            },
+            attrs={
+                "beta1": self._beta1,
+                "beta2": self._beta2,
+                "epsilon": self._epsilon,
+                "weight_decay": weight_decay
+            },
+            stop_gradient=True)
+
+        return lamb_op
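
Reader's note: the following is a minimal NumPy sketch of the update rule given in the Lamb docstring above (moments m and v, the Adam-like direction r, and the layer-wise trust ratio ||w|| / ||r + lambda*w|| that scales the step). It follows the simplified docstring formulas, so it omits the bias correction that the beta1_pow/beta2_pow accumulators would feed into the C++ lamb op, and the helper name lamb_update is invented for this illustration; it is not part of this PR or of Paddle's API.

# One LAMB step for a single parameter tensor, mirroring the docstring math.
import numpy as np


def lamb_update(w, g, m, v, lr=0.001, beta1=0.9, beta2=0.999,
                epsilon=1e-6, lamb_weight_decay=0.01):
    # first and second moment estimates (no bias correction, as in the docstring)
    m = beta1 * m + (1.0 - beta1) * g
    v = beta2 * v + (1.0 - beta2) * g * g
    # element-wise Adam-like direction plus LAMB weight decay
    r = m / (np.sqrt(v) + epsilon)
    update = r + lamb_weight_decay * w
    # layer-wise trust ratio ||w|| / ||r + lambda * w||
    w_norm = np.linalg.norm(w)
    u_norm = np.linalg.norm(update)
    trust_ratio = w_norm / u_norm if w_norm > 0 and u_norm > 0 else 1.0
    w = w - lr * trust_ratio * update
    return w, m, v


# toy usage: one step on a random 10x10 "layer"
w = np.random.uniform(-0.1, 0.1, (10, 10)).astype('float32')
g = np.random.randn(10, 10).astype('float32')
m = np.zeros_like(w)
v = np.zeros_like(w)
w, m, v = lamb_update(w, g, m, v)

The real optimizer defers this arithmetic to the lamb op appended by _append_optimize_op; the sketch is only meant to make the trust-ratio scaling in the docstring formulas concrete.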