# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
import unittest

import numpy as np
from test_imperative_base import new_program_scope

import paddle
from paddle import fluid
from paddle.distributed.fleet.meta_optimizers import DGCMomentumOptimizer
from paddle.fluid import core
from paddle.fluid.optimizer import (
    Adam,
    DecayedAdagradOptimizer,
    DpsgdOptimizer,
    ExponentialMovingAverage,
    FtrlOptimizer,
    LarsMomentumOptimizer,
    LookaheadOptimizer,
    ModelAverage,
    MomentumOptimizer,
    PipelineOptimizer,
    RecomputeOptimizer,
    SGDOptimizer,
)

# Note(wangzhongpu)
# In dygraph, don't support ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer.

class MLP(paddle.nn.Layer):
    """Two stacked ``Linear`` layers (784 -> 10 -> 10) used as the test model.

    ``param_attr`` and ``bias_attr`` are accepted for signature compatibility
    but are not used by the layers built here.
    """

    def __init__(self, param_attr=None, bias_attr=None):
        super().__init__()

        self._fc1 = paddle.nn.Linear(784, 10)
        self._fc2 = paddle.nn.Linear(10, 10)

    def forward(self, inputs):
        """Apply ``_fc1`` followed by ``_fc2`` to ``inputs``."""
        y = self._fc1(inputs)
        y = self._fc2(y)
        return y


class TestImperativeOptimizerBase(unittest.TestCase):
    """Shared harness comparing an optimizer in dygraph and static graph.

    Subclasses supply the optimizer through ``get_optimizer_dygraph`` and
    ``get_optimizer`` and call ``_check_mlp`` (numerical comparison) or
    ``_check_exception`` (construction is expected to fail in dygraph).
    """

    def setUp(self):
        # Number of mini-batches each mode trains for in ``_check_mlp``.
        self.batch_num = 20

    def get_optimizer_dygraph(self, parameter_list):
        """Return the optimizer for dygraph mode; must be overridden."""
        raise NotImplementedError()

    def get_optimizer(self):
        """Return the optimizer for static-graph mode; must be overridden."""
        raise NotImplementedError()

    def reader_decorator(self, reader):
        """Wrap ``reader`` so each sample is yielded as ``(image, label)``.

        The image is reshaped to ``(1, 784)`` and the label is cast to
        ``int64`` and reshaped to ``(1,)``.
        """

        def _reader_imple():
            for item in reader():
                image = np.array(item[0]).reshape(1, 784)
                label = np.array(item[1]).astype('int64').reshape(1)
                yield image, label

        return _reader_imple

    def _check_exception(self, exception_message, place=None):
        """Assert that building the dygraph optimizer raises the given error.

        Args:
            exception_message: Exact text expected from ``str(exception)``.
            place: Device to run on. Defaults to ``CUDAPlace(0)`` when paddle
                is compiled with CUDA, otherwise ``CPUPlace()``.
        """
        seed = 90
        if place is None:
            place = (
                fluid.CUDAPlace(0)
                if core.is_compiled_with_cuda()
                else fluid.CPUPlace()
            )

        with fluid.dygraph.guard(place):
            # assertRaises fails the test when nothing is raised, so an
            # optimizer that stops rejecting dygraph mode cannot pass
            # silently.
            with self.assertRaises(Exception) as ctx:
                paddle.seed(seed)
                paddle.framework.random._manual_program_seed(seed)
                mlp = MLP()
                self.get_optimizer_dygraph(parameter_list=mlp.parameters())
            self.assertEqual(str(ctx.exception), exception_message)

    def _check_mlp(self, place=None):
        """Train ``MLP`` in dygraph and static graph and compare the results.

        Both runs use the same seed and train for up to ``self.batch_num``
        MNIST batches. The initial parameters, the last loss and the last
        parameter values of the two runs are then compared with
        ``np.testing.assert_allclose``.

        Args:
            place: Device to run on. Defaults to ``CUDAPlace(0)`` when paddle
                is compiled with CUDA, otherwise ``CPUPlace()``.
        """
        seed = 90
        batch_size = 128

        if place is None:
            place = (
                fluid.CPUPlace()
                if not core.is_compiled_with_cuda()
                else fluid.CUDAPlace(0)
            )

        # ---- dygraph run ----
        with fluid.dygraph.guard(place):
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)

            mlp = MLP()
            optimizer = self.get_optimizer_dygraph(
                parameter_list=mlp.parameters()
            )

            batch_py_reader = fluid.io.PyReader(capacity=1)
            batch_py_reader.decorate_sample_list_generator(
                paddle.batch(
                    self.reader_decorator(paddle.dataset.mnist.train()),
                    batch_size=batch_size,
                    drop_last=True,
                ),
                places=fluid.CPUPlace(),
            )

            dy_param_init_value = {}
            for batch_id, data in enumerate(batch_py_reader()):
                if batch_id >= self.batch_num:
                    break

                img = data[0]
                label = data[1]
                label.stop_gradient = True

                img = paddle.reshape(img, shape=[batch_size, -1])
                cost = mlp(img)
                avg_loss = paddle.mean(cost)
                dy_out = avg_loss.numpy()

                # Snapshot the parameters before the first update.
                if batch_id == 0:
                    for param in mlp.parameters():
                        dy_param_init_value[param.name] = param.numpy()

                avg_loss.backward()
                optimizer.minimize(avg_loss)
                mlp.clear_gradients()
                dy_param_value = {
                    param.name: param.numpy() for param in mlp.parameters()
                }

        # ---- static-graph run ----
        with new_program_scope():
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)

            exe = fluid.Executor(place)

            mlp = MLP()
            optimizer = self.get_optimizer()
            train_reader = paddle.batch(
                paddle.dataset.mnist.train(),
                batch_size=batch_size,
                drop_last=True,
            )

            img = paddle.static.data(
                name='pixel', shape=[-1, 1, 28, 28], dtype='float32'
            )
            label = paddle.static.data(
                name='label', shape=[-1, 1], dtype='int64'
            )
            img = paddle.reshape(img, shape=[batch_size, 784])
            cost = mlp(img)
            avg_loss = paddle.mean(cost)
            optimizer.minimize(avg_loss)

            # initialize params and fetch them
            static_param_name_list = [
                param.name for param in mlp.parameters()
            ]
            out = exe.run(
                fluid.default_startup_program(),
                fetch_list=static_param_name_list,
            )
            static_param_init_value = dict(zip(static_param_name_list, out))

            for batch_id, data in enumerate(train_reader()):
                if batch_id >= self.batch_num:
                    break

                static_x_data = np.array(
                    [x[0].reshape(1, 28, 28) for x in data]
                ).astype('float32')
                y_data = (
                    np.array([x[1] for x in data])
                    .astype('int64')
                    .reshape([batch_size, 1])
                )

                fetch_list = [avg_loss.name]
                fetch_list.extend(static_param_name_list)
                out = exe.run(
                    fluid.default_main_program(),
                    feed={"pixel": static_x_data, "label": y_data},
                    fetch_list=fetch_list,
                )

                # out[0] is the loss; the rest follow the order of
                # static_param_name_list.
                static_out = out[0]
                static_param_value = dict(
                    zip(static_param_name_list, out[1:])
                )

        # ---- comparison ----
        for key, value in static_param_init_value.items():
            np.testing.assert_allclose(
                value, dy_param_init_value[key], rtol=1e-05
            )

        # ROCm builds additionally get an absolute tolerance.
        tolerance = {'rtol': 1e-05}
        if core.is_compiled_with_rocm():
            tolerance['atol'] = 0.001

        np.testing.assert_allclose(static_out, dy_out, **tolerance)

        for key, value in static_param_value.items():
            np.testing.assert_allclose(
                value, dy_param_value[key], **tolerance
            )


class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
    """SGD with a piecewise-decay learning rate."""

    def get_optimizer_dygraph(self, parameter_list):
        bd = [3, 6, 9]
        optimizer = SGDOptimizer(
            learning_rate=fluid.layers.piecewise_decay(
                boundaries=bd,
                values=[0.1 * (0.1**i) for i in range(len(bd) + 1)],
            ),
            parameter_list=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        bd = [3, 6, 9]
        optimizer = SGDOptimizer(
            learning_rate=paddle.optimizer.lr.PiecewiseDecay(
                boundaries=bd,
                values=[0.1 * (0.1**i) for i in range(len(bd) + 1)],
            )
        )
        return optimizer

    def test_sgd(self):
        self._check_mlp()


class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
    """SGD with ``fluid.layers.natural_exp_decay`` as learning rate."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = SGDOptimizer(
            learning_rate=fluid.layers.natural_exp_decay(
                learning_rate=0.1,
                decay_steps=10000,
                decay_rate=0.5,
                staircase=True,
            ),
            parameter_list=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = SGDOptimizer(
            learning_rate=fluid.layers.natural_exp_decay(
                learning_rate=0.1,
                decay_steps=10000,
                decay_rate=0.5,
                staircase=True,
            )
        )
        return optimizer

    def test_sgd(self):
        self._check_mlp()


class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
    """SGD with ``fluid.layers.exponential_decay`` as learning rate."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = SGDOptimizer(
            learning_rate=fluid.layers.exponential_decay(
                learning_rate=0.1,
                decay_steps=10000,
                decay_rate=0.5,
                staircase=True,
            ),
            parameter_list=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = SGDOptimizer(
            learning_rate=fluid.layers.exponential_decay(
                learning_rate=0.1,
                decay_steps=10000,
                decay_rate=0.5,
                staircase=True,
            )
        )
        return optimizer

    def test_sgd(self):
        self._check_mlp()


class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
    """Adam with ``fluid.layers.inverse_time_decay`` as learning rate."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = Adam(
            learning_rate=fluid.layers.inverse_time_decay(
                learning_rate=0.1,
                decay_steps=10000,
                decay_rate=0.5,
                staircase=True,
            ),
            parameter_list=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = Adam(
            learning_rate=fluid.layers.inverse_time_decay(
                learning_rate=0.1,
                decay_steps=10000,
                decay_rate=0.5,
                staircase=True,
            )
        )
        return optimizer

    def test_adam(self):
        self._check_mlp()


class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
    """SGD with ``PolynomialDecay``; ``self.cycle`` is set by each test."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.PolynomialDecay(
                learning_rate=0.1, decay_steps=5, cycle=self.cycle
            ),
            parameters=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.PolynomialDecay(
                learning_rate=0.1, decay_steps=5, cycle=self.cycle
            )
        )
        return optimizer

    def test_sgd_cycle(self):
        self.cycle = True
        self._check_mlp()

    def test_sgd(self):
        self.cycle = False
        self._check_mlp()


class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
    """SGD with ``fluid.layers.cosine_decay`` as learning rate."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = SGDOptimizer(
            learning_rate=fluid.layers.cosine_decay(
                learning_rate=0.1, step_each_epoch=10000, epochs=120
            ),
            parameter_list=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = SGDOptimizer(
            learning_rate=fluid.layers.cosine_decay(
                learning_rate=0.1, step_each_epoch=10000, epochs=120
            )
        )
        return optimizer

    def test_sgd(self):
        self._check_mlp()


class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
    """SGD with ``paddle.optimizer.lr.NoamDecay`` as learning rate."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.NoamDecay(
                d_model=512, warmup_steps=8000
            ),
            parameters=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.SGD(
            learning_rate=paddle.optimizer.lr.NoamDecay(
                d_model=512, warmup_steps=8000
            )
        )
        return optimizer

    def test_sgd(self):
        self._check_mlp()


class TestOptimizerLearningRate(unittest.TestCase):
    """Checks the learning rate reported by Adam in dygraph mode."""

    def test_constant_lr(self):
        with fluid.dygraph.guard():
            a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")

            linear = paddle.nn.Linear(10, 10)

            a = fluid.dygraph.to_variable(a)

            b = linear(a)

            loss = paddle.mean(b)

            adam = fluid.optimizer.Adam(
                0.001, parameter_list=linear.parameters()
            )

            np.testing.assert_allclose(
                adam.current_step_lr(), 0.001, rtol=1e-06, atol=0.0
            )

            for _ in range(10):
                adam.minimize(loss)
                lr = adam.current_step_lr()

                np.testing.assert_allclose(lr, 0.001, rtol=1e-06, atol=0.0)

    def test_lr_decay(self):
        with fluid.dygraph.guard():
            a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")

            linear = paddle.nn.Linear(10, 10)

            a = fluid.dygraph.to_variable(a)

            b = linear(a)

            loss = paddle.mean(b)

            bd = [2, 4, 6, 8]
            value = [0.2, 0.4, 0.6, 0.8, 1.0]

            scheduler = paddle.optimizer.lr.PiecewiseDecay(bd, value)
            adam = paddle.optimizer.Adam(
                scheduler,
                parameters=linear.parameters(),
            )

            np.testing.assert_allclose(adam.get_lr(), 0.2, rtol=1e-06, atol=0.0)

            ret = [0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0]
            for expected_lr in ret:
                adam.minimize(loss)
                # Read the rate before the scheduler advances.
                lr = adam.get_lr()
                adam.step()
                scheduler.step()
                np.testing.assert_allclose(
                    lr, expected_lr, rtol=1e-06, atol=0.0
                )

    def test_lr_decay_natural_exp(self):
        with fluid.dygraph.guard():
            a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")

            linear = paddle.nn.Linear(10, 10)

            a = fluid.dygraph.to_variable(a)

            b = linear(a)

            loss = paddle.mean(b)
            base_lr = 1.0

            scheduler = paddle.optimizer.lr.NaturalExpDecay(
                learning_rate=base_lr,
                gamma=0.5,
            )
            adam = paddle.optimizer.Adam(
                learning_rate=scheduler,
                parameters=linear.parameters(),
            )

            np.testing.assert_allclose(adam.get_lr(), 1.0, rtol=1e-06, atol=0.0)

            ret = [1.0, 1.0, 1.0, np.exp(-0.5), np.exp(-0.5)]
            for counter, expected_lr in enumerate(ret, start=1):
                adam.minimize(loss)
                lr = adam.get_lr()
                # The scheduler only advances on every third iteration, which
                # is why the first three expected values are identical.
                if counter % 3 == 0:
                    adam.step()
                    scheduler.step()
                np.testing.assert_allclose(
                    lr, expected_lr, rtol=1e-06, atol=0.0
                )

    def test_set_lr(self):
        with fluid.dygraph.guard():
            a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")

            linear = paddle.nn.Linear(10, 10)

            a = fluid.dygraph.to_variable(a)

            b = linear(a)

            loss = paddle.mean(b)

            adam = fluid.optimizer.Adam(0.1, parameter_list=linear.parameters())

            lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
            for target_lr in lr_list:
                adam.set_lr(target_lr)
                adam.minimize(loss)
                lr = adam.current_step_lr()
                np.testing.assert_allclose(
                    lr, target_lr, rtol=1e-06, atol=0.0
                )

            # set_lr is also exercised with a variable instead of a float.
            lr_var = paddle.static.create_global_var(
                shape=[1], value=0.7, dtype='float32'
            )
            adam.set_lr(lr_var)
            adam.minimize(loss)
            lr = adam.current_step_lr()
            np.testing.assert_allclose(lr, 0.7, rtol=1e-06, atol=0.0)

            # set_lr on an optimizer built with an LR scheduler must fail.
            with self.assertRaises(RuntimeError):
                adam = paddle.optimizer.Adam(
                    paddle.optimizer.lr.NaturalExpDecay(
                        learning_rate=0.1,
                        gamma=0.5,
                    ),
                    parameters=linear.parameters(),
                )
                adam.set_lr(0.01)


class TestImperativeMomentumOptimizer(TestImperativeOptimizerBase):
    """Dygraph/static comparison for ``MomentumOptimizer``."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = MomentumOptimizer(
            learning_rate=0.001, momentum=0.9, parameter_list=parameter_list
        )
        return optimizer

    def get_optimizer(self):
        optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
        return optimizer

    def test_momentum(self):
        self._check_mlp()


class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase):
    """Dygraph/static comparison for ``LarsMomentumOptimizer``."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = LarsMomentumOptimizer(
            learning_rate=0.001, momentum=0.9, parameter_list=parameter_list
        )
        return optimizer

    def get_optimizer(self):
        optimizer = LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9)
        return optimizer

    def test_larsmomentum(self):
        self._check_mlp()


class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase):
    """Dygraph/static comparison for ``paddle.optimizer.Adagrad``."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.Adagrad(
            learning_rate=0.2, parameters=parameter_list
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.Adagrad(learning_rate=0.2)
        return optimizer

    def test_adagrad(self):
        self._check_mlp()


class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase):
    """Dygraph/static comparison for ``paddle.optimizer.Adamax``."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.Adamax(
            learning_rate=0.2, parameters=parameter_list
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.Adamax(learning_rate=0.2)
        return optimizer

    def test_adamax(self):
        self._check_mlp()


class TestImperativeDpsgdOptimizer(TestImperativeOptimizerBase):
    """Dygraph/static comparison for ``DpsgdOptimizer`` (run on CPU)."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = DpsgdOptimizer(
            learning_rate=0.01,
            clip=10.0,
            batch_size=16.0,
            sigma=1.0,
            parameter_list=parameter_list,
        )
        # Same fixed seed in both modes.
        optimizer._seed = 100
        return optimizer

    def get_optimizer(self):
        optimizer = DpsgdOptimizer(
            learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0
        )
        optimizer._seed = 100
        return optimizer

    def test_dpsgd(self):
        self._check_mlp(place=fluid.CPUPlace())


class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase):
    """Dygraph/static comparison for ``DecayedAdagradOptimizer``."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = DecayedAdagradOptimizer(
            learning_rate=0.2, parameter_list=parameter_list
        )
        return optimizer

    def get_optimizer(self):
        optimizer = DecayedAdagradOptimizer(learning_rate=0.2)
        return optimizer

    def test_decayadagrad(self):
        self._check_mlp()


class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase):
    """Dygraph/static comparison for ``paddle.optimizer.Adadelta``."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.Adadelta(
            learning_rate=0.0003,
            epsilon=1.0e-6,
            rho=0.95,
            parameters=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.Adadelta(
            learning_rate=0.0003, epsilon=1.0e-6, rho=0.95
        )
        return optimizer

    def test_adadelta(self):
        self._check_mlp()


class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase):
    """Dygraph/static comparison for ``paddle.optimizer.RMSProp``."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.RMSProp(
            learning_rate=0.1, parameters=parameter_list
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.RMSProp(learning_rate=0.1)
        return optimizer

    def test_rmsprop(self):
        self._check_mlp()


class TestImperativeFtrlOptimizer(TestImperativeOptimizerBase):
    """Dygraph/static comparison for ``FtrlOptimizer``."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = FtrlOptimizer(
            learning_rate=0.1, parameter_list=parameter_list
        )
        return optimizer

    def get_optimizer(self):
        optimizer = FtrlOptimizer(learning_rate=0.1)
        return optimizer

    def test_ftrl(self):
        self._check_mlp()


def exclude_fn(param):
    """Return True for parameters whose name ends with ``'.b_0'``."""
    return param.name.endswith('.b_0')


class TestImperativeLambOptimizer(TestImperativeOptimizerBase):
    """Dygraph/static comparison for ``paddle.optimizer.Lamb``."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = paddle.optimizer.Lamb(
            learning_rate=0.002,
            exclude_from_weight_decay_fn=exclude_fn,
            parameters=parameter_list,
        )
        return optimizer

    def get_optimizer(self):
        optimizer = paddle.optimizer.Lamb(
            learning_rate=0.002, exclude_from_weight_decay_fn=exclude_fn
        )
        return optimizer

    # should fix: may fail in CI-windows
    # The leading underscore keeps unittest from collecting this test.
    def _test_lamb(self):
        self._check_mlp()


class TestImperativeModelAverage(TestImperativeOptimizerBase):
    """``ModelAverage`` is expected to be rejected in dygraph mode."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = ModelAverage(
            0.15, min_average_window=10000, max_average_window=12500
        )
        return optimizer

    def test_modelaverage(self):
        exception_message = "In dygraph, don't support ModelAverage."
        self._check_exception(exception_message)


class TestImperativeDGCMomentumOptimizer(TestImperativeOptimizerBase):
    """``DGCMomentumOptimizer`` is expected to be rejected in dygraph mode."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = DGCMomentumOptimizer(
            learning_rate=0.0001,
            momentum=0.9,
            rampup_step=1000,
            rampup_begin_step=1252,
            sparsity=[0.999, 0.999],
        )
        return optimizer

    def test_dgcmomentum(self):
        exception_message = "In dygraph, don't support DGCMomentumOptimizer."
        self._check_exception(exception_message)


class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase):
    """``ExponentialMovingAverage`` is expected to be rejected in dygraph."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = ExponentialMovingAverage(0.999)
        return optimizer

    def test_exponentialmoving(self):
        exception_message = (
            "In dygraph, don't support ExponentialMovingAverage."
        )
        self._check_exception(exception_message)


class TestImperativePipelineOptimizer(TestImperativeOptimizerBase):
    """``PipelineOptimizer`` is expected to be rejected in dygraph mode."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = fluid.optimizer.SGD(
            learning_rate=0.5, parameter_list=parameter_list
        )
        optimizer = PipelineOptimizer(optimizer)
        return optimizer

    def test_pipline(self):
        exception_message = "In dygraph, don't support PipelineOptimizer."
        self._check_exception(exception_message)


class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase):
    """``LookaheadOptimizer`` is expected to be rejected in dygraph mode."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = fluid.optimizer.SGD(
            learning_rate=0.5, parameter_list=parameter_list
        )
        optimizer = LookaheadOptimizer(optimizer, alpha=0.5, k=5)
        return optimizer

    def test_lookahead(self):
        exception_message = "In dygraph, don't support LookaheadOptimizer."
        self._check_exception(exception_message)


class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase):
    """``RecomputeOptimizer`` is expected to be rejected in dygraph mode."""

    def get_optimizer_dygraph(self, parameter_list):
        optimizer = fluid.optimizer.SGD(
            learning_rate=0.5, parameter_list=parameter_list
        )
        optimizer = RecomputeOptimizer(optimizer)
        return optimizer

    def test_recompute(self):
        exception_message = "In dygraph, don't support RecomputeOptimizer."
        self._check_exception(exception_message)


class TestImperativeOptimizerList(unittest.TestCase):
    """Checks that an optimizer accepts an iterator as ``parameter_list``."""

    def test_parameter_list(self):
        with fluid.dygraph.guard():
            linear_1 = paddle.nn.Linear(10, 10)
            linear_2 = paddle.nn.Linear(10, 10)

            # parameter_list is a one-shot itertools.chain, not a list.
            sgd = SGDOptimizer(
                1.0,
                parameter_list=itertools.chain(
                    linear_1.parameters(), linear_2.parameters()
                ),
            )

            in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
            in_data = fluid.dygraph.to_variable(in_np)

            y = linear_1(in_data)
            y = linear_2(y)
            loss = paddle.mean(y)
            loss.backward()
            sgd.minimize(loss)

            # assertEqual reports both lengths on failure.
            self.assertEqual(
                len(sgd._parameter_list),
                len(linear_1.parameters() + linear_2.parameters()),
            )


if __name__ == '__main__':
    unittest.main()