From 0ec53f987c4ec24876d47fef747e13b8918496df Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 29 Jan 2019 16:53:10 +0800 Subject: [PATCH] Support imperative learning rate decay in optimizer --- .../fluid/layers/learning_rate_scheduler.py | 51 +++-- python/paddle/fluid/optimizer.py | 43 +++- .../tests/unittests/test_imperative_mnist.py | 207 ++++++++++++++++++ .../unittests/test_imperative_optimizer.py | 105 +++------ 4 files changed, 291 insertions(+), 115 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_mnist.py diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 617704a5313..2f489e43db1 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -28,6 +28,7 @@ from . import ops from . import tensor from ..initializer import init_on_cpu from ..framework import default_main_program, Parameter, unique_name, name_scope +from ..imperative import base as imperative_base __all__ = [ 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', @@ -277,34 +278,38 @@ def piecewise_decay(boundaries, values): if len(values) - len(boundaries) != 1: raise ValueError("len(values) - len(boundaries) should be 1") - global_step = _decay_step_counter() + if imperative_base.enabled(): + decay = imperative.PiecewiseDecay(boundaries, values, 0) + return decay + else: + global_step = _decay_step_counter() - lr = tensor.create_global_var( - shape=[1], - value=0.0, - dtype='float32', - persistable=True, - name="learning_rate") + lr = tensor.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=True, + name="learning_rate") - with control_flow.Switch() as switch: - for i in range(len(boundaries)): - boundary_val = tensor.fill_constant( + with control_flow.Switch() as switch: + for i in range(len(boundaries)): + boundary_val = tensor.fill_constant( + shape=[1], + dtype='float32', + value=float(boundaries[i]), + force_cpu=True) + value_var = tensor.fill_constant( + shape=[1], dtype='float32', value=float(values[i])) + with switch.case(global_step < boundary_val): + tensor.assign(value_var, lr) + last_value_var = tensor.fill_constant( shape=[1], dtype='float32', - value=float(boundaries[i]), - force_cpu=True) - value_var = tensor.fill_constant( - shape=[1], dtype='float32', value=float(values[i])) - with switch.case(global_step < boundary_val): - tensor.assign(value_var, lr) - last_value_var = tensor.fill_constant( - shape=[1], - dtype='float32', - value=float(values[len(values) - 1])) - with switch.default(): - tensor.assign(last_value_var, lr) + value=float(values[len(values) - 1])) + with switch.default(): + tensor.assign(last_value_var, lr) - return lr + return lr def append_LARS(params_grads, learning_rate, weight_decay): diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 14f4276e2f4..63feca22759 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -72,24 +72,43 @@ class Optimizer(object): self.helper = None def _create_global_learning_rate(self): - lr = self._global_learning_rate() - - if isinstance(lr, framework.Variable): - return + if imperative_base.enabled(): + # create learning rate Variable + if isinstance(self._learning_rate, float): + self._learning_rate_map[framework.default_main_program( + )] = layers.create_global_var( + name=unique_name.generate("learning_rate"), + shape=[1], + value=float(self._learning_rate), + dtype='float32' if self._dtype is None else self._dtype, + persistable=True) + # get learning rate Variable from LearningRateDecay + elif isinstance(self._learning_rate, imperative.LearningRateDecay): + self._learning_rate_map[framework.default_main_program( + )] = self._learning_rate() + else: + raise TypeError( + "optimizer's learning rate must be float or LearningRateDecay" + ) else: + lr = self._global_learning_rate() + + if isinstance(lr, framework.Variable): + return + if not isinstance(self._learning_rate, float): raise TypeError( "learning rate variable is create outside optimizer," "can not create new learning rate variable for new program") - # create learning rate in the current main program - self._learning_rate_map[framework.default_main_program( - )] = layers.create_global_var( - name=unique_name.generate("learning_rate"), - shape=[1], - value=float(self._learning_rate), - dtype='float32' if self._dtype is None else self._dtype, - persistable=True) + # create learning rate in the current main program + self._learning_rate_map[framework.default_main_program( + )] = layers.create_global_var( + name=unique_name.generate("learning_rate"), + shape=[1], + value=float(self._learning_rate), + dtype='float32' if self._dtype is None else self._dtype, + persistable=True) def _global_learning_rate(self, program=None): """ diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py new file mode 100644 index 00000000000..d0a5a883174 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -0,0 +1,207 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import unittest +import numpy as np +import six + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC +from paddle.fluid.imperative.base import to_variable +from test_imperative_base import new_program_scope + + +class SimpleImgConvPool(fluid.imperative.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + pool_size, + pool_stride, + pool_padding=0, + pool_type='max', + global_pooling=False, + conv_stride=1, + conv_padding=0, + conv_dilation=1, + conv_groups=1, + act=None, + use_cudnn=False, + param_attr=None, + bias_attr=None): + super(SimpleImgConvPool, self).__init__() + + self._conv2d = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=conv_stride, + padding=conv_padding, + dilation=conv_dilation, + groups=conv_groups, + param_attr=None, + bias_attr=None, + use_cudnn=use_cudnn) + + self._pool2d = Pool2D( + pool_size=pool_size, + pool_type=pool_type, + pool_stride=pool_stride, + pool_padding=pool_padding, + global_pooling=global_pooling, + use_cudnn=use_cudnn) + + def forward(self, inputs): + x = self._conv2d(inputs) + x = self._pool2d(x) + return x + + +class MNIST(fluid.imperative.Layer): + def __init__(self, param_attr=None, bias_attr=None): + super(MNIST, self).__init__() + + self._simple_img_conv_pool_1 = SimpleImgConvPool( + 1, 20, 5, 2, 2, act="relu") + + self._simple_img_conv_pool_2 = SimpleImgConvPool( + 20, 50, 5, 2, 2, act="relu") + + pool_2_shape = 50 * 8 * 8 + SIZE = 10 + scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 + self._fc = FC(10, + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale))) + + def forward(self, inputs): + x = self._simple_img_conv_pool_1(inputs) + x = self._simple_img_conv_pool_2(x) + x = self._fc(x) + return x + + +class TestImperativeMnist(unittest.TestCase): + def test_mnist_cpu_float32(self): + seed = 90 + + with fluid.imperative.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + mnist = MNIST() + sgd = SGDOptimizer(learning_rate=1e-3) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128) + + dy_param_init_value = {} + for batch_id, data in enumerate(train_reader()): + if batch_id >= 2: + break + + x_data = np.array( + [x[0].reshape(1, 28, 28) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + 128, 1) + + img = to_variable(x_data) + label = to_variable(y_data) + label._stop_gradient = True + + cost = mnist(img) + loss = fluid.layers.cross_entropy(cost, label) + avg_loss = fluid.layers.mean(loss) + dy_out = avg_loss._numpy() + + if batch_id == 0: + for param in fluid.default_main_program().global_block( + ).all_parameters(): + dy_param_init_value[param.name] = param._numpy() + + avg_loss._backward() + sgd.minimize(avg_loss) + dy_param_value = {} + for param in fluid.default_main_program().global_block( + ).all_parameters(): + dy_param_value[param.name] = param._numpy() + + with new_program_scope(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) + + mnist = MNIST() + sgd = SGDOptimizer(learning_rate=1e-3) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128) + + img = fluid.layers.data( + name='pixel', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + cost = mnist(img) + loss = fluid.layers.cross_entropy(cost, label) + avg_loss = fluid.layers.mean(loss) + sgd.minimize(avg_loss) + + # initialize params and fetch them + static_param_init_value = {} + static_param_name_list = [] + for param in fluid.default_startup_program().global_block( + ).all_parameters(): + static_param_name_list.append(param.name) + + out = exe.run(fluid.default_startup_program(), + fetch_list=static_param_name_list) + + for i in range(len(static_param_name_list)): + static_param_init_value[static_param_name_list[i]] = out[i] + + for batch_id, data in enumerate(train_reader()): + if batch_id >= 2: + break + + x_data = np.array( + [x[0].reshape(1, 28, 28) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + [128, 1]) + + fetch_list = [avg_loss.name] + fetch_list.extend(static_param_name_list) + out = exe.run(fluid.default_main_program(), + feed={"pixel": x_data, + "label": y_data}, + fetch_list=fetch_list) + + static_param_value = {} + static_out = out[0] + for i in range(1, len(out)): + static_param_value[static_param_name_list[i - 1]] = out[i] + + for key, value in six.iteritems(static_param_init_value): + self.assertTrue( + np.allclose(value.all(), dy_param_init_value[key].all())) + self.assertTrue(np.allclose(static_out.all(), dy_out.all())) + for key, value in six.iteritems(static_param_value): + self.assertTrue(np.allclose(value.all(), dy_param_value[key].all())) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index d0a5a883174..ec4c49a9fff 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -21,98 +21,44 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC +from paddle.fluid.imperative.nn import FC from paddle.fluid.imperative.base import to_variable from test_imperative_base import new_program_scope -class SimpleImgConvPool(fluid.imperative.Layer): - def __init__(self, - num_channels, - num_filters, - filter_size, - pool_size, - pool_stride, - pool_padding=0, - pool_type='max', - global_pooling=False, - conv_stride=1, - conv_padding=0, - conv_dilation=1, - conv_groups=1, - act=None, - use_cudnn=False, - param_attr=None, - bias_attr=None): - super(SimpleImgConvPool, self).__init__() - - self._conv2d = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, - stride=conv_stride, - padding=conv_padding, - dilation=conv_dilation, - groups=conv_groups, - param_attr=None, - bias_attr=None, - use_cudnn=use_cudnn) - - self._pool2d = Pool2D( - pool_size=pool_size, - pool_type=pool_type, - pool_stride=pool_stride, - pool_padding=pool_padding, - global_pooling=global_pooling, - use_cudnn=use_cudnn) - - def forward(self, inputs): - x = self._conv2d(inputs) - x = self._pool2d(x) - return x - - -class MNIST(fluid.imperative.Layer): +class MLP(fluid.imperative.Layer): def __init__(self, param_attr=None, bias_attr=None): - super(MNIST, self).__init__() - - self._simple_img_conv_pool_1 = SimpleImgConvPool( - 1, 20, 5, 2, 2, act="relu") + self._fc1 = FC(10) + self._fc2 = FC(10) - self._simple_img_conv_pool_2 = SimpleImgConvPool( - 20, 50, 5, 2, 2, act="relu") + def forward(self, inputs): + y = self._fc1(inputs) + y = self._fc2(y) + return y - pool_2_shape = 50 * 8 * 8 - SIZE = 10 - scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 - self._fc = FC(10, - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=scale))) - def forward(self, inputs): - x = self._simple_img_conv_pool_1(inputs) - x = self._simple_img_conv_pool_2(x) - x = self._fc(x) - return x +class TestImperativeOptimizerBase(unittest.TestCase): + def setUp(self): + self.batch_num = 2 + def get_optimizer(self): + self.optimizer = SGDOptimizer(learning_rate=1e-3) -class TestImperativeMnist(unittest.TestCase): - def test_mnist_cpu_float32(self): + def test_optimizer_float32(self): seed = 90 with fluid.imperative.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - mnist = MNIST() - sgd = SGDOptimizer(learning_rate=1e-3) + mlp = MLP() + self.get_optimizer() train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=128) dy_param_init_value = {} for batch_id, data in enumerate(train_reader()): - if batch_id >= 2: + if batch_id >= self.batch_num: break x_data = np.array( @@ -124,9 +70,8 @@ class TestImperativeMnist(unittest.TestCase): label = to_variable(y_data) label._stop_gradient = True - cost = mnist(img) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + cost = mlp(img) + avg_loss = fluid.layers.reduce_mean(cost) dy_out = avg_loss._numpy() if batch_id == 0: @@ -135,7 +80,8 @@ class TestImperativeMnist(unittest.TestCase): dy_param_init_value[param.name] = param._numpy() avg_loss._backward() - sgd.minimize(avg_loss) + self.optimizer.minimize(avg_loss) + dy_param_value = {} for param in fluid.default_main_program().global_block( ).all_parameters(): @@ -149,7 +95,7 @@ class TestImperativeMnist(unittest.TestCase): ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) mnist = MNIST() - sgd = SGDOptimizer(learning_rate=1e-3) + self.get_optimizer() train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=128) @@ -157,9 +103,8 @@ class TestImperativeMnist(unittest.TestCase): name='pixel', shape=[1, 28, 28], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') cost = mnist(img) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) - sgd.minimize(avg_loss) + avg_loss = fluid.layers.reduce_mean(cost) + self.optimizer.minimize(avg_loss) # initialize params and fetch them static_param_init_value = {} @@ -175,7 +120,7 @@ class TestImperativeMnist(unittest.TestCase): static_param_init_value[static_param_name_list[i]] = out[i] for batch_id, data in enumerate(train_reader()): - if batch_id >= 2: + if batch_id >= self.batch_num: break x_data = np.array( -- GitLab