From f48159ade0f50b2d056f274ad36d40ec0075c8a7 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 2 Nov 2017 09:26:35 +0800 Subject: [PATCH] Optimizer use init program (#5275) * optimizer use init_program * create persistable variable * add create_persistable_var to block * optimizer use create_persistable_var * fix prefix * move create_global_persistable_var from Block to LayerHelper * Polish Optimizer initialization code. * Using the LayerHelper to create initialize operator and variables * add_accumulator should use an independent data type * default use param data type for accumulator --- python/paddle/v2/framework/framework.py | 5 + python/paddle/v2/framework/layer_helper.py | 23 +- python/paddle/v2/framework/optimizer.py | 234 ++++++++---------- .../v2/framework/tests/test_fit_a_line.py | 2 +- .../tests/test_image_classification_train.py | 2 +- .../tests/test_inference_model_io.py | 2 +- .../v2/framework/tests/test_optimizer.py | 90 +++++-- .../tests/test_recognize_digits_conv.py | 6 +- .../tests/test_recognize_digits_mlp.py | 5 +- .../v2/framework/tests/test_word2vec.py | 2 +- 10 files changed, 213 insertions(+), 158 deletions(-) diff --git a/python/paddle/v2/framework/framework.py b/python/paddle/v2/framework/framework.py index 7da6f81359..b50b215333 100644 --- a/python/paddle/v2/framework/framework.py +++ b/python/paddle/v2/framework/framework.py @@ -7,6 +7,11 @@ import copy __all__ = ['Block', 'Variable', 'Program', 'Operator'] +def unique_name(prefix): + uid = core.unique_integer(prefix) # unique during whole process. + return "_".join([prefix, str(uid)]) + + class Variable(object): def __init__(self, block, diff --git a/python/paddle/v2/framework/layer_helper.py b/python/paddle/v2/framework/layer_helper.py index 45d9cf3f48..aa7dd0b50d 100644 --- a/python/paddle/v2/framework/layer_helper.py +++ b/python/paddle/v2/framework/layer_helper.py @@ -1,19 +1,12 @@ import copy import itertools -import paddle.v2.framework.core as core - from paddle.v2.framework.framework import Variable, g_program, \ - g_init_program + g_init_program, unique_name, Program from paddle.v2.framework.initializer import ConstantInitializer, \ UniformInitializer -def unique_name(prefix): - uid = core.unique_integer(prefix) # unique during whole process. - return "_".join([prefix, str(uid)]) - - class LayerHelper(object): def __init__(self, layer_type, **kwargs): self.kwargs = kwargs @@ -138,9 +131,19 @@ class LayerHelper(object): def create_variable(self, *args, **kwargs): return self.program.current_block().create_var(*args, **kwargs) - def create_global_variable(self, *args, **kwargs): + def create_global_variable(self, persistable=False, *args, **kwargs): return self.program.global_block().create_var( - *args, persistable=False, **kwargs) + *args, persistable=persistable, **kwargs) + + def set_variable_initializer(self, var, initializer): + assert isinstance(var, Variable) + self.init_program.global_block().create_var( + name=var.name, + type=var.type, + dtype=var.data_type, + shape=var.shape, + persistable=True, + initializer=initializer) def append_bias_op(self, input_var, num_flatten_dims=None): """ diff --git a/python/paddle/v2/framework/optimizer.py b/python/paddle/v2/framework/optimizer.py index 4c608f96bd..902442297e 100644 --- a/python/paddle/v2/framework/optimizer.py +++ b/python/paddle/v2/framework/optimizer.py @@ -1,8 +1,11 @@ from collections import defaultdict import paddle.v2.framework.framework as framework +from paddle.v2.framework.framework import unique_name, Program from paddle.v2.framework.backward import append_backward_ops +from paddle.v2.framework.initializer import ConstantInitializer from paddle.v2.framework.regularizer import append_regularization_ops +from paddle.v2.framework.layer_helper import LayerHelper __all__ = [ 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer', @@ -25,6 +28,7 @@ class Optimizer(object): # to train. These variables are called accumulators. # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} self._accumulators = defaultdict(lambda: dict()) + self.helper = None def _append_optimize_op(self, block, param_and_grad): """ append optimize operator to block and return all the added optimize_op @@ -63,7 +67,7 @@ class Optimizer(object): """ pass - def _add_accumulator(self, block, name, param, dtype=None, fill_value=0.0): + def _add_accumulator(self, name, param, dtype=None, fill_value=0.0): """Utility function to add an accumulator for a parameter Args: @@ -77,22 +81,17 @@ class Optimizer(object): param.name in self._accumulators[name]): raise Exception("Accumulator {} already exists for parmeter {}". format(name, param.name)) - global_block = block.program.global_block() - param_shape = list(param.shape) - param_acc = global_block.create_var( - dtype=dtype, shape=param_shape, lod_level=0) - - # Initialize the accumulator with fill_value - # FIXME: Fix when Initialization design has been implemented - # https://github.com/PaddlePaddle/Paddle/pull/4852 - global_block.append_op( - type="fill_constant", - outputs={"Out": param_acc}, - attrs={"shape": param_shape, - "value": fill_value}) - - # Add to accumulators dict - self._accumulators[name][param.name] = param_acc + + assert isinstance(self.helper, LayerHelper) + var = self.helper.create_global_variable( + name=unique_name(name), + persistable=True, + dtype=dtype or param.data_type, + type=param.type, + shape=param.shape) + self.helper.set_variable_initializer( + var, initializer=ConstantInitializer(value=float(fill_value))) + self._accumulators[name][param.name] = var def _get_accumulator(self, name, param): """Utility function to fetch an accumulator for a parameter @@ -130,7 +129,10 @@ class Optimizer(object): return increment_op - def create_optimization_pass(self, parameters_and_grads, loss): + def create_optimization_pass(self, + parameters_and_grads, + loss, + init_program=None): """Add optimization operators to update gradients to variables. Args: @@ -142,6 +144,7 @@ class Optimizer(object): optimization. This will include parameter update ops, global step update ops and any other custom ops required by subclasses to manage their internal state. + :param init_program: """ # This is a default implementation of create_optimization_pass that # can be shared by most optimizers. This implementation assumes that @@ -151,6 +154,9 @@ class Optimizer(object): # for parameters and extend _finish_update method to add custom ops. # Create any accumulators + program = loss.block.program + self.helper = LayerHelper( + self.__class__.__name__, program=program, init_program=init_program) self._create_accumulators(loss.block, [p[0] for p in parameters_and_grads]) # Create any necessary tensors @@ -177,7 +183,11 @@ class Optimizer(object): return_ops.append(self._increment_global_step(loss.block)) return return_ops - def minimize(self, loss, parameter_list=None, no_grad_set=None): + def minimize(self, + loss, + init_program=None, + parameter_list=None, + no_grad_set=None): """Add operations to minimize `loss` by updating `parameter_list`. This method combines interface `append_backward_ops()` and @@ -187,7 +197,8 @@ class Optimizer(object): set()) # Add regularization if any params_grads = append_regularization_ops(params_grads) - optimize_ops = self.create_optimization_pass(params_grads, loss) + optimize_ops = self.create_optimization_pass(params_grads, loss, + init_program) return optimize_ops @@ -202,24 +213,19 @@ class SGDOptimizer(Optimizer): self._learning_rate = learning_rate def _initialize_tensors(self, block): - assert isinstance(block, framework.Block) lr_shape = [1] # create a variable for learning_rate - self._lr = block.create_var( - dtype="float32", shape=lr_shape, lod_level=0) - - # create an op to init the learning_rate - # FIXME: Fix when Initialization design has been implemented - # https://github.com/PaddlePaddle/Paddle/pull/4852 - block.append_op( - type="fill_constant", - outputs={"Out": self._lr}, - attrs={"shape": lr_shape, - "value": self._learning_rate}) + self._lr = self.helper.create_global_variable( + name=unique_name("learning_rate"), + dtype='float32', + shape=lr_shape, + lod_level=1, + persistable=True) + self.helper.set_variable_initializer( + var=self._lr, initializer=ConstantInitializer(self._learning_rate)) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) - # create the optimize op sgd_op = block.append_op( type=self.type, @@ -255,23 +261,20 @@ class MomentumOptimizer(Optimizer): assert isinstance(block, framework.Block) lr_shape = [1] # create a variable for learning_rate - self._lr = block.create_var( - dtype="float32", shape=lr_shape, lod_level=0) - - # create an op to init the learning_rate - # FIXME: Fix when Initialization design has been implemented - # https://github.com/PaddlePaddle/Paddle/pull/4852 - block.append_op( - type="fill_constant", - outputs={"Out": self._lr}, - attrs={"shape": lr_shape, - "value": self._learning_rate}) + self._lr = self.helper.create_global_variable( + name=unique_name("learning_rate"), + dtype='float32', + shape=lr_shape, + lod_level=1, + persistable=True) + self.helper.set_variable_initializer( + var=self._lr, initializer=ConstantInitializer(self._learning_rate)) def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) for p in parameters: - self._add_accumulator(block, self._velocity_acc_str, p, 'float32') + self._add_accumulator(self._velocity_acc_str, p) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -311,26 +314,22 @@ class AdagradOptimizer(Optimizer): self._epsilon = epsilon def _initialize_tensors(self, block): - assert isinstance(block, framework.Block) lr_shape = [1] # create a variable for learning_rate - self._lr = block.create_var( - dtype="float32", shape=lr_shape, lod_level=0) - - # create an op to init the learning_rate - # FIXME: Fix when Initialization design has been implemented - # https://github.com/PaddlePaddle/Paddle/pull/4852 - block.append_op( - type="fill_constant", - outputs={"Out": self._lr}, - attrs={"shape": lr_shape, - "value": self._learning_rate}) + self._lr = self.helper.create_global_variable( + name=unique_name("learning_rate"), + dtype='float32', + shape=lr_shape, + lod_level=1, + persistable=True) + self.helper.set_variable_initializer( + var=self._lr, initializer=ConstantInitializer(self._learning_rate)) def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) for p in parameters: - self._add_accumulator(block, self._moment_acc_str, p, 'float32') + self._add_accumulator(self._moment_acc_str, p) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -378,51 +377,46 @@ class AdamOptimizer(Optimizer): self._epsilon = epsilon def _initialize_tensors(self, block): - assert isinstance(block, framework.Block) lr_shape = [1] # create a variable for learning_rate - self._lr = block.create_var( - dtype="float32", shape=lr_shape, lod_level=0) - - # create an op to init the learning_rate - # FIXME: Fix when Initialization design has been implemented - # https://github.com/PaddlePaddle/Paddle/pull/4852 - block.append_op( - type="fill_constant", - outputs={"Out": self._lr}, - attrs={"shape": lr_shape, - "value": self._learning_rate}) + self._lr = self.helper.create_global_variable( + name=unique_name("learning_rate"), + dtype='float32', + shape=lr_shape, + lod_level=1, + persistable=True) + self.helper.set_variable_initializer( + var=self._lr, initializer=ConstantInitializer(self._learning_rate)) def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) - global_block = block.program.global_block() + main_block = block.program.global_block() # Create beta1 and beta2 power tensors beta_shape = [1] - # Create variables for beta1 and beta2 powers - self._beta1_pow_acc = global_block.create_var( - dtype="float32", shape=beta_shape, lod_level=0) - self._beta2_pow_acc = global_block.create_var( - dtype="float32", shape=beta_shape, lod_level=0) - - # Initialize beta1 and beta2 power accumulators - # FIXME: Fix when Initialization design has been implemented - # https://github.com/PaddlePaddle/Paddle/pull/4852 - global_block.append_op( - type="fill_constant", - outputs={"Out": self._beta1_pow_acc}, - attrs={"shape": beta_shape, - "value": self._beta1}) - global_block.append_op( - type="fill_constant", - outputs={"Out": self._beta2_pow_acc}, - attrs={"shape": beta_shape, - "value": self._beta2}) + self._beta1_pow_acc = self.helper.create_global_variable( + name=unique_name('beta1_pow_acc'), + dtype='float32', + shape=beta_shape, + lod_level=0, + persistable=True) + self.helper.set_variable_initializer( + self._beta1_pow_acc, initializer=ConstantInitializer(self._beta1)) + + self._beta2_pow_acc = self.helper.create_global_variable( + name=unique_name('beta2_pow_acc'), + dtype='float32', + shape=beta_shape, + lod_level=0, + persistable=True) + + self.helper.set_variable_initializer( + self._beta2_pow_acc, initializer=ConstantInitializer(self._beta2)) # Create accumulator tensors for first and second moments for p in parameters: - self._add_accumulator(block, self._moment1_acc_str, p, 'float32') - self._add_accumulator(block, self._moment2_acc_str, p, 'float32') + self._add_accumulator(self._moment1_acc_str, p) + self._add_accumulator(self._moment2_acc_str, p) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -460,14 +454,14 @@ class AdamOptimizer(Optimizer): """Update Beta1 and Beta2 Power accumulators """ assert isinstance(block, framework.Block) - global_block = block.program.global_block() - scale_beta1 = global_block.append_op( + main_block = block.program.global_block() + scale_beta1 = main_block.append_op( type="scale", inputs={"X": self._beta1_pow_acc}, outputs={"Out": self._beta1_pow_acc}, attrs={"scale": self._beta1}) - scale_beta2 = global_block.append_op( + scale_beta2 = main_block.append_op( type="scale", inputs={"X": self._beta2_pow_acc}, outputs={"Out": self._beta2_pow_acc}, @@ -500,43 +494,33 @@ class AdamaxOptimizer(Optimizer): self._epsilon = epsilon def _initialize_tensors(self, block): - assert isinstance(block, framework.Block) lr_shape = [1] # create a variable for learning_rate - self._lr = block.create_var( - dtype="float32", shape=lr_shape, lod_level=0) - - # create an op to init the learning_rate - # FIXME: Fix when Initialization design has been implemented - # https://github.com/PaddlePaddle/Paddle/pull/4852 - block.append_op( - type="fill_constant", - outputs={"Out": self._lr}, - attrs={"shape": lr_shape, - "value": self._learning_rate}) + self._lr = self.helper.create_global_variable( + name=unique_name("learning_rate"), + dtype='float32', + shape=lr_shape, + lod_level=1, + persistable=True) + self.helper.set_variable_initializer( + var=self._lr, initializer=ConstantInitializer(self._learning_rate)) def _create_accumulators(self, block, parameters): - assert isinstance(block, framework.Block) - - global_block = block.program.global_block() # Create beta1 power accumulator tensor beta_shape = [1] - self._beta1_pow_acc = global_block.create_var( - dtype="float32", shape=beta_shape, lod_level=0) - - # Initialize beta1 power accumulator - # FIXME: Fix when Initialization design has been implemented - # https://github.com/PaddlePaddle/Paddle/pull/4852 - global_block.append_op( - type="fill_constant", - outputs={"Out": self._beta1_pow_acc}, - attrs={"shape": beta_shape, - "value": self._beta1}) + self._beta1_pow_acc = self.helper.create_global_variable( + name=unique_name('beta1_pow_acc'), + dtype='float32', + shape=beta_shape, + lod_level=0, + persistable=True) + self.helper.set_variable_initializer( + self._beta1_pow_acc, initializer=ConstantInitializer(self._beta1)) # Create accumulator tensors for first moment and infinity norm for p in parameters: - self._add_accumulator(block, self._moment_acc_str, p, 'float32') - self._add_accumulator(block, self._inf_norm_acc_str, p, 'float32') + self._add_accumulator(self._moment_acc_str, p) + self._add_accumulator(self._inf_norm_acc_str, p) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -572,8 +556,8 @@ class AdamaxOptimizer(Optimizer): """Update Beta1 Power accumulator """ assert isinstance(block, framework.Block) - global_block = block.program.global_block() - scale_beta1 = global_block.append_op( + main_block = block.program.global_block() + scale_beta1 = main_block.append_op( type="scale", inputs={"X": self._beta1_pow_acc}, outputs={"Out": self._beta1_pow_acc}, diff --git a/python/paddle/v2/framework/tests/test_fit_a_line.py b/python/paddle/v2/framework/tests/test_fit_a_line.py index 7c2ef61fe1..944240629c 100644 --- a/python/paddle/v2/framework/tests/test_fit_a_line.py +++ b/python/paddle/v2/framework/tests/test_fit_a_line.py @@ -36,7 +36,7 @@ cost = layers.square_error_cost( avg_cost = layers.mean(x=cost, program=program, init_program=init_program) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) -opts = sgd_optimizer.minimize(avg_cost) +opts = sgd_optimizer.minimize(avg_cost, init_program) BATCH_SIZE = 20 diff --git a/python/paddle/v2/framework/tests/test_image_classification_train.py b/python/paddle/v2/framework/tests/test_image_classification_train.py index 6b6dec4976..21adc7f38f 100644 --- a/python/paddle/v2/framework/tests/test_image_classification_train.py +++ b/python/paddle/v2/framework/tests/test_image_classification_train.py @@ -208,7 +208,7 @@ cost = layers.cross_entropy( avg_cost = layers.mean(x=cost, program=program, init_program=init_program) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) -opts = sgd_optimizer.minimize(avg_cost) +opts = sgd_optimizer.minimize(avg_cost, init_program) BATCH_SIZE = 128 PASS_NUM = 1 diff --git a/python/paddle/v2/framework/tests/test_inference_model_io.py b/python/paddle/v2/framework/tests/test_inference_model_io.py index 4487ab989f..e9c9cd27d9 100644 --- a/python/paddle/v2/framework/tests/test_inference_model_io.py +++ b/python/paddle/v2/framework/tests/test_inference_model_io.py @@ -44,7 +44,7 @@ class TestBook(unittest.TestCase): x=cost, program=program, init_program=init_program) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) - opts = sgd_optimizer.minimize(avg_cost) + opts = sgd_optimizer.minimize(avg_cost, init_program) place = core.CPUPlace() exe = executor.Executor(place) diff --git a/python/paddle/v2/framework/tests/test_optimizer.py b/python/paddle/v2/framework/tests/test_optimizer.py index 45396c9bec..9333df8f7f 100644 --- a/python/paddle/v2/framework/tests/test_optimizer.py +++ b/python/paddle/v2/framework/tests/test_optimizer.py @@ -7,6 +7,7 @@ from paddle.v2.framework.backward import append_backward_ops class TestOptimizer(unittest.TestCase): def test_sgd_optimizer(self): + init_program = framework.Program() program = framework.Program() block = program.global_block() mul_x = block.create_parameter( @@ -22,12 +23,13 @@ class TestOptimizer(unittest.TestCase): outputs={"Out": mul_out}, attrs={"x_num_col_dims": 1}) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) - opts = sgd_optimizer.minimize(mul_out) + opts = sgd_optimizer.minimize(mul_out, init_program) self.assertEqual(len(opts), 1) sgd_op = opts[0] self.assertEqual(sgd_op.type, "sgd") def test_sgd_optimizer_with_global_step(self): + init_program = framework.Program() program = framework.Program() block = program.global_block() mul_x = block.create_parameter( @@ -44,15 +46,22 @@ class TestOptimizer(unittest.TestCase): attrs={"x_num_col_dims": 1}) global_step = block.create_var( dtype="float32", shape=[1], lod_level=0, name="step") + learning_rate = 0.01 sgd_optimizer = optimizer.SGDOptimizer( - learning_rate=0.01, global_step=global_step) - opts = sgd_optimizer.minimize(mul_out) + learning_rate=learning_rate, global_step=global_step) + opts = sgd_optimizer.minimize(mul_out, init_program) self.assertEqual(len(opts), 2) sgd_op = opts[0] self.assertEqual(sgd_op.type, "sgd") increment_op = opts[1] self.assertEqual(increment_op.type, "increment") + # Check init_program + init_ops = init_program.global_block().ops + self.assertEqual(len(init_ops), 1) + self.assertEqual(init_ops[0].type, "fill_constant") + self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) + class TestMomentumOptimizer(unittest.TestCase): class MockMomentum(optimizer.MomentumOptimizer): @@ -63,6 +72,7 @@ class TestMomentumOptimizer(unittest.TestCase): return self._velocity_acc_str def test_vanilla_momentum_optimizer(self): + init_program = framework.Program() program = framework.Program() block = program.global_block() mul_x = block.create_parameter( @@ -77,12 +87,14 @@ class TestMomentumOptimizer(unittest.TestCase): "Y": mul_y}, outputs={"Out": mul_out}, attrs={"x_num_col_dims": 1}) - momentum_optimizer = self.MockMomentum(learning_rate=0.01, momentum=0.2) + learning_rate = 0.01 + momentum_optimizer = self.MockMomentum( + learning_rate=learning_rate, momentum=0.2) params_grads = append_backward_ops(mul_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) - opts = momentum_optimizer.create_optimization_pass(params_grads, - mul_out) + opts = momentum_optimizer.create_optimization_pass( + params_grads, mul_out, init_program) self.assertEqual(len(opts), 1) sgd_op = opts[0] self.assertEqual(sgd_op.type, "momentum") @@ -96,7 +108,16 @@ class TestMomentumOptimizer(unittest.TestCase): self.assertEqual(len(velocity_acc), 1) self.assertTrue(mul_x.name in velocity_acc) + # Check init_program + init_ops = init_program.global_block().ops + self.assertEqual(len(init_ops), 2) + self.assertEqual(init_ops[0].type, "fill_constant") + self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) + self.assertEqual(init_ops[1].type, "fill_constant") + self.assertAlmostEqual(init_ops[1].attr('value'), 0.0) + def test_nesterov_momentum_optimizer(self): + init_program = framework.Program() program = framework.Program() block = program.global_block() mul_x = block.create_parameter( @@ -111,13 +132,14 @@ class TestMomentumOptimizer(unittest.TestCase): "Y": mul_y}, outputs={"Out": mul_out}, attrs={"x_num_col_dims": 1}) + learning_rate = 0.01 momentum_optimizer = self.MockMomentum( - learning_rate=0.01, momentum=0.2, use_nesterov=True) + learning_rate=learning_rate, momentum=0.2, use_nesterov=True) params_grads = append_backward_ops(mul_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) - opts = momentum_optimizer.create_optimization_pass(params_grads, - mul_out) + opts = momentum_optimizer.create_optimization_pass( + params_grads, mul_out, init_program) self.assertEqual(len(opts), 1) sgd_op = opts[0] self.assertEqual(sgd_op.type, "momentum") @@ -131,6 +153,14 @@ class TestMomentumOptimizer(unittest.TestCase): self.assertEqual(len(velocity_acc), 1) self.assertTrue(mul_x.name in velocity_acc) + # Check init_program + init_ops = init_program.global_block().ops + self.assertEqual(len(init_ops), 2) + self.assertEqual(init_ops[0].type, "fill_constant") + self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) + self.assertEqual(init_ops[1].type, "fill_constant") + self.assertAlmostEqual(init_ops[1].attr('value'), 0.0) + class TestAdagradOptimizer(unittest.TestCase): class MockAdagrad(optimizer.AdagradOptimizer): @@ -141,6 +171,7 @@ class TestAdagradOptimizer(unittest.TestCase): return self._moment_acc_str def test_adagrad_optimizer(self): + init_program = framework.Program() program = framework.Program() block = program.global_block() mul_x = block.create_parameter( @@ -155,11 +186,14 @@ class TestAdagradOptimizer(unittest.TestCase): "Y": mul_y}, outputs={"Out": mul_out}, attrs={"x_num_col_dims": 1}) - adagrad_optimizer = self.MockAdagrad(learning_rate=0.01, epsilon=1.0e-6) + learning_rate = 0.01 + adagrad_optimizer = self.MockAdagrad( + learning_rate=learning_rate, epsilon=1.0e-6) params_grads = append_backward_ops(mul_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0) - opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out) + opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out, + init_program) self.assertEqual(len(opts), 1) adagrad_op = opts[0] self.assertEqual(adagrad_op.type, "adagrad") @@ -172,6 +206,14 @@ class TestAdagradOptimizer(unittest.TestCase): self.assertEqual(len(moment_acc), 1) self.assertTrue(mul_x.name in moment_acc) + # Check init_program + init_ops = init_program.global_block().ops + self.assertEqual(len(init_ops), 2) + self.assertEqual(init_ops[0].type, "fill_constant") + self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) + self.assertEqual(init_ops[1].type, "fill_constant") + self.assertAlmostEqual(init_ops[1].attr('value'), 0.0) + class TestAdamOptimizer(unittest.TestCase): class MockAdam(optimizer.AdamOptimizer): @@ -185,6 +227,7 @@ class TestAdamOptimizer(unittest.TestCase): return self._moment2_acc_str def test_adam_optimizer(self): + init_program = framework.Program() program = framework.Program() block = program.global_block() mul_x = block.create_parameter( @@ -199,12 +242,14 @@ class TestAdamOptimizer(unittest.TestCase): "Y": mul_y}, outputs={"Out": mul_out}, attrs={"x_num_col_dims": 1}) + learning_rate = 0.01 adam_optimizer = self.MockAdam( - learning_rate=0.01, beta1=0.9, beta2=0.999) + learning_rate=learning_rate, beta1=0.9, beta2=0.999) params_grads = append_backward_ops(mul_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adam_optimizer.get_accumulators()), 0) - opts = adam_optimizer.create_optimization_pass(params_grads, mul_out) + opts = adam_optimizer.create_optimization_pass(params_grads, mul_out, + init_program) self.assertEqual(len(opts), 3) adam_op = opts[0] self.assertEqual(adam_op.type, "adam") @@ -221,6 +266,12 @@ class TestAdamOptimizer(unittest.TestCase): self.assertTrue(mul_x.name in moment1_acc) self.assertTrue(mul_x.name in moment2_acc) + # Check init_program + init_ops = init_program.global_block().ops + self.assertEqual(len(init_ops), 5) + self.assertEqual(init_ops[0].type, "fill_constant") + self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) + class TestAdamaxOptimizer(unittest.TestCase): class MockAdamax(optimizer.AdamaxOptimizer): @@ -234,6 +285,7 @@ class TestAdamaxOptimizer(unittest.TestCase): return self._inf_norm_acc_str def test_adamax_optimizer(self): + init_program = framework.Program() program = framework.Program() block = program.global_block() mul_x = block.create_parameter( @@ -248,12 +300,14 @@ class TestAdamaxOptimizer(unittest.TestCase): "Y": mul_y}, outputs={"Out": mul_out}, attrs={"x_num_col_dims": 1}) + learning_rate = 0.01 adamax_optimizer = self.MockAdamax( - learning_rate=0.01, beta1=0.9, beta2=0.999) + learning_rate=learning_rate, beta1=0.9, beta2=0.999) params_grads = append_backward_ops(mul_out) self.assertEqual(len(params_grads), 1) self.assertEqual(len(adamax_optimizer.get_accumulators()), 0) - opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out) + opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out, + init_program) self.assertEqual(len(opts), 2) adam_op = opts[0] self.assertEqual(adam_op.type, "adamax") @@ -270,6 +324,12 @@ class TestAdamaxOptimizer(unittest.TestCase): self.assertTrue(mul_x.name in moment_acc) self.assertTrue(mul_x.name in inf_norm_acc) + # Check init_program + init_ops = init_program.global_block().ops + self.assertEqual(len(init_ops), 4) + self.assertEqual(init_ops[0].type, "fill_constant") + self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py index 92b1d05426..695236f3df 100644 --- a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py +++ b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py @@ -54,8 +54,10 @@ avg_cost = layers.mean(x=cost, program=program) accuracy = layers.accuracy( input=predict, label=label, program=program, init_program=init_program) -sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) -opts = sgd_optimizer.minimize(avg_cost) +# optimizer = optimizer.MomentumOptimizer(learning_rate=0.1 / 128.0, +# momentum=0.9) +optimizer = optimizer.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999) +opts = optimizer.minimize(avg_cost, init_program) BATCH_SIZE = 50 PASS_NUM = 3 diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py b/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py index 9916569d04..c116d1a6d3 100644 --- a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py +++ b/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py @@ -58,8 +58,8 @@ cost = layers.cross_entropy( input=predict, label=label, program=program, init_program=init_program) avg_cost = layers.mean(x=cost, program=program, init_program=init_program) -sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) -opts = sgd_optimizer.minimize(avg_cost) +optimizer = optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9) +opts = optimizer.minimize(avg_cost, init_program) train_reader = paddle.batch( paddle.reader.shuffle( @@ -89,6 +89,7 @@ for pass_id in range(PASS_NUM): 'y': tensor_y}, fetch_list=[avg_cost]) out = np.array(outs[0]) + if out[0] < 5.0: exit(0) # if avg cost less than 5.0, we think our code is good. exit(1) diff --git a/python/paddle/v2/framework/tests/test_word2vec.py b/python/paddle/v2/framework/tests/test_word2vec.py index 515d30d3e2..2aaf8d6a2b 100644 --- a/python/paddle/v2/framework/tests/test_word2vec.py +++ b/python/paddle/v2/framework/tests/test_word2vec.py @@ -109,7 +109,7 @@ cost = layers.cross_entropy( avg_cost = layers.mean(x=cost, program=program, init_program=init_program) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) -opts = sgd_optimizer.minimize(avg_cost) +opts = sgd_optimizer.minimize(avg_cost, init_program) train_reader = paddle.batch( paddle.dataset.imikolov.train(word_dict, N), batch_size) -- GitLab