diff --git a/python/paddle/v2/framework/optimizer.py b/python/paddle/v2/framework/optimizer.py index ba2713e34dbfeca6990c49d0388e0886426b921a..f7d35ca06586a2558277fa54da711de1d1ee5180 100644 --- a/python/paddle/v2/framework/optimizer.py +++ b/python/paddle/v2/framework/optimizer.py @@ -1,7 +1,9 @@ import paddle.v2.framework.framework as framework from collections import defaultdict -__all__ = ['SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer'] +__all__ = [ + 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer' +] class Optimizer(object): @@ -43,6 +45,19 @@ class Optimizer(object): """ pass + def _finish_update(self, block): + """Finish any custom updates needed + before completing an optimization step + + Args: + block: the block in which the loss variable is present + parameters: list of parameter variables for the optimizer + + Returns: + list of finish ops or None + """ + pass + def _add_accumulator(self, block, name, param, dtype=None, fill_value=0.0): """Utility function to add an accumulator for a parameter @@ -137,15 +152,17 @@ class Optimizer(object): parameters_and_grads: a list of (variable, gradient) pair to update. Returns: - optmization_op_list: a list of optimization operator that will update - parameter using gradient. + return_op_list: a list of operators that will complete one step of + optimization. This will include parameter update ops, global step + update ops and any other custom ops required by subclasses to manage + their internal state. """ # This is a default implementation of create_optimization_pass that # can be shared by most optimizers. This implementation assumes that # the subclass will implement the _append_optimize_op method and the # _initialize_tensors method. The subclass can extend the # _create_accumulators method if it needs to create accumulators - # for parameters. + # for parameters and extend _finish_update method to add custom ops. # Create any accumulators self._create_accumulators(loss.block, @@ -160,7 +177,17 @@ class Optimizer(object): param_and_grad) optimize_ops.append(optimize_op) - return optimize_ops + # Returned list of ops can include more ops in addition + # to optimization ops + return_ops = optimize_ops + + # Get custom finish ops for subclasses + # FIXME: Need to fix this once we figure out how to handle dependencies + finish_ops = self._finish_update(loss.block) + if finish_ops is not None: + return_ops += finish_ops + + return return_ops def minimize(self, loss, parameter_list=None, no_grad_set=None): """Add operations to minimize `loss` by updating `parameter_list`. @@ -329,3 +356,124 @@ class AdagradOptimizer(Optimizer): attrs={"epsilon": self._epsilon}) return adagrad_op + + +class AdamOptimizer(Optimizer): + """Implements the Adam Optimizer + """ + _moment1_acc_str = "moment1" + _moment2_acc_str = "moment2" + + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8): + assert learning_rate is not None + assert beta1 is not None + assert beta2 is not None + assert epsilon is not None + super(AdamOptimizer, self).__init__() + self.type = "adam" + self._learning_rate = learning_rate + self._beta1 = beta1 + self._beta2 = beta2 + self._epsilon = epsilon + + def _initialize_tensors(self, block): + assert isinstance(block, framework.Block) + lr_shape = [1] + # create a variable for learning_rate + self._lr = block.create_var( + dtype="float32", shape=lr_shape, lod_level=0) + + # create an op to init the learning_rate + # FIXME: Fix when Initialization design has been implemented + # https://github.com/PaddlePaddle/Paddle/pull/4852 + block.append_op( + type="fill_constant", + outputs={"Out": self._lr}, + attrs={"shape": lr_shape, + "value": self._learning_rate}) + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + + global_block = block.program.global_block() + # Create beta1 and beta2 power tensors + beta_shape = [1] + # Create variables for beta1 and beta2 powers + self._beta1_pow_acc = global_block.create_var( + dtype="float32", shape=beta_shape, lod_level=0) + self._beta2_pow_acc = global_block.create_var( + dtype="float32", shape=beta_shape, lod_level=0) + + # Initialize beta1 and beta2 power accumulators + # FIXME: Fix when Initialization design has been implemented + # https://github.com/PaddlePaddle/Paddle/pull/4852 + global_block.append_op( + type="fill_constant", + outputs={"Out": self._beta1_pow_acc}, + attrs={"shape": beta_shape, + "value": self._beta1}) + global_block.append_op( + type="fill_constant", + outputs={"Out": self._beta2_pow_acc}, + attrs={"shape": beta_shape, + "value": self._beta2}) + + # Create accumulator tensors for first and second moments + for p in parameters: + self._add_accumulator(block, self._moment1_acc_str, p, 'float32') + self._add_accumulator(block, self._moment2_acc_str, p, 'float32') + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + + moment1 = self._get_accumulator(self._moment1_acc_str, + param_and_grad[0]) + moment2 = self._get_accumulator(self._moment2_acc_str, + param_and_grad[0]) + # create the momentum optimize op + adam_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "LearningRate": self._lr, + "Moment1": moment1, + "Moment2": moment2, + "Beta1Pow": self._beta1_pow_acc, + "Beta2Pow": self._beta2_pow_acc + }, + outputs={ + "ParamOut": param_and_grad[0], + "Moment1Out": moment1, + "Moment2Out": moment2 + }, + attrs={ + "beta1": self._beta1, + "beta2": self._beta2, + "epsilon": self._epsilon + }) + + return adam_op + + def _finish_update(self, block): + """Update Beta1 and Beta2 Power accumulators + """ + assert isinstance(block, framework.Block) + global_block = block.program.global_block() + scale_beta1 = global_block.append_op( + type="scale", + inputs={"X": self._beta1_pow_acc}, + outputs={"Out": self._beta1_pow_acc}, + attrs={"scale": self._beta1}) + + scale_beta2 = global_block.append_op( + type="scale", + inputs={"X": self._beta2_pow_acc}, + outputs={"Out": self._beta2_pow_acc}, + attrs={"scale": self._beta2}) + + return [scale_beta1, scale_beta2] diff --git a/python/paddle/v2/framework/tests/test_optimizer.py b/python/paddle/v2/framework/tests/test_optimizer.py index 3d1715bf627fc018856b80e0e8ff962a7922f193..4b267598efb84abeb30b5c0d82e6414984ddf19f 100644 --- a/python/paddle/v2/framework/tests/test_optimizer.py +++ b/python/paddle/v2/framework/tests/test_optimizer.py @@ -110,5 +110,54 @@ class TestAdagradOptimizer(unittest.TestCase): self.assertTrue(mul_x.name in moment_acc) +class TestAdamOptimizer(unittest.TestCase): + class MockAdam(optimizer.AdamOptimizer): + def get_accumulators(self): + return self._accumulators + + def get_moment1_str(self): + return self._moment1_acc_str + + def get_moment2_str(self): + return self._moment2_acc_str + + def test_adam_optimizer(self): + program = framework.Program() + block = program.global_block() + mul_x = block.create_parameter( + dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") + mul_y = block.create_var( + dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") + mul_out = block.create_var( + dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") + block.append_op( + type="mul", + inputs={"X": mul_x, + "Y": mul_y}, + outputs={"Out": mul_out}, + attrs={"x_num_col_dims": 1}) + adam_optimizer = self.MockAdam( + learning_rate=0.01, beta1=0.9, beta2=0.999) + params_grads = adam_optimizer.create_backward_pass(mul_out) + self.assertEqual(len(params_grads), 1) + self.assertEqual(len(adam_optimizer.get_accumulators()), 0) + opts = adam_optimizer.create_optimization_pass(params_grads, mul_out) + self.assertEqual(len(opts), 3) + adam_op = opts[0] + self.assertEqual(adam_op.type, "adam") + + # Check accumulators + accumulators = adam_optimizer.get_accumulators() + self.assertEqual(len(accumulators), 2) + self.assertTrue(adam_optimizer.get_moment1_str() in accumulators) + self.assertTrue(adam_optimizer.get_moment2_str() in accumulators) + moment1_acc = accumulators[adam_optimizer.get_moment1_str()] + moment2_acc = accumulators[adam_optimizer.get_moment2_str()] + self.assertEqual(len(moment1_acc), 1) + self.assertEqual(len(moment2_acc), 1) + self.assertTrue(mul_x.name in moment1_acc) + self.assertTrue(mul_x.name in moment2_acc) + + if __name__ == '__main__': unittest.main()