Commit 0e31d7d7 authored by Abhinav Arora, committed by GitHub

Adding the interface for the momentum optimizer (#4919)

* Adding the interface for the momentum optimizer
* Adding a comment about accumulators
Parent 37bfd03f
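The snippet below is a minimal usage sketch of the new interface, pieced together from the unit test added in this commit; the program/block setup mirrors that test and is illustrative only, not part of the diff.

import paddle.v2.framework.framework as framework
import paddle.v2.framework.optimizer as optimizer

# Build a tiny program with one parameter so there is something to optimize.
program = framework.Program()
block = program.global_block()
mul_x = block.create_parameter(
    dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
mul_y = block.create_var(
    dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
mul_out = block.create_var(
    dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
block.append_op(
    type="mul",
    inputs={"X": mul_x, "Y": mul_y},
    outputs={"Out": mul_out},
    attrs={"x_num_col_dims": 1})

# Append the gradient ops, then the momentum update ops, to the program.
momentum_optimizer = optimizer.MomentumOptimizer(learning_rate=0.01, momentum=0.2)
params_grads = momentum_optimizer.create_backward_pass(mul_out)
opts = momentum_optimizer.create_optimization_pass(params_grads, mul_out)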
import paddle.v2.framework.framework as framework
from collections import defaultdict
__all__ = ['SGDOptimizer', 'MomentumOptimizer']
class Optimizer(object):
"""Optimizer Base class.
Define the common interface of an optimizer.
Users should not use this class directly,
but should use one of its implementations instead.
"""
def __init__(self):
# Dictionary of accumulators. Some optimizer subclasses need to
# allocate and manage extra variables associated with the parameters
# to train. These variables are called accumulators.
# {accum_name : { parameter_name : accumulator_for_parameter, ...}, ...}
self._accumulators = defaultdict(lambda: dict())
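# Illustration (added comment, hypothetical parameter names): after
# MomentumOptimizer has created its velocity accumulators for two
# parameters named "w" and "b", this dict would look like
#   {"velocity": {"w": <velocity Variable for w>,
#                 "b": <velocity Variable for b>}}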
def _append_optimize_op(self, block, param_and_grad):
""" append optimize operator to block and return all the added optimize_op
"""
raise NotImplementedError()
def _initialize_tensors(self, block):
"""Create all necessary tensors, that will be shared for all parameter updates.
Tensors like learning rate should be initialized here.
Args:
block: the block in which the loss variable is present
"""
pass
def _create_accumulators(self, block, parameters):
"""Create all accumulators needed by the parameters
Args:
block: the block in which the loss variable is present
parameters: list of parameter variables for the optimizer
"""
pass
def _add_accumulator(self, block, name, param, dtype=None, fill_value=0.0):
"""Utility function to add an accumulator for a parameter
Args:
block: the block in which the loss variable is present
name: name of the accumulator
param: parameter variable for which accumulator is to be added
dtype: data type of the accumulator variable
fill_value: value to initialize the accumulator variable
"""
if (name in self._accumulators and
param.name in self._accumulators[name]):
raise Exception("Accumulator {} already exists for parmeter {}".
format(name, param.name))
global_block = block.program.global_block()
param_shape = list(param.shape)
param_acc = global_block.create_var(
dtype=dtype, shape=param_shape, lod_level=0)
# Initialize the accumulator with fill_value
# FIXME: Fix when Initialization design has been implemented
# https://github.com/PaddlePaddle/Paddle/pull/4852
global_block.append_op(
type="fill_constant",
outputs={"Out": param_acc},
attrs={"shape": param_shape,
"value": fill_value})
# Add to accumulators dict
self._accumulators[name][param.name] = param_acc
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if (name not in self._accumulators or
param.name not in self._accumulators[name]):
raise Exception("Accumulator {} does not exist for parameter {}".
format(name, param.name))
return self._accumulators[name][param.name]
def create_backward_pass(self, loss, parameter_list=None, no_grad_set=None):
"""Create and add gradient Operators in BlockDesc to compute
gradients of `loss` for parameters in parameter_list
Args:
loss: a variable generated by the cost function.
no_grad_set: variables for which no gradient should be created.
parameter_list: parameters for which gradients need to be computed and
that will be updated to optimize the loss.
Returns:
list of (parameter, gradient) pairs.
@@ -48,7 +120,8 @@ class Optimizer(object):
if not grad_block.has_var(grad_info[0]):
raise Exception("grad block[%d] did not have grad var %s" %
grad_info[1], grad_info[0])
# Get the param var from the global block
param_var = loss.block.program.global_block().var(param)
grad_var = grad_block.var(grad_info[0])
if loss.block.has_var(grad_info[0]):
params_and_grads.append((param_var, grad_var))
@@ -64,14 +137,29 @@ class Optimizer(object):
parameters_and_grads: a list of (variable, gradient) pairs to update.
Returns:
optimization_op_list: a list of optimization operators that will update
the parameters using their gradients.
"""
# This is a default implementation of create_optimization_pass that
# can be shared by most optimizers. This implementation assumes that
# the subclass will implement the _append_optimize_op method and the
# _initialize_tensors method. The subclass can extend the
# _create_accumulators method if it needs to create accumulators
# for parameters.
# Create any accumulators
self._create_accumulators(loss.block,
[p[0] for p in parameters_and_grads])
# Create any necessary tensors
self._initialize_tensors(loss.block)
optimize_ops = []
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is not None:
optimize_op = self._append_optimize_op(loss.block,
param_and_grad)
optimize_ops.append(optimize_op)
return optimize_ops
def minimize(self, loss, parameter_list=None, no_grad_set=None):
@@ -92,33 +180,95 @@ class SGDOptimizer(Optimizer):
def __init__(self, learning_rate):
assert learning_rate is not None
super(SGDOptimizer, self).__init__()
self.type = "sgd"
self._learning_rate = learning_rate
def _initialize_tensors(self, block):
assert isinstance(block, framework.Block)
lr_shape = [1]
# create a variable for learning_rate
self._lr = block.create_var(
dtype="float32", shape=lr_shape, lod_level=0)
# create an op to init the learning_rate
# FIXME: Fix when Initialization design has been implemented
# https://github.com/PaddlePaddle/Paddle/pull/4852
block.append_op(
type="fill_constant",
outputs={"Out": lr},
outputs={"Out": self._lr},
attrs={"shape": lr_shape,
"value": self._learning_rate})
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
# create the optimize op
sgd_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"LearningRate": lr
"LearningRate": self._lr
},
outputs={"ParamOut": param_and_grad[0]},
attrs={"shape": [1],
"value": self._learning_rate})
outputs={"ParamOut": param_and_grad[0]})
return sgd_op
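# Illustrative note (not part of the diff): the "sgd" operator appended above
# is expected to apply the plain SGD update, roughly
#   param_out = param - learning_rate * grad
# The actual computation is performed by the registered sgd op kernel.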
class MomentumOptimizer(Optimizer):
"""Simple Momentum optimizer with velocity state
"""
_velocity_acc_str = "velocity"
def __init__(self, learning_rate, momentum):
assert learning_rate is not None
assert momentum is not None
super(MomentumOptimizer, self).__init__()
self.type = "momentum"
self._learning_rate = learning_rate
self._momentum = momentum
def _initialize_tensors(self, block):
assert isinstance(block, framework.Block)
lr_shape = [1]
# create a variable for learning_rate
self._lr = block.create_var(
dtype="float32", shape=lr_shape, lod_level=0)
# create an op to init the learning_rate
# FIXME: Fix when Initialization design has been implemented
# https://github.com/PaddlePaddle/Paddle/pull/4852
block.append_op(
type="fill_constant",
outputs={"Out": self._lr},
attrs={"shape": lr_shape,
"value": self._learning_rate})
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
for p in parameters:
self._add_accumulator(block, self._velocity_acc_str, p, 'float32')
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
velocity_acc = self._get_accumulator(self._velocity_acc_str,
param_and_grad[0])
# create the momentum optimize op
momentum_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"Velocity": velocity_acc,
"LearningRate": self._lr
},
outputs={
"ParamOut": param_and_grad[0],
"VelocityOut": velocity_acc
},
attrs={"mu": self._momentum})
return momentum_op
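# Illustrative note (not part of the diff): assuming the standard momentum
# formulation, the "momentum" operator appended above is expected to compute
# roughly
#   velocity_out = mu * velocity + grad
#   param_out = param - learning_rate * velocity_out
# where mu is the "momentum" attribute passed via attrs, and the velocity
# accumulator created in _create_accumulators carries this state between steps.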
@@ -6,7 +6,7 @@ import paddle.v2.framework.optimizer as optimizer
class TestOptimizer(unittest.TestCase):
def test_sgd_optimizer(self):
program = framework.Program()
block = program.global_block()
mul_x = block.create_parameter(
dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
@@ -14,7 +14,7 @@ class TestOptimizer(unittest.TestCase):
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
mul_out = block.create_var(
dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
block.append_op(
type="mul",
inputs={"X": mul_x,
"Y": mul_y},
@@ -27,5 +27,47 @@ class TestOptimizer(unittest.TestCase):
self.assertEqual(sgd_op.type, "sgd")
class TestMomentumOptimizer(unittest.TestCase):
class MockMomentum(optimizer.MomentumOptimizer):
def get_accumulators(self):
return self._accumulators
def get_velocity_str(self):
return self._velocity_acc_str
def test_momentum_optimizer(self):
program = framework.Program()
block = program.global_block()
mul_x = block.create_parameter(
dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
mul_y = block.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
mul_out = block.create_var(
dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
block.append_op(
type="mul",
inputs={"X": mul_x,
"Y": mul_y},
outputs={"Out": mul_out},
attrs={"x_num_col_dims": 1})
momentum_optimizer = self.MockMomentum(learning_rate=0.01, momentum=0.2)
params_grads = momentum_optimizer.create_backward_pass(mul_out)
self.assertEqual(len(params_grads), 1)
self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
opts = momentum_optimizer.create_optimization_pass(params_grads,
mul_out)
self.assertEqual(len(opts), 1)
momentum_op = opts[0]
self.assertEqual(momentum_op.type, "momentum")
# Check accumulators
accumulators = momentum_optimizer.get_accumulators()
self.assertEqual(len(accumulators), 1)
self.assertTrue(momentum_optimizer.get_velocity_str() in accumulators)
velocity_acc = accumulators[momentum_optimizer.get_velocity_str()]
self.assertEqual(len(velocity_acc), 1)
self.assertTrue(mul_x.name in velocity_acc)
if __name__ == '__main__':
unittest.main()