diff --git a/python/paddle/v2/framework/optimizer.py b/python/paddle/v2/framework/optimizer.py
index e356a7aadb8d6a87d0fe54a5dd2a11fea0d80a74..f992a42c40a6e9e76fd7d0b7ecf9586f01fab645 100644
--- a/python/paddle/v2/framework/optimizer.py
+++ b/python/paddle/v2/framework/optimizer.py
@@ -1,32 +1,104 @@
 import paddle.v2.framework.framework as framework
+from collections import defaultdict
 
-__all__ = ['SGDOptimizer']
+__all__ = ['SGDOptimizer', 'MomentumOptimizer']
 
 
 class Optimizer(object):
     """Optimizer Base class.
 
     Define the common interface of an optimizer.
-    User should not use this class directly, but need to use one of it's implementation.
+    Users should not use this class directly,
+    but should use one of its implementations.
     """
 
     def __init__(self):
-        pass
+        # Dictionary of accumulators. Some optimizer subclasses need to
+        # allocate and manage extra variables associated with the parameters
+        # to train. These variables are called accumulators.
+        # {accum_name: {parameter_name: accumulator_for_parameter, ...}, ...}
+        self._accumulators = defaultdict(dict)
 
     def _append_optimize_op(self, block, param_and_grad):
         """ append optimize operator to block and return all the added optimize_op
         """
         raise NotImplementedError()
 
-    def create_backward_pass(self, loss, parameter_list=None, no_grad_set=None):
+    def _initialize_tensors(self, block):
+        """Create the tensors that are shared by all parameter updates.
+
+        Subclasses initialize tensors like the learning rate here.
+
+        Args:
+            block: the block in which the loss variable is present
+        """
+        pass
+
+    def _create_accumulators(self, block, parameters):
+        """Create the per-parameter accumulators; does nothing by default.
+
+        Args:
+            block: the block in which the loss variable is present
+            parameters: list of parameter variables for the optimizer
         """
-        create and add gradient Operators in BlockDesc to Compute gradients of `loss`
-        for parameters in parameter_list
+        pass
+
+    def _add_accumulator(self, block, name, param, dtype=None, fill_value=0.0):
+        """Utility function to add an accumulator for a parameter
+
+        The accumulator takes the shape of `param`; it is created and filled
+        with `fill_value` in the global block of the program owning `block`.
+
+        Args:
+            block: the block in which the loss variable is present
+            name: name of the accumulator
+            param: parameter variable for which accumulator is to be added
+            dtype: data type of the accumulator variable
+            fill_value: value to initialize the accumulator variable
+
+        Raises:
+            Exception: if `param` already has an accumulator called `name`
+        """
+        if param.name in self._accumulators.get(name, {}):
+            raise Exception("Accumulator {} already exists for parameter {}".
+                            format(name, param.name))
+        global_block = block.program.global_block()
+        param_shape = list(param.shape)
+        param_acc = global_block.create_var(
+            dtype=dtype, shape=param_shape, lod_level=0)
+        # Initialize the accumulator with fill_value
+        # FIXME: Fix when Initialization design has been implemented
+        # https://github.com/PaddlePaddle/Paddle/pull/4852
+        global_block.append_op(
+            type="fill_constant", outputs={"Out": param_acc},
+            attrs={"shape": param_shape, "value": fill_value})
+        self._accumulators[name][param.name] = param_acc
+
+    def _get_accumulator(self, name, param):
+        """Utility function to look up the accumulator of a parameter
+
+        Args:
+            name: name of the accumulator
+            param: parameter variable whose accumulator is wanted
+
+        Returns:
+            accumulator variable for the parameter; an Exception is raised
+            if no accumulator called `name` was added for `param`
+        """
+        if param.name not in self._accumulators.get(name, {}):
+            raise Exception("Accumulator {} does not exist for parameter {}".
+                            format(name, param.name))
+        return self._accumulators[name][param.name]
+
+    def create_backward_pass(self, loss, parameter_list=None, no_grad_set=None):
+        """Create and add gradient Operators in BlockDesc to compute
+        gradients of `loss` for parameters in parameter_list
 
         Args:
           loss: an variable generated by cost function.
           no_grad_set: variable that should not create gradient
-          parameter_list: parameters that need to compute gradient and update to optimize the lost.
+          parameter_list: parameters that need to compute gradient and
+          update to optimize the loss.
 
         Returns:
           list of (parameters, gradients) pair.
@@ -48,7 +120,8 @@ class Optimizer(object):
             if not grad_block.has_var(grad_info[0]):
                 raise Exception("grad block[%d] did not have grad var %s" %
                                 grad_info[1], grad_info[0])
-            param_var = loss.block.var(param)
+            # Get the param var from the global block
+            param_var = loss.block.program.global_block().var(param)
             grad_var = grad_block.var(grad_info[0])
             if loss.block.has_var(grad_info[0]):
                 params_and_grads.append((param_var, grad_var))
@@ -64,14 +137,29 @@ class Optimizer(object):
           parameters_and_grads: a list of (variable, gradient) pair to update.
 
         Returns:
-          optmization_op_list: a list of optimization operator that will update parameter using gradient.
+          optimization_op_list: a list of optimization operators that will
+          update the parameters using their gradients.
         """
+        # This is a default implementation of create_optimization_pass that
+        # can be shared by most optimizers. This implementation assumes that
+        # the subclass will implement the _append_optimize_op method and the
+        # _initialize_tensors method. The subclass can override the
+        # _create_accumulators method if it needs to create accumulators
+        # for parameters.
+
+        # Create any accumulators
+        self._create_accumulators(loss.block,
+                                  [p[0] for p in parameters_and_grads])
+        # Create any necessary tensors
+        self._initialize_tensors(loss.block)
+
         optimize_ops = []
         for param_and_grad in parameters_and_grads:
             if param_and_grad[1] is not None:
                 optimize_op = self._append_optimize_op(loss.block,
                                                        param_and_grad)
                 optimize_ops.append(optimize_op)
+
         return optimize_ops
 
     def minimize(self, loss, parameter_list=None, no_grad_set=None):
@@ -92,33 +180,95 @@ class SGDOptimizer(Optimizer):
 
     def __init__(self, learning_rate):
         assert learning_rate is not None
-        super(Optimizer, self).__init__()
+        super(SGDOptimizer, self).__init__()  # runs Optimizer.__init__
         self.type = "sgd"
         self._learning_rate = learning_rate
 
-    def _append_optimize_op(self, block, param_and_grad):
+    def _initialize_tensors(self, block):
         assert isinstance(block, framework.Block)
         lr_shape = [1]
-        # create a var for learning_rate
-        lr = block.create_var(dtype="float32", shape=lr_shape, lod_level=0)
+        # create the learning-rate variable shared by every optimize op
+        self._lr = block.create_var(
+            dtype="float32", shape=lr_shape, lod_level=0)
 
         # create an op to init the learning_rate
-        init_op = block.append_op(
+        # FIXME: Fix when Initialization design has been implemented
+        # https://github.com/PaddlePaddle/Paddle/pull/4852
+        block.append_op(
             type="fill_constant",
-            outputs={"Out": lr},
+            outputs={"Out": self._lr},
             attrs={"shape": lr_shape,
                    "value": self._learning_rate})
 
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
         # create the optimize op
         sgd_op = block.append_op(
             type=self.type,
             inputs={
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
-                "LearningRate": lr
+                "LearningRate": self._lr  # set by _initialize_tensors
             },
-            outputs={"ParamOut": param_and_grad[0]},
-            attrs={"shape": [1],
-                   "value": self._learning_rate})
+            outputs={"ParamOut": param_and_grad[0]})  # same var as "Param"
 
         return sgd_op
+
+
+class MomentumOptimizer(Optimizer):
+    """Momentum optimizer that keeps one velocity accumulator per parameter
+    """
+    _velocity_acc_str = "velocity"
+
+    def __init__(self, learning_rate, momentum):
+        assert learning_rate is not None
+        assert momentum is not None
+        super(MomentumOptimizer, self).__init__()
+        self.type = "momentum"
+        self._learning_rate = learning_rate
+        self._momentum = momentum
+
+    def _initialize_tensors(self, block):
+        """Create the learning rate variable and the op that fills it
+        """
+        assert isinstance(block, framework.Block)
+        shape = [1]
+        self._lr = block.create_var(dtype="float32", shape=shape, lod_level=0)
+
+        # the variable is filled with the configured rate by a constant op
+        # FIXME: Fix when Initialization design has been implemented
+        # https://github.com/PaddlePaddle/Paddle/pull/4852
+        fill_attrs = {"shape": shape, "value": self._learning_rate}
+        block.append_op(
+            type="fill_constant", outputs={"Out": self._lr}, attrs=fill_attrs)
+
+    def _create_accumulators(self, block, parameters):
+        """Add a float32 velocity accumulator for each parameter
+        """
+        assert isinstance(block, framework.Block)
+
+        for param in parameters:
+            self._add_accumulator(
+                block, self._velocity_acc_str, param, 'float32')
+
+    def _append_optimize_op(self, block, param_and_grad):
+        """Append the momentum op for one (parameter, gradient) pair
+        """
+        assert isinstance(block, framework.Block)
+
+        param = param_and_grad[0]
+        velocity = self._get_accumulator(self._velocity_acc_str, param)
+        op_inputs = {
+            "Param": param,
+            "Grad": param_and_grad[1],
+            "Velocity": velocity,
+            "LearningRate": self._lr
+        }
+        # the op writes to the same parameter and velocity variables it reads
+        op_outputs = {"ParamOut": param, "VelocityOut": velocity}
+        return block.append_op(
+            type=self.type,
+            inputs=op_inputs,
+            outputs=op_outputs,
+            attrs={"mu": self._momentum})
diff --git a/python/paddle/v2/framework/tests/test_optimizer.py b/python/paddle/v2/framework/tests/test_optimizer.py
index 3d6fa70737bf360df53785dc602feceda471ee70..e6a142ac361b572c8df42dbb5cd1b116584ed324 100644
--- a/python/paddle/v2/framework/tests/test_optimizer.py
+++ b/python/paddle/v2/framework/tests/test_optimizer.py
@@ -6,7 +6,7 @@ import paddle.v2.framework.optimizer as optimizer
 
 class TestOptimizer(unittest.TestCase):
     def test_sgd_optimizer(self):
-        program = framework.g_program
+        program = framework.Program()
         block = program.global_block()
         mul_x = block.create_parameter(
             dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
@@ -14,7 +14,7 @@ class TestOptimizer(unittest.TestCase):
             dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
         mul_out = block.create_var(
             dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        mul_op = block.append_op(
+        block.append_op(
             type="mul",
             inputs={"X": mul_x,
                     "Y": mul_y},
@@ -27,5 +27,47 @@ class TestOptimizer(unittest.TestCase):
         self.assertEqual(sgd_op.type, "sgd")
 
 
+class TestMomentumOptimizer(unittest.TestCase):
+    class MockMomentum(optimizer.MomentumOptimizer):
+        """Exposes the optimizer's private accumulator state to the test"""
+        def get_accumulators(self):
+            return self._accumulators
+
+        def get_velocity_str(self):
+            return self._velocity_acc_str
+
+    def test_momentum_optimizer(self):
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        momentum_optimizer = self.MockMomentum(learning_rate=0.01, momentum=0.2)
+        params_grads = momentum_optimizer.create_backward_pass(mul_out)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
+        opts = momentum_optimizer.create_optimization_pass(params_grads,
+                                                           mul_out)
+        self.assertEqual(len(opts), 1)
+        self.assertEqual(opts[0].type, "momentum")
+
+        # Check accumulators: one velocity entry, keyed by the parameter name
+        accumulators = momentum_optimizer.get_accumulators()
+        self.assertEqual(len(accumulators), 1)
+        self.assertIn(momentum_optimizer.get_velocity_str(), accumulators)
+        velocity_acc = accumulators[momentum_optimizer.get_velocity_str()]
+        self.assertEqual(len(velocity_acc), 1)
+        self.assertIn(mul_x.name, velocity_acc)
+
+
 if __name__ == '__main__':
     unittest.main()