Commit 0e31d7d7 authored by Abhinav Arora, committed by GitHub

Adding the interface for the momentum optimizer (#4919)

* Adding the interface for the momentum optimizer
* Adding a comment about accumulators
Parent 37bfd03f
import paddle.v2.framework.framework as framework
from collections import defaultdict

__all__ = ['SGDOptimizer', 'MomentumOptimizer']


class Optimizer(object):
    """Optimizer Base class.

    Define the common interface of an optimizer.
    User should not use this class directly,
    but need to use one of its implementations.
    """

    def __init__(self):
        # Dictionary of accumulators. Some optimizer subclasses need to
        # allocate and manage extra variables associated with the parameters
        # to train. These variables are called accumulators.
        # {accum_name : { parameter_name : accumulator_for_parameter, ...}, ...}
        self._accumulators = defaultdict(lambda: dict())

    def _append_optimize_op(self, block, param_and_grad):
        """ append optimize operator to block and return all the added optimize_op
        """
        raise NotImplementedError()
    def _initialize_tensors(self, block):
        """Create all necessary tensors, that will be shared for all parameter updates.
        Tensors like learning rate should be initialized here.

        Args:
            block: the block in which the loss variable is present
        """
        pass

    def _create_accumulators(self, block, parameters):
        """Create all accumulators needed by the parameters

        Args:
            block: the block in which the loss variable is present
            parameters: list of parameter variables for the optimizer
        """
        pass
    def _add_accumulator(self, block, name, param, dtype=None, fill_value=0.0):
        """Utility function to add an accumulator for a parameter

        Args:
            block: the block in which the loss variable is present
            name: name of the accumulator
            param: parameter variable for which accumulator is to be added
            dtype: data type of the accumulator variable
            fill_value: value to initialize the accumulator variable
        """
        if (name in self._accumulators and
                param.name in self._accumulators[name]):
            raise Exception("Accumulator {} already exists for parameter {}".
                            format(name, param.name))
        global_block = block.program.global_block()
        param_shape = list(param.shape)
        param_acc = global_block.create_var(
            dtype=dtype, shape=param_shape, lod_level=0)

        # Initialize the accumulator with fill_value
        # FIXME: Fix when Initialization design has been implemented
        # https://github.com/PaddlePaddle/Paddle/pull/4852
        global_block.append_op(
            type="fill_constant",
            outputs={"Out": param_acc},
            attrs={"shape": param_shape,
                   "value": fill_value})

        # Add to accumulators dict
        self._accumulators[name][param.name] = param_acc

    def _get_accumulator(self, name, param):
        """Utility function to fetch an accumulator for a parameter

        Args:
            name: name of the accumulator
            param: parameter variable for which accumulator is to be fetched

        Returns:
            accumulator variable for the parameter
        """
        if (name not in self._accumulators or
                param.name not in self._accumulators[name]):
            raise Exception("Accumulator {} does not exist for parameter {}".
                            format(name, param.name))
        return self._accumulators[name][param.name]
    def create_backward_pass(self, loss, parameter_list=None, no_grad_set=None):
        """Create and add gradient Operators in BlockDesc to compute
        gradients of `loss` for parameters in parameter_list

        Args:
            loss: a variable generated by the cost function.
            no_grad_set: variables for which gradients should not be created.
            parameter_list: parameters that need to compute gradients and
                update to optimize the loss.

        Returns:
            a list of (parameter, gradient) pairs.
@@ -48,7 +120,8 @@ class Optimizer(object):
            if not grad_block.has_var(grad_info[0]):
                raise Exception("grad block[%d] did not have grad var %s" %
                                (grad_info[1], grad_info[0]))
            # Get the param var from the global block
            param_var = loss.block.program.global_block().var(param)
            grad_var = grad_block.var(grad_info[0])
            if loss.block.has_var(grad_info[0]):
                params_and_grads.append((param_var, grad_var))
@@ -64,14 +137,29 @@ class Optimizer(object):
            parameters_and_grads: a list of (variable, gradient) pair to update.

        Returns:
            optimization_op_list: a list of optimization operators that will
                update parameters using gradients.
        """
        # This is a default implementation of create_optimization_pass that
        # can be shared by most optimizers. This implementation assumes that
        # the subclass will implement the _append_optimize_op method and the
        # _initialize_tensors method. The subclass can extend the
        # _create_accumulators method if it needs to create accumulators
        # for parameters.

        # Create any accumulators
        self._create_accumulators(loss.block,
                                  [p[0] for p in parameters_and_grads])
        # Create any necessary tensors
        self._initialize_tensors(loss.block)

        optimize_ops = []
        for param_and_grad in parameters_and_grads:
            if param_and_grad[1] is not None:
                optimize_op = self._append_optimize_op(loss.block,
                                                       param_and_grad)
                optimize_ops.append(optimize_op)

        return optimize_ops

    def minimize(self, loss, parameter_list=None, no_grad_set=None):
@@ -92,33 +180,95 @@ class SGDOptimizer(Optimizer):

    def __init__(self, learning_rate):
        assert learning_rate is not None
        super(SGDOptimizer, self).__init__()
        self.type = "sgd"
        self._learning_rate = learning_rate

    def _initialize_tensors(self, block):
        assert isinstance(block, framework.Block)
        lr_shape = [1]
        # create a variable for learning_rate
        self._lr = block.create_var(
            dtype="float32", shape=lr_shape, lod_level=0)

        # create an op to init the learning_rate
        # FIXME: Fix when Initialization design has been implemented
        # https://github.com/PaddlePaddle/Paddle/pull/4852
        block.append_op(
            type="fill_constant",
            outputs={"Out": self._lr},
            attrs={"shape": lr_shape,
                   "value": self._learning_rate})

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)
        # create the optimize op
        sgd_op = block.append_op(
            type=self.type,
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "LearningRate": self._lr
            },
            outputs={"ParamOut": param_and_grad[0]})
        return sgd_op

class MomentumOptimizer(Optimizer):
    """Simple Momentum optimizer with velocity state
    """
    _velocity_acc_str = "velocity"

    def __init__(self, learning_rate, momentum):
        assert learning_rate is not None
        assert momentum is not None
        super(MomentumOptimizer, self).__init__()
        self.type = "momentum"
        self._learning_rate = learning_rate
        self._momentum = momentum

    def _initialize_tensors(self, block):
        assert isinstance(block, framework.Block)
        lr_shape = [1]
        # create a variable for learning_rate
        self._lr = block.create_var(
            dtype="float32", shape=lr_shape, lod_level=0)

        # create an op to init the learning_rate
        # FIXME: Fix when Initialization design has been implemented
        # https://github.com/PaddlePaddle/Paddle/pull/4852
        block.append_op(
            type="fill_constant",
            outputs={"Out": self._lr},
            attrs={"shape": lr_shape,
                   "value": self._learning_rate})

    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

        for p in parameters:
            self._add_accumulator(block, self._velocity_acc_str, p, 'float32')

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)

        velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                             param_and_grad[0])

        # create the momentum optimize op
        momentum_op = block.append_op(
            type=self.type,
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "Velocity": velocity_acc,
                "LearningRate": self._lr
            },
            outputs={
                "ParamOut": param_and_grad[0],
                "VelocityOut": velocity_acc
            },
            attrs={"mu": self._momentum})
        return momentum_op
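For readers skimming the diff, here is a minimal sketch of how the new MomentumOptimizer is intended to be driven. It is not part of the commit; it mirrors the unit test added below (the mul program, then create_backward_pass followed by create_optimization_pass), and the momentum value 0.9 is purely illustrative.

import paddle.v2.framework.framework as framework
import paddle.v2.framework.optimizer as optimizer

# Build a tiny program: mul_out = mul(mul_x, mul_y), with mul_x as the parameter.
program = framework.Program()
block = program.global_block()
mul_x = block.create_parameter(
    dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
mul_y = block.create_var(
    dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
mul_out = block.create_var(
    dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
block.append_op(
    type="mul",
    inputs={"X": mul_x,
            "Y": mul_y},
    outputs={"Out": mul_out},
    attrs={"x_num_col_dims": 1})

# The backward pass returns (parameter, gradient) pairs; the optimization pass
# then creates the velocity accumulator and appends one momentum op per parameter.
momentum_optimizer = optimizer.MomentumOptimizer(learning_rate=0.01, momentum=0.9)
params_grads = momentum_optimizer.create_backward_pass(mul_out)
opts = momentum_optimizer.create_optimization_pass(params_grads, mul_out)

The base class's minimize presumably combines these two calls, as its signature suggests, but its body is outside the hunks shown here.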
@@ -6,7 +6,7 @@ import paddle.v2.framework.optimizer as optimizer

class TestOptimizer(unittest.TestCase):
    def test_sgd_optimizer(self):
        program = framework.Program()
        block = program.global_block()
        mul_x = block.create_parameter(
            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
@@ -14,7 +14,7 @@ class TestOptimizer(unittest.TestCase):
            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
        mul_out = block.create_var(
            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
        block.append_op(
            type="mul",
            inputs={"X": mul_x,
                    "Y": mul_y},
@@ -27,5 +27,47 @@ class TestOptimizer(unittest.TestCase):
        self.assertEqual(sgd_op.type, "sgd")


class TestMomentumOptimizer(unittest.TestCase):
    class MockMomentum(optimizer.MomentumOptimizer):
        def get_accumulators(self):
            return self._accumulators

        def get_velocity_str(self):
            return self._velocity_acc_str

    def test_momentum_optimizer(self):
        program = framework.Program()
        block = program.global_block()
        mul_x = block.create_parameter(
            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
        mul_y = block.create_var(
            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
        mul_out = block.create_var(
            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
        block.append_op(
            type="mul",
            inputs={"X": mul_x,
                    "Y": mul_y},
            outputs={"Out": mul_out},
            attrs={"x_num_col_dims": 1})
        momentum_optimizer = self.MockMomentum(learning_rate=0.01, momentum=0.2)
        params_grads = momentum_optimizer.create_backward_pass(mul_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
        opts = momentum_optimizer.create_optimization_pass(params_grads,
                                                           mul_out)
        self.assertEqual(len(opts), 1)
        sgd_op = opts[0]
        self.assertEqual(sgd_op.type, "momentum")

        # Check accumulators
        accumulators = momentum_optimizer.get_accumulators()
        self.assertEqual(len(accumulators), 1)
        self.assertTrue(momentum_optimizer.get_velocity_str() in accumulators)
        velocity_acc = accumulators[momentum_optimizer.get_velocity_str()]
        self.assertEqual(len(velocity_acc), 1)
        self.assertTrue(mul_x.name in velocity_acc)


if __name__ == '__main__':
    unittest.main()
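To make the accumulator bookkeeping concrete, the short sketch below (not part of the commit) shows what the base class's _accumulators mapping holds once test_momentum_optimizer has run the optimization pass; it reuses momentum_optimizer and mul_x from that test.

# The mapping has the shape {accum_name: {parameter_name: accumulator_variable}},
# e.g. {"velocity": {"mul.x": <velocity Variable shaped like mul.x>}}.
accumulators = momentum_optimizer.get_accumulators()
velocity_acc = accumulators[momentum_optimizer.get_velocity_str()]
assert mul_x.name in velocity_acc  # one velocity accumulator per trained parameter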