Unverified · Commit f48159ad · Authored by: Qiao Longfei · Committed by: GitHub

Optimizer use init program (#5275)

* optimizer use init_program

* create persistable variable

* add create_persistable_var to block

* optimizer use create_persistable_var

* fix prefix

* move create_global_persistable_var from Block to LayerHelper

* Polish Optimizer initialization code.

* Using the LayerHelper to create initialize operator and variables

* add_accumulator should use an independent data type

* default use param data type for accumulator
Parent: 90f4d5e9
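In short, Optimizer.minimize() now also receives an init_program: the optimizer builds its learning-rate variable and accumulators as persistable global variables through a LayerHelper and records their initializers in init_program, instead of appending ad-hoc fill_constant ops to the training program. A minimal usage sketch, assembled from the book tests updated in this diff (the layer calls and executor wiring mirror the fit-a-line test of this era and are illustrative, not guaranteed verbatim):

    import paddle.v2.framework.core as core
    import paddle.v2.framework.executor as executor
    import paddle.v2.framework.layers as layers
    import paddle.v2.framework.optimizer as optimizer
    from paddle.v2.framework.framework import Program

    init_program = Program()  # receives the fill_constant / initializer ops
    program = Program()       # receives forward, backward and optimize ops

    x = layers.data(name='x', shape=[13], data_type='float32',
                    program=program, init_program=init_program)
    y = layers.data(name='y', shape=[1], data_type='float32',
                    program=program, init_program=init_program)
    y_predict = layers.fc(input=x, size=1,
                          program=program, init_program=init_program)
    cost = layers.square_error_cost(input=y_predict, label=y,
                                    program=program, init_program=init_program)
    avg_cost = layers.mean(x=cost, program=program, init_program=init_program)

    sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
    # minimize() now threads init_program through create_optimization_pass(),
    # so the learning-rate variable and the accumulators are created as
    # persistable variables and initialized in init_program.
    opts = sgd_optimizer.minimize(avg_cost, init_program)

    exe = executor.Executor(core.CPUPlace())
    exe.run(init_program, feed={}, fetch_list=[])  # run initialization once
    # afterwards, run `program` once per mini-batch as in the book tests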
@@ -7,6 +7,11 @@ import copy
 __all__ = ['Block', 'Variable', 'Program', 'Operator']
 
 
+def unique_name(prefix):
+    uid = core.unique_integer(prefix)  # unique during whole process.
+    return "_".join([prefix, str(uid)])
+
+
 class Variable(object):
     def __init__(self,
                  block,
...
 import copy
 import itertools
 
-import paddle.v2.framework.core as core
-
 from paddle.v2.framework.framework import Variable, g_program, \
-    g_init_program
+    g_init_program, unique_name, Program
 from paddle.v2.framework.initializer import ConstantInitializer, \
     UniformInitializer
 
-
-def unique_name(prefix):
-    uid = core.unique_integer(prefix)  # unique during whole process.
-    return "_".join([prefix, str(uid)])
-
 
 class LayerHelper(object):
     def __init__(self, layer_type, **kwargs):
         self.kwargs = kwargs
@@ -138,9 +131,19 @@ class LayerHelper(object):
     def create_variable(self, *args, **kwargs):
         return self.program.current_block().create_var(*args, **kwargs)
 
-    def create_global_variable(self, *args, **kwargs):
+    def create_global_variable(self, persistable=False, *args, **kwargs):
         return self.program.global_block().create_var(
-            *args, persistable=False, **kwargs)
+            *args, persistable=persistable, **kwargs)
+
+    def set_variable_initializer(self, var, initializer):
+        assert isinstance(var, Variable)
+        self.init_program.global_block().create_var(
+            name=var.name,
+            type=var.type,
+            dtype=var.data_type,
+            shape=var.shape,
+            persistable=True,
+            initializer=initializer)
 
     def append_bias_op(self, input_var, num_flatten_dims=None):
         """
...
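Together, create_global_variable(persistable=True, ...) and the new set_variable_initializer() give optimizers one pattern for persistable state: the variable itself lives in the main program's global block, while a mirror variable carrying the initializer is recorded in init_program. A small illustrative wrapper (hypothetical helper name, not part of this commit) showing the pattern that _add_accumulator in optimizer.py follows below:

    from paddle.v2.framework.framework import unique_name
    from paddle.v2.framework.initializer import ConstantInitializer

    def create_persistable_state(helper, name, param, dtype=None, fill_value=0.0):
        # `helper` is a LayerHelper constructed with program= and init_program=.
        var = helper.create_global_variable(
            name=unique_name(name),          # e.g. "velocity_3"
            persistable=True,                # survives across mini-batches
            dtype=dtype or param.data_type,  # default: reuse the parameter dtype
            type=param.type,
            shape=param.shape)
        # Record the constant initializer in init_program's global block;
        # the actual fill happens when init_program is executed once.
        helper.set_variable_initializer(
            var, initializer=ConstantInitializer(value=float(fill_value)))
        return var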
 from collections import defaultdict
 
 import paddle.v2.framework.framework as framework
+from paddle.v2.framework.framework import unique_name, Program
 from paddle.v2.framework.backward import append_backward_ops
+from paddle.v2.framework.initializer import ConstantInitializer
 from paddle.v2.framework.regularizer import append_regularization_ops
+from paddle.v2.framework.layer_helper import LayerHelper
 
 __all__ = [
     'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
@@ -25,6 +28,7 @@ class Optimizer(object):
         # to train. These variables are called accumulators.
         # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
         self._accumulators = defaultdict(lambda: dict())
+        self.helper = None
 
     def _append_optimize_op(self, block, param_and_grad):
         """ append optimize operator to block and return all the added optimize_op
@@ -63,7 +67,7 @@ class Optimizer(object):
         """
         pass
 
-    def _add_accumulator(self, block, name, param, dtype=None, fill_value=0.0):
+    def _add_accumulator(self, name, param, dtype=None, fill_value=0.0):
         """Utility function to add an accumulator for a parameter
 
         Args:
@@ -77,22 +81,17 @@ class Optimizer(object):
                 param.name in self._accumulators[name]):
             raise Exception("Accumulator {} already exists for parmeter {}".
                             format(name, param.name))
-        global_block = block.program.global_block()
-        param_shape = list(param.shape)
-        param_acc = global_block.create_var(
-            dtype=dtype, shape=param_shape, lod_level=0)
-
-        # Initialize the accumulator with fill_value
-        # FIXME: Fix when Initialization design has been implemented
-        # https://github.com/PaddlePaddle/Paddle/pull/4852
-        global_block.append_op(
-            type="fill_constant",
-            outputs={"Out": param_acc},
-            attrs={"shape": param_shape,
-                   "value": fill_value})
-
-        # Add to accumulators dict
-        self._accumulators[name][param.name] = param_acc
+        assert isinstance(self.helper, LayerHelper)
+        var = self.helper.create_global_variable(
+            name=unique_name(name),
+            persistable=True,
+            dtype=dtype or param.data_type,
+            type=param.type,
+            shape=param.shape)
+        self.helper.set_variable_initializer(
+            var, initializer=ConstantInitializer(value=float(fill_value)))
+        self._accumulators[name][param.name] = var
 
     def _get_accumulator(self, name, param):
         """Utility function to fetch an accumulator for a parameter
@@ -130,7 +129,10 @@ class Optimizer(object):
 
         return increment_op
 
-    def create_optimization_pass(self, parameters_and_grads, loss):
+    def create_optimization_pass(self,
+                                 parameters_and_grads,
+                                 loss,
+                                 init_program=None):
         """Add optimization operators to update gradients to variables.
 
         Args:
@@ -142,6 +144,7 @@ class Optimizer(object):
             optimization. This will include parameter update ops, global step
             update ops and any other custom ops required by subclasses to manage
             their internal state.
+        :param init_program:
         """
         # This is a default implementation of create_optimization_pass that
         # can be shared by most optimizers. This implementation assumes that
@@ -151,6 +154,9 @@ class Optimizer(object):
         # for parameters and extend _finish_update method to add custom ops.
 
         # Create any accumulators
+        program = loss.block.program
+        self.helper = LayerHelper(
+            self.__class__.__name__, program=program, init_program=init_program)
         self._create_accumulators(loss.block,
                                   [p[0] for p in parameters_and_grads])
         # Create any necessary tensors
@@ -177,7 +183,11 @@ class Optimizer(object):
             return_ops.append(self._increment_global_step(loss.block))
         return return_ops
 
-    def minimize(self, loss, parameter_list=None, no_grad_set=None):
+    def minimize(self,
+                 loss,
+                 init_program=None,
+                 parameter_list=None,
+                 no_grad_set=None):
         """Add operations to minimize `loss` by updating `parameter_list`.
 
         This method combines interface `append_backward_ops()` and
@@ -187,7 +197,8 @@ class Optimizer(object):
                                              set())
         # Add regularization if any
         params_grads = append_regularization_ops(params_grads)
-        optimize_ops = self.create_optimization_pass(params_grads, loss)
+        optimize_ops = self.create_optimization_pass(params_grads, loss,
+                                                     init_program)
         return optimize_ops
@@ -202,24 +213,19 @@ class SGDOptimizer(Optimizer):
         self._learning_rate = learning_rate
 
     def _initialize_tensors(self, block):
-        assert isinstance(block, framework.Block)
         lr_shape = [1]
         # create a variable for learning_rate
-        self._lr = block.create_var(
-            dtype="float32", shape=lr_shape, lod_level=0)
-
-        # create an op to init the learning_rate
-        # FIXME: Fix when Initialization design has been implemented
-        # https://github.com/PaddlePaddle/Paddle/pull/4852
-        block.append_op(
-            type="fill_constant",
-            outputs={"Out": self._lr},
-            attrs={"shape": lr_shape,
-                   "value": self._learning_rate})
+        self._lr = self.helper.create_global_variable(
+            name=unique_name("learning_rate"),
+            dtype='float32',
+            shape=lr_shape,
+            lod_level=1,
+            persistable=True)
+        self.helper.set_variable_initializer(
+            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
-
         # create the optimize op
         sgd_op = block.append_op(
             type=self.type,
@@ -255,23 +261,20 @@ class MomentumOptimizer(Optimizer):
         assert isinstance(block, framework.Block)
         lr_shape = [1]
         # create a variable for learning_rate
-        self._lr = block.create_var(
-            dtype="float32", shape=lr_shape, lod_level=0)
-
-        # create an op to init the learning_rate
-        # FIXME: Fix when Initialization design has been implemented
-        # https://github.com/PaddlePaddle/Paddle/pull/4852
-        block.append_op(
-            type="fill_constant",
-            outputs={"Out": self._lr},
-            attrs={"shape": lr_shape,
-                   "value": self._learning_rate})
+        self._lr = self.helper.create_global_variable(
+            name=unique_name("learning_rate"),
+            dtype='float32',
+            shape=lr_shape,
+            lod_level=1,
+            persistable=True)
+        self.helper.set_variable_initializer(
+            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
 
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
         for p in parameters:
-            self._add_accumulator(block, self._velocity_acc_str, p, 'float32')
+            self._add_accumulator(self._velocity_acc_str, p)
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -311,26 +314,22 @@ class AdagradOptimizer(Optimizer):
         self._epsilon = epsilon
 
     def _initialize_tensors(self, block):
-        assert isinstance(block, framework.Block)
         lr_shape = [1]
         # create a variable for learning_rate
-        self._lr = block.create_var(
-            dtype="float32", shape=lr_shape, lod_level=0)
-
-        # create an op to init the learning_rate
-        # FIXME: Fix when Initialization design has been implemented
-        # https://github.com/PaddlePaddle/Paddle/pull/4852
-        block.append_op(
-            type="fill_constant",
-            outputs={"Out": self._lr},
-            attrs={"shape": lr_shape,
-                   "value": self._learning_rate})
+        self._lr = self.helper.create_global_variable(
+            name=unique_name("learning_rate"),
+            dtype='float32',
+            shape=lr_shape,
+            lod_level=1,
+            persistable=True)
+        self.helper.set_variable_initializer(
+            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
 
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
         for p in parameters:
-            self._add_accumulator(block, self._moment_acc_str, p, 'float32')
+            self._add_accumulator(self._moment_acc_str, p)
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -378,51 +377,46 @@ class AdamOptimizer(Optimizer):
         self._epsilon = epsilon
 
     def _initialize_tensors(self, block):
-        assert isinstance(block, framework.Block)
         lr_shape = [1]
         # create a variable for learning_rate
-        self._lr = block.create_var(
-            dtype="float32", shape=lr_shape, lod_level=0)
-
-        # create an op to init the learning_rate
-        # FIXME: Fix when Initialization design has been implemented
-        # https://github.com/PaddlePaddle/Paddle/pull/4852
-        block.append_op(
-            type="fill_constant",
-            outputs={"Out": self._lr},
-            attrs={"shape": lr_shape,
-                   "value": self._learning_rate})
+        self._lr = self.helper.create_global_variable(
+            name=unique_name("learning_rate"),
+            dtype='float32',
+            shape=lr_shape,
+            lod_level=1,
+            persistable=True)
+        self.helper.set_variable_initializer(
+            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
 
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
-        global_block = block.program.global_block()
+        main_block = block.program.global_block()
         # Create beta1 and beta2 power tensors
         beta_shape = [1]
-        # Create variables for beta1 and beta2 powers
-        self._beta1_pow_acc = global_block.create_var(
-            dtype="float32", shape=beta_shape, lod_level=0)
-        self._beta2_pow_acc = global_block.create_var(
-            dtype="float32", shape=beta_shape, lod_level=0)
-
-        # Initialize beta1 and beta2 power accumulators
-        # FIXME: Fix when Initialization design has been implemented
-        # https://github.com/PaddlePaddle/Paddle/pull/4852
-        global_block.append_op(
-            type="fill_constant",
-            outputs={"Out": self._beta1_pow_acc},
-            attrs={"shape": beta_shape,
-                   "value": self._beta1})
-        global_block.append_op(
-            type="fill_constant",
-            outputs={"Out": self._beta2_pow_acc},
-            attrs={"shape": beta_shape,
-                   "value": self._beta2})
+        self._beta1_pow_acc = self.helper.create_global_variable(
+            name=unique_name('beta1_pow_acc'),
+            dtype='float32',
+            shape=beta_shape,
+            lod_level=0,
+            persistable=True)
+        self.helper.set_variable_initializer(
+            self._beta1_pow_acc, initializer=ConstantInitializer(self._beta1))
+
+        self._beta2_pow_acc = self.helper.create_global_variable(
+            name=unique_name('beta2_pow_acc'),
+            dtype='float32',
+            shape=beta_shape,
+            lod_level=0,
+            persistable=True)
+
+        self.helper.set_variable_initializer(
+            self._beta2_pow_acc, initializer=ConstantInitializer(self._beta2))
 
         # Create accumulator tensors for first and second moments
         for p in parameters:
-            self._add_accumulator(block, self._moment1_acc_str, p, 'float32')
-            self._add_accumulator(block, self._moment2_acc_str, p, 'float32')
+            self._add_accumulator(self._moment1_acc_str, p)
+            self._add_accumulator(self._moment2_acc_str, p)
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -460,14 +454,14 @@ class AdamOptimizer(Optimizer):
         """Update Beta1 and Beta2 Power accumulators
         """
         assert isinstance(block, framework.Block)
-        global_block = block.program.global_block()
-        scale_beta1 = global_block.append_op(
+        main_block = block.program.global_block()
+        scale_beta1 = main_block.append_op(
             type="scale",
             inputs={"X": self._beta1_pow_acc},
             outputs={"Out": self._beta1_pow_acc},
             attrs={"scale": self._beta1})
 
-        scale_beta2 = global_block.append_op(
+        scale_beta2 = main_block.append_op(
             type="scale",
             inputs={"X": self._beta2_pow_acc},
             outputs={"Out": self._beta2_pow_acc},
@@ -500,43 +494,33 @@ class AdamaxOptimizer(Optimizer):
         self._epsilon = epsilon
 
     def _initialize_tensors(self, block):
-        assert isinstance(block, framework.Block)
         lr_shape = [1]
         # create a variable for learning_rate
-        self._lr = block.create_var(
-            dtype="float32", shape=lr_shape, lod_level=0)
-
-        # create an op to init the learning_rate
-        # FIXME: Fix when Initialization design has been implemented
-        # https://github.com/PaddlePaddle/Paddle/pull/4852
-        block.append_op(
-            type="fill_constant",
-            outputs={"Out": self._lr},
-            attrs={"shape": lr_shape,
-                   "value": self._learning_rate})
+        self._lr = self.helper.create_global_variable(
+            name=unique_name("learning_rate"),
+            dtype='float32',
+            shape=lr_shape,
+            lod_level=1,
+            persistable=True)
+        self.helper.set_variable_initializer(
+            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
 
     def _create_accumulators(self, block, parameters):
-        assert isinstance(block, framework.Block)
-        global_block = block.program.global_block()
         # Create beta1 power accumulator tensor
         beta_shape = [1]
-        self._beta1_pow_acc = global_block.create_var(
-            dtype="float32", shape=beta_shape, lod_level=0)
-
-        # Initialize beta1 power accumulator
-        # FIXME: Fix when Initialization design has been implemented
-        # https://github.com/PaddlePaddle/Paddle/pull/4852
-        global_block.append_op(
-            type="fill_constant",
-            outputs={"Out": self._beta1_pow_acc},
-            attrs={"shape": beta_shape,
-                   "value": self._beta1})
+        self._beta1_pow_acc = self.helper.create_global_variable(
+            name=unique_name('beta1_pow_acc'),
+            dtype='float32',
+            shape=beta_shape,
+            lod_level=0,
+            persistable=True)
+        self.helper.set_variable_initializer(
+            self._beta1_pow_acc, initializer=ConstantInitializer(self._beta1))
 
         # Create accumulator tensors for first moment and infinity norm
         for p in parameters:
-            self._add_accumulator(block, self._moment_acc_str, p, 'float32')
-            self._add_accumulator(block, self._inf_norm_acc_str, p, 'float32')
+            self._add_accumulator(self._moment_acc_str, p)
+            self._add_accumulator(self._inf_norm_acc_str, p)
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -572,8 +556,8 @@ class AdamaxOptimizer(Optimizer):
         """Update Beta1 Power accumulator
         """
         assert isinstance(block, framework.Block)
-        global_block = block.program.global_block()
-        scale_beta1 = global_block.append_op(
+        main_block = block.program.global_block()
+        scale_beta1 = main_block.append_op(
             type="scale",
             inputs={"X": self._beta1_pow_acc},
             outputs={"Out": self._beta1_pow_acc},
...
@@ -36,7 +36,7 @@ cost = layers.square_error_cost(
 avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
 
 sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost)
+opts = sgd_optimizer.minimize(avg_cost, init_program)
 
 BATCH_SIZE = 20
...
@@ -208,7 +208,7 @@ cost = layers.cross_entropy(
 avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
 
 sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost)
+opts = sgd_optimizer.minimize(avg_cost, init_program)
 
 BATCH_SIZE = 128
 PASS_NUM = 1
...
@@ -44,7 +44,7 @@ class TestBook(unittest.TestCase):
             x=cost, program=program, init_program=init_program)
 
         sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-        opts = sgd_optimizer.minimize(avg_cost)
+        opts = sgd_optimizer.minimize(avg_cost, init_program)
 
         place = core.CPUPlace()
         exe = executor.Executor(place)
...
@@ -7,6 +7,7 @@ from paddle.v2.framework.backward import append_backward_ops
 
 class TestOptimizer(unittest.TestCase):
     def test_sgd_optimizer(self):
+        init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
         mul_x = block.create_parameter(
@@ -22,12 +23,13 @@ class TestOptimizer(unittest.TestCase):
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
         sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
-        opts = sgd_optimizer.minimize(mul_out)
+        opts = sgd_optimizer.minimize(mul_out, init_program)
         self.assertEqual(len(opts), 1)
         sgd_op = opts[0]
         self.assertEqual(sgd_op.type, "sgd")
 
     def test_sgd_optimizer_with_global_step(self):
+        init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
         mul_x = block.create_parameter(
@@ -44,15 +46,22 @@ class TestOptimizer(unittest.TestCase):
             attrs={"x_num_col_dims": 1})
         global_step = block.create_var(
             dtype="float32", shape=[1], lod_level=0, name="step")
+        learning_rate = 0.01
         sgd_optimizer = optimizer.SGDOptimizer(
-            learning_rate=0.01, global_step=global_step)
-        opts = sgd_optimizer.minimize(mul_out)
+            learning_rate=learning_rate, global_step=global_step)
+        opts = sgd_optimizer.minimize(mul_out, init_program)
         self.assertEqual(len(opts), 2)
         sgd_op = opts[0]
         self.assertEqual(sgd_op.type, "sgd")
         increment_op = opts[1]
         self.assertEqual(increment_op.type, "increment")
 
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 1)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+
 
 class TestMomentumOptimizer(unittest.TestCase):
     class MockMomentum(optimizer.MomentumOptimizer):
@@ -63,6 +72,7 @@ class TestMomentumOptimizer(unittest.TestCase):
             return self._velocity_acc_str
 
     def test_vanilla_momentum_optimizer(self):
+        init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
         mul_x = block.create_parameter(
@@ -77,12 +87,14 @@ class TestMomentumOptimizer(unittest.TestCase):
                    "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
-        momentum_optimizer = self.MockMomentum(learning_rate=0.01, momentum=0.2)
+        learning_rate = 0.01
+        momentum_optimizer = self.MockMomentum(
+            learning_rate=learning_rate, momentum=0.2)
         params_grads = append_backward_ops(mul_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer.create_optimization_pass(params_grads,
-                                                           mul_out)
+        opts = momentum_optimizer.create_optimization_pass(
+            params_grads, mul_out, init_program)
         self.assertEqual(len(opts), 1)
         sgd_op = opts[0]
         self.assertEqual(sgd_op.type, "momentum")
@@ -96,7 +108,16 @@ class TestMomentumOptimizer(unittest.TestCase):
         self.assertEqual(len(velocity_acc), 1)
         self.assertTrue(mul_x.name in velocity_acc)
 
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 2)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+        self.assertEqual(init_ops[1].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)
+
     def test_nesterov_momentum_optimizer(self):
+        init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
         mul_x = block.create_parameter(
@@ -111,13 +132,14 @@ class TestMomentumOptimizer(unittest.TestCase):
                    "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        learning_rate = 0.01
         momentum_optimizer = self.MockMomentum(
-            learning_rate=0.01, momentum=0.2, use_nesterov=True)
+            learning_rate=learning_rate, momentum=0.2, use_nesterov=True)
         params_grads = append_backward_ops(mul_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer.create_optimization_pass(params_grads,
-                                                           mul_out)
+        opts = momentum_optimizer.create_optimization_pass(
+            params_grads, mul_out, init_program)
         self.assertEqual(len(opts), 1)
         sgd_op = opts[0]
         self.assertEqual(sgd_op.type, "momentum")
@@ -131,6 +153,14 @@ class TestMomentumOptimizer(unittest.TestCase):
         self.assertEqual(len(velocity_acc), 1)
         self.assertTrue(mul_x.name in velocity_acc)
 
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 2)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+        self.assertEqual(init_ops[1].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)
+
 
 class TestAdagradOptimizer(unittest.TestCase):
     class MockAdagrad(optimizer.AdagradOptimizer):
@@ -141,6 +171,7 @@ class TestAdagradOptimizer(unittest.TestCase):
             return self._moment_acc_str
 
     def test_adagrad_optimizer(self):
+        init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
         mul_x = block.create_parameter(
@@ -155,11 +186,14 @@ class TestAdagradOptimizer(unittest.TestCase):
                    "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
-        adagrad_optimizer = self.MockAdagrad(learning_rate=0.01, epsilon=1.0e-6)
+        learning_rate = 0.01
+        adagrad_optimizer = self.MockAdagrad(
+            learning_rate=learning_rate, epsilon=1.0e-6)
         params_grads = append_backward_ops(mul_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
-        opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out)
+        opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
+                                                          init_program)
         self.assertEqual(len(opts), 1)
         adagrad_op = opts[0]
         self.assertEqual(adagrad_op.type, "adagrad")
@@ -172,6 +206,14 @@ class TestAdagradOptimizer(unittest.TestCase):
         self.assertEqual(len(moment_acc), 1)
         self.assertTrue(mul_x.name in moment_acc)
 
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 2)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+        self.assertEqual(init_ops[1].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)
+
 
 class TestAdamOptimizer(unittest.TestCase):
     class MockAdam(optimizer.AdamOptimizer):
@@ -185,6 +227,7 @@ class TestAdamOptimizer(unittest.TestCase):
             return self._moment2_acc_str
 
     def test_adam_optimizer(self):
+        init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
         mul_x = block.create_parameter(
@@ -199,12 +242,14 @@ class TestAdamOptimizer(unittest.TestCase):
                    "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        learning_rate = 0.01
         adam_optimizer = self.MockAdam(
-            learning_rate=0.01, beta1=0.9, beta2=0.999)
+            learning_rate=learning_rate, beta1=0.9, beta2=0.999)
         params_grads = append_backward_ops(mul_out)
         self.assertEqual(len(params_grads), 1)
         self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
-        opts = adam_optimizer.create_optimization_pass(params_grads, mul_out)
+        opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
+                                                       init_program)
         self.assertEqual(len(opts), 3)
         adam_op = opts[0]
         self.assertEqual(adam_op.type, "adam")
@@ -221,6 +266,12 @@ class TestAdamOptimizer(unittest.TestCase):
         self.assertTrue(mul_x.name in moment1_acc)
         self.assertTrue(mul_x.name in moment2_acc)
 
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 5)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+
 
 class TestAdamaxOptimizer(unittest.TestCase):
     class MockAdamax(optimizer.AdamaxOptimizer):
@@ -234,6 +285,7 @@ class TestAdamaxOptimizer(unittest.TestCase):
             return self._inf_norm_acc_str
 
     def test_adamax_optimizer(self):
+        init_program = framework.Program()
         program = framework.Program()
         block = program.global_block()
         mul_x = block.create_parameter(
@@ -248,12 +300,14 @@ class TestAdamaxOptimizer(unittest.TestCase):
                    "Y": mul_y},
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
+        learning_rate = 0.01
         adamax_optimizer = self.MockAdamax(
-            learning_rate=0.01, beta1=0.9, beta2=0.999)
+            learning_rate=learning_rate, beta1=0.9, beta2=0.999)
        params_grads = append_backward_ops(mul_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
-        opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out)
+        opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
+                                                         init_program)
        self.assertEqual(len(opts), 2)
        adam_op = opts[0]
        self.assertEqual(adam_op.type, "adamax")
@@ -270,6 +324,12 @@ class TestAdamaxOptimizer(unittest.TestCase):
         self.assertTrue(mul_x.name in moment_acc)
         self.assertTrue(mul_x.name in inf_norm_acc)
 
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 4)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+
 
 if __name__ == '__main__':
     unittest.main()
@@ -54,8 +54,10 @@ avg_cost = layers.mean(x=cost, program=program)
 accuracy = layers.accuracy(
     input=predict, label=label, program=program, init_program=init_program)
 
-sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost)
+# optimizer = optimizer.MomentumOptimizer(learning_rate=0.1 / 128.0,
+#                                         momentum=0.9)
+optimizer = optimizer.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999)
+opts = optimizer.minimize(avg_cost, init_program)
 
 BATCH_SIZE = 50
 PASS_NUM = 3
...
@@ -58,8 +58,8 @@ cost = layers.cross_entropy(
     input=predict, label=label, program=program, init_program=init_program)
 avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
 
-sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost)
+optimizer = optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
+opts = optimizer.minimize(avg_cost, init_program)
 
 train_reader = paddle.batch(
     paddle.reader.shuffle(
@@ -89,6 +89,7 @@ for pass_id in range(PASS_NUM):
                        'y': tensor_y},
                 fetch_list=[avg_cost])
         out = np.array(outs[0])
+
         if out[0] < 5.0:
             exit(0)  # if avg cost less than 5.0, we think our code is good.
 exit(1)
@@ -109,7 +109,7 @@ cost = layers.cross_entropy(
 avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
 
 sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost)
+opts = sgd_optimizer.minimize(avg_cost, init_program)
 
 train_reader = paddle.batch(
     paddle.dataset.imikolov.train(word_dict, N), batch_size)
...