Unverified commit e42f9b7a, authored by Qiao Longfei, committed by GitHub

Merge pull request #12103 from jacquesqiao/fix-optimizer-accumulator

Fix optimizer accumulator
@@ -123,7 +123,7 @@ class Optimizer(object):
         """
         pass
 
-    def _finish_update(self, block):
+    def _finish_update(self, block, parameters):
        """Finish any custom updates needed
           before completing an optimization step
 
@@ -132,7 +132,7 @@ class Optimizer(object):
            parameters: list of parameter variables for the optimizer
 
        Returns:
-           list of finish ops or None
+           None
        """
        pass
@@ -236,7 +236,8 @@ class Optimizer(object):
         # Get custom finish ops for subclasses
         # FIXME: Need to fix this once we figure out how to handle dependencies
-        self._finish_update(loss.block)
+        self._finish_update(loss.block,
+                            [p[0] for p in parameters_and_grads])
 
         end = len(global_block.ops)
         return global_block.slice_ops(start, end)
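For readers skimming the diff: the two hunks above change the hook contract so that `_finish_update` receives the list of parameter variables instead of only the block. A minimal sketch of that call shape, using hypothetical stand-in classes and names (only the argument passing mirrors the diff):

# Hypothetical stand-in classes; only the call shape mirrors the diff above.
class SketchOptimizer(object):
    def _finish_update(self, block, parameters):
        # Base class: nothing to finish; subclasses append per-parameter ops here.
        pass

    def _create_optimization_pass(self, block, parameters_and_grads):
        # ... per-(param, grad) optimize ops would be appended here ...
        # mirrors: self._finish_update(loss.block, [p[0] for p in parameters_and_grads])
        self._finish_update(block, [p[0] for p in parameters_and_grads])

SketchOptimizer()._create_optimization_pass(
    block=None, parameters_and_grads=[("fc_0.w_0", "fc_0.w_0@GRAD")])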
@@ -486,6 +487,8 @@ class AdamOptimizer(Optimizer):
     """
     _moment1_acc_str = "moment1"
     _moment2_acc_str = "moment2"
+    _beta1_pow_acc_str = "beta1_pow_acc"
+    _beta2_pow_acc_str = "beta2_pow_acc"
 
     def __init__(self,
                  learning_rate=0.001,
@@ -507,32 +510,22 @@ class AdamOptimizer(Optimizer):
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
-        main_block = block.program.global_block()
-        # Create beta1 and beta2 power tensors
-        beta_shape = [1]
-        self._beta1_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta1_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            self._beta1_pow_acc, initializer=Constant(self._beta1))
-
-        self._beta2_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta2_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            self._beta2_pow_acc, initializer=Constant(self._beta2))
 
         # Create accumulator tensors for first and second moments
         for p in parameters:
             self._add_accumulator(self._moment1_acc_str, p)
             self._add_accumulator(self._moment2_acc_str, p)
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta1,
+                shape=[1])
+            self._add_accumulator(
+                name=self._beta2_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta2,
+                shape=[1])
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
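The key change in the hunk above: the two helper-created global beta-power variables shared by all parameters are replaced by per-parameter accumulators registered through the same `_add_accumulator` path already used for the moments. A rough pure-Python sketch of that bookkeeping (the dictionary and the parameter names are hypothetical stand-ins, not the framework's internals):

# Rough pure-Python sketch of per-parameter accumulator bookkeeping
# (hypothetical stand-ins, not the framework's internals).
_accumulators = {}

def _add_accumulator(name, param, fill_value=0.0):
    _accumulators.setdefault(name, {})[param] = fill_value

def _get_accumulator(name, param):
    return _accumulators[name][param]

beta1, beta2 = 0.9, 0.999
for p in ["fc_0.w_0", "fc_0.b_0"]:               # hypothetical parameter names
    _add_accumulator("moment1", p)
    _add_accumulator("moment2", p)
    _add_accumulator("beta1_pow_acc", p, fill_value=beta1)  # one power per parameter
    _add_accumulator("beta2_pow_acc", p, fill_value=beta2)

assert _get_accumulator("beta1_pow_acc", "fc_0.b_0") == beta1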
@@ -541,6 +534,11 @@ class AdamOptimizer(Optimizer):
                                          param_and_grad[0])
         moment2 = self._get_accumulator(self._moment2_acc_str,
                                         param_and_grad[0])
+        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                              param_and_grad[0])
+        beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                              param_and_grad[0])
+
         # create the adam optimize op
         adam_op = block.append_op(
             type=self.type,
@@ -550,8 +548,8 @@ class AdamOptimizer(Optimizer):
                 "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment1": moment1,
                 "Moment2": moment2,
-                "Beta1Pow": self._beta1_pow_acc,
-                "Beta2Pow": self._beta2_pow_acc
+                "Beta1Pow": beta1_pow_acc,
+                "Beta2Pow": beta2_pow_acc
             },
             outputs={
                 "ParamOut": param_and_grad[0],
@@ -566,24 +564,28 @@ class AdamOptimizer(Optimizer):
 
         return adam_op
 
-    def _finish_update(self, block):
+    def _finish_update(self, block, parameters):
         """Update Beta1 and Beta2 Power accumulators
         """
         assert isinstance(block, framework.Block)
         main_block = block.program.global_block()
-        scale_beta1 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta1_pow_acc},
-            outputs={"Out": self._beta1_pow_acc},
-            attrs={"scale": self._beta1})
-
-        scale_beta2 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta2_pow_acc},
-            outputs={"Out": self._beta2_pow_acc},
-            attrs={"scale": self._beta2})
-
-        return [scale_beta1, scale_beta2]
+        for param in parameters:
+            with param.block.program.optimized_guard(param):
+                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                      param)
+                beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                                      param)
+                main_block.append_op(
+                    type="scale",
+                    inputs={"X": beta1_pow_acc},
+                    outputs={"Out": beta1_pow_acc},
+                    attrs={"scale": self._beta1})
+
+                main_block.append_op(
+                    type="scale",
+                    inputs={"X": beta2_pow_acc},
+                    outputs={"Out": beta2_pow_acc},
+                    attrs={"scale": self._beta2})
 
 
 class AdamaxOptimizer(Optimizer):
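The rewritten `_finish_update` above appends one `scale` op per beta-power accumulator for each parameter; because the input and output are the same variable, every optimization step effectively does `beta_pow *= beta` in place, so after step t the accumulator holds beta**t, the quantity Adam's bias correction needs. A small pure-Python illustration of that arithmetic (example beta values, not taken from the diff):

# Pure-Python illustration of what the per-step "scale" op amounts to
# (example beta values; not taken from the diff).
beta1, beta2 = 0.9, 0.999
beta1_pow, beta2_pow = beta1, beta2      # accumulators start at beta, i.e. beta**1
for t in range(2, 6):
    beta1_pow *= beta1                   # scale op: Out = scale * X, written in place
    beta2_pow *= beta2
    # After step t the accumulators hold beta**t, which the adam op needs for
    # bias correction, e.g. m_hat = m / (1 - beta1**t).
    assert abs(beta1_pow - beta1 ** t) < 1e-12
    assert abs(beta2_pow - beta2 ** t) < 1e-12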
@@ -626,6 +628,7 @@ class AdamaxOptimizer(Optimizer):
     """
     _moment_acc_str = "moment"
     _inf_norm_acc_str = "inf_norm"
+    _beta1_pow_acc_str = "beta1_pow_acc"
 
     def __init__(self,
                  learning_rate=0.001,
@@ -645,21 +648,16 @@ class AdamaxOptimizer(Optimizer):
         self._epsilon = epsilon
 
     def _create_accumulators(self, block, parameters):
-        # Create beta1 power accumulator tensor
-        beta_shape = [1]
-        self._beta1_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta1_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            self._beta1_pow_acc, initializer=Constant(self._beta1))
-
         # Create accumulator tensors for first moment and infinity norm
         for p in parameters:
             self._add_accumulator(self._moment_acc_str, p)
             self._add_accumulator(self._inf_norm_acc_str, p)
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta1,
+                shape=[1])
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -667,6 +665,8 @@ class AdamaxOptimizer(Optimizer):
         moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
         inf_norm = self._get_accumulator(self._inf_norm_acc_str,
                                          param_and_grad[0])
+        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                              param_and_grad[0])
         # create the adamax optimize op
         adamax_op = block.append_op(
             type=self.type,
@@ -676,7 +676,7 @@ class AdamaxOptimizer(Optimizer):
                 "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment": moment,
                 "InfNorm": inf_norm,
-                "Beta1Pow": self._beta1_pow_acc
+                "Beta1Pow": beta1_pow_acc
             },
             outputs={
                 "ParamOut": param_and_grad[0],
@@ -691,18 +691,20 @@ class AdamaxOptimizer(Optimizer):
 
         return adamax_op
 
-    def _finish_update(self, block):
+    def _finish_update(self, block, parameters):
         """Update Beta1 Power accumulator
         """
         assert isinstance(block, framework.Block)
         main_block = block.program.global_block()
-        scale_beta1 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta1_pow_acc},
-            outputs={"Out": self._beta1_pow_acc},
-            attrs={"scale": self._beta1})
-
-        return [scale_beta1]
+        for param in parameters:
+            with param.block.program.optimized_guard(param):
+                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                      param)
+                main_block.append_op(
+                    type="scale",
+                    inputs={"X": beta1_pow_acc},
+                    outputs={"Out": beta1_pow_acc},
+                    attrs={"scale": self._beta1})
 
 
 class DecayedAdagradOptimizer(Optimizer):
@@ -1156,7 +1158,8 @@ class ModelAverage(Optimizer):
             self.params_grads.append((param, grad))
 
         for param, grad in self.params_grads:
-            self._append_average_accumulate_op(param)
+            with param.block.program.optimized_guard(param):
+                self._append_average_accumulate_op(param)
 
         self.apply_program = Program()
         block = self.apply_program.global_block()
...
@@ -287,7 +287,7 @@ class TestAdamOptimizer(unittest.TestCase):
 
         # Check accumulators
         accumulators = adam_optimizer.get_accumulators()
-        self.assertEqual(len(accumulators), 2)
+        self.assertEqual(len(accumulators), 4)
         self.assertTrue(adam_optimizer.get_moment1_str() in accumulators)
         self.assertTrue(adam_optimizer.get_moment2_str() in accumulators)
         moment1_acc = accumulators[adam_optimizer.get_moment1_str()]
@@ -354,7 +354,7 @@ class TestAdamaxOptimizer(unittest.TestCase):
 
         # Check accumulators
         accumulators = adamax_optimizer.get_accumulators()
-        self.assertEqual(len(accumulators), 2)
+        self.assertEqual(len(accumulators), 3)
        self.assertTrue(adamax_optimizer.get_moment_str() in accumulators)
        self.assertTrue(adamax_optimizer.get_inf_norm_str() in accumulators)
        moment_acc = accumulators[adamax_optimizer.get_moment_str()]
...
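The updated expectations in these tests follow directly from the accumulator changes above: AdamOptimizer now registers four accumulator kinds and AdamaxOptimizer three. A trivial bookkeeping check (string names copied from the diff; this is not the real test code):

# String names copied from the diff; bookkeeping only, not the real test.
adam_accs = {"moment1", "moment2", "beta1_pow_acc", "beta2_pow_acc"}
adamax_accs = {"moment", "inf_norm", "beta1_pow_acc"}
assert len(adam_accs) == 4    # matches assertEqual(len(accumulators), 4)
assert len(adamax_accs) == 3  # matches assertEqual(len(accumulators), 3)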