Commit 7ce0d45e, authored by qiaolongfei

fix adam and adamax optimizer

Parent commit: e91ecd5d
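Summary of the change: the single, optimizer-level beta1_pow_acc / beta2_pow_acc global variables are replaced by per-parameter accumulators registered through _add_accumulator and fetched with _get_accumulator, and _finish_update(block) becomes _finish_update(block, parameters) so each parameter's power accumulator can be advanced after every optimization step. For orientation, here is a minimal NumPy sketch of the textbook Adam rule these accumulators serve; it is illustrative only (the real adam op runs in C++ and may fold the bias correction into the learning rate), but the role of the Beta1Pow / Beta2Pow inputs is the same:

    import numpy as np

    def adam_step(param, grad, m, v, beta1_pow, beta2_pow,
                  lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        # First and second moment estimates ("Moment1" / "Moment2" inputs).
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * grad * grad
        # Bias correction is where "Beta1Pow" / "Beta2Pow" (beta1**t, beta2**t) enter.
        m_hat = m / (1 - beta1_pow)
        v_hat = v / (1 - beta2_pow)
        param = param - lr * m_hat / (np.sqrt(v_hat) + eps)
        # The "scale" ops appended by _finish_update advance the powers per parameter.
        beta1_pow *= beta1
        beta2_pow *= beta2
        return param, m, v, beta1_pow, beta2_pow

    # Toy call with made-up values, just to show the shapes involved.
    p, m, v, b1p, b2p = adam_step(np.zeros(3), np.ones(3),
                                  np.zeros(3), np.zeros(3), 0.9, 0.999)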
@@ -123,7 +123,7 @@ class Optimizer(object):
         """
         pass

-    def _finish_update(self, block):
+    def _finish_update(self, block, parameters):
         """Finish any custom updates needed
            before completing an optimization step
@@ -132,7 +132,7 @@ class Optimizer(object):
             parameters: list of parameter variables for the optimizer

         Returns:
-            list of finish ops or None
+            None
         """
         pass
@@ -236,7 +236,8 @@ class Optimizer(object):
         # Get custom finish ops for subclasses
         # FIXME: Need to fix this once we figure out how to handle dependencies
-        self._finish_update(loss.block)
+        self._finish_update(loss.block,
+                            [p[0] for p in parameters_and_grads])

         end = len(global_block.ops)
         return global_block.slice_ops(start, end)
@@ -486,6 +487,8 @@ class AdamOptimizer(Optimizer):
     """
     _moment1_acc_str = "moment1"
     _moment2_acc_str = "moment2"
+    _beta1_pow_acc_str = "beta1_pow_acc"
+    _beta2_pow_acc_str = "beta2_pow_acc"

     def __init__(self,
                  learning_rate=0.001,
@@ -507,32 +510,22 @@ class AdamOptimizer(Optimizer):
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
-        main_block = block.program.global_block()
-        # Create beta1 and beta2 power tensors
-        beta_shape = [1]
-        self._beta1_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta1_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            self._beta1_pow_acc, initializer=Constant(self._beta1))
-
-        self._beta2_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta2_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            self._beta2_pow_acc, initializer=Constant(self._beta2))

         # Create accumulator tensors for first and second moments
         for p in parameters:
             self._add_accumulator(self._moment1_acc_str, p)
             self._add_accumulator(self._moment2_acc_str, p)
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta1,
+                shape=[1])
+            self._add_accumulator(
+                name=self._beta2_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta2,
+                shape=[1])

     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -541,6 +534,11 @@ class AdamOptimizer(Optimizer):
                                          param_and_grad[0])
         moment2 = self._get_accumulator(self._moment2_acc_str,
                                         param_and_grad[0])
+        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                              param_and_grad[0])
+        beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                              param_and_grad[0])
+
         # create the adam optimize op
         adam_op = block.append_op(
             type=self.type,
@@ -550,8 +548,8 @@ class AdamOptimizer(Optimizer):
                 "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment1": moment1,
                 "Moment2": moment2,
-                "Beta1Pow": self._beta1_pow_acc,
-                "Beta2Pow": self._beta2_pow_acc
+                "Beta1Pow": beta1_pow_acc,
+                "Beta2Pow": beta2_pow_acc
             },
             outputs={
                 "ParamOut": param_and_grad[0],
@@ -566,24 +564,27 @@ class AdamOptimizer(Optimizer):
         return adam_op

-    def _finish_update(self, block):
+    def _finish_update(self, block, parameters):
         """Update Beta1 and Beta2 Power accumulators
         """
         assert isinstance(block, framework.Block)
         main_block = block.program.global_block()
-        scale_beta1 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta1_pow_acc},
-            outputs={"Out": self._beta1_pow_acc},
-            attrs={"scale": self._beta1})
-
-        scale_beta2 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta2_pow_acc},
-            outputs={"Out": self._beta2_pow_acc},
-            attrs={"scale": self._beta2})
-
-        return [scale_beta1, scale_beta2]
+        for param in parameters:
+            beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                  param)
+            beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                                  param)
+            main_block.append_op(
+                type="scale",
+                inputs={"X": beta1_pow_acc},
+                outputs={"Out": beta1_pow_acc},
+                attrs={"scale": self._beta1})
+
+            main_block.append_op(
+                type="scale",
+                inputs={"X": beta2_pow_acc},
+                outputs={"Out": beta2_pow_acc},
+                attrs={"scale": self._beta2})


 class AdamaxOptimizer(Optimizer):
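Note on the hunk above: assuming the usual Out = scale * X semantics of the scale op, each appended op rescales its accumulator in place, so after t optimization steps every parameter's beta1_pow_acc / beta2_pow_acc holds beta1**t / beta2**t, consistent with the fill_value of beta1 / beta2 used at creation for t = 1. A tiny plain-Python check of that bookkeeping (not Paddle code):

    beta1, beta2 = 0.9, 0.999
    # At creation time (t = 1) the accumulators are filled with beta1 and beta2.
    beta1_pow, beta2_pow = beta1, beta2
    for t in range(2, 6):
        # Effect of the two appended "scale" ops, run once per optimization step.
        beta1_pow *= beta1
        beta2_pow *= beta2
        assert abs(beta1_pow - beta1 ** t) < 1e-12
        assert abs(beta2_pow - beta2 ** t) < 1e-12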
@@ -626,6 +627,7 @@ class AdamaxOptimizer(Optimizer):
     """
     _moment_acc_str = "moment"
     _inf_norm_acc_str = "inf_norm"
+    _beta1_pow_acc_str = "beta1_pow_acc"

     def __init__(self,
                  learning_rate=0.001,
@@ -645,21 +647,16 @@ class AdamaxOptimizer(Optimizer):
         self._epsilon = epsilon

     def _create_accumulators(self, block, parameters):
-        # Create beta1 power accumulator tensor
-        beta_shape = [1]
-        self._beta1_pow_acc = self.helper.create_global_variable(
-            name=unique_name.generate('beta1_pow_acc'),
-            dtype='float32' if self._dtype == None else self._dtype,
-            shape=beta_shape,
-            lod_level=0,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            self._beta1_pow_acc, initializer=Constant(self._beta1))
-
         # Create accumulator tensors for first moment and infinity norm
         for p in parameters:
             self._add_accumulator(self._moment_acc_str, p)
             self._add_accumulator(self._inf_norm_acc_str, p)
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                dtype='float32',
+                fill_value=self._beta1,
+                shape=[1])

     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -667,6 +664,8 @@ class AdamaxOptimizer(Optimizer):
         moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
         inf_norm = self._get_accumulator(self._inf_norm_acc_str,
                                          param_and_grad[0])
+        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                              param_and_grad[0])
+
         # create the adamax optimize op
         adamax_op = block.append_op(
             type=self.type,
@@ -676,7 +675,7 @@ class AdamaxOptimizer(Optimizer):
                 "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment": moment,
                 "InfNorm": inf_norm,
-                "Beta1Pow": self._beta1_pow_acc
+                "Beta1Pow": beta1_pow_acc
             },
             outputs={
                 "ParamOut": param_and_grad[0],
@@ -691,18 +690,19 @@ class AdamaxOptimizer(Optimizer):
         return adamax_op

-    def _finish_update(self, block):
+    def _finish_update(self, block, parameters):
         """Update Beta1 Power accumulator
         """
         assert isinstance(block, framework.Block)
         main_block = block.program.global_block()
-        scale_beta1 = main_block.append_op(
-            type="scale",
-            inputs={"X": self._beta1_pow_acc},
-            outputs={"Out": self._beta1_pow_acc},
-            attrs={"scale": self._beta1})
-
-        return [scale_beta1]
+        for param in parameters:
+            beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                  param)
+            main_block.append_op(
+                type="scale",
+                inputs={"X": beta1_pow_acc},
+                outputs={"Out": beta1_pow_acc},
+                attrs={"scale": self._beta1})


 class DecayedAdagradOptimizer(Optimizer):
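Adamax registers only beta1_pow_acc because its infinity-norm accumulator is a running maximum rather than an exponential average and therefore needs no bias correction; only the first-moment term is corrected via beta1**t. A rough NumPy sketch of the update rule the Beta1Pow input feeds (illustrative, not the Paddle adamax kernel):

    import numpy as np

    def adamax_step(param, grad, m, u, beta1_pow,
                    lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        m = beta1 * m + (1 - beta1) * grad          # "Moment" accumulator
        u = np.maximum(beta2 * u, np.abs(grad))     # "InfNorm": running max, no bias correction
        param = param - (lr / (1 - beta1_pow)) * m / (u + eps)
        beta1_pow *= beta1                          # maintained by the appended "scale" op
        return param, m, u, beta1_pow

    # Toy call with made-up values.
    p, m, u, b1p = adamax_step(np.zeros(3), np.ones(3), np.zeros(3), np.zeros(3), 0.9)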