未验证 提交 9c252183 编写于 作者: Q QI JUN 提交者: GitHub

create learning rate variable for every parameter (#5524)

* create learning rate variable for every parameter

* fix ci

* set parameter lr relatively to global lr
上级 1c31bb94
...@@ -35,15 +35,21 @@ class Optimizer(object): ...@@ -35,15 +35,21 @@ class Optimizer(object):
""" """
raise NotImplementedError() raise NotImplementedError()
def _initialize_tensors(self, block): def _create_param_lr(self, param_and_grad):
"""Create all necessary tensors, that will be shared for all parameter updates. # create learning rate variable for every parameter
param = param_and_grad[0]
Tensors like learning rate should be initialized here. param_lr = param.optimize_attr['learning_rate']
param_lr_shape = [1]
Args: param_lr_var = self.helper.create_global_variable(
block: the block in which the loss variable is present name=unique_name("learning_rate"),
""" dtype='float32',
pass shape=param_lr_shape,
lod_level=1,
persistable=True)
param_lr = param_lr * self._learning_rate
self.helper.set_variable_initializer(
var=param_lr_var, initializer=ConstantInitializer(param_lr))
return param_lr_var
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
"""Create all accumulators needed by the parameters """Create all accumulators needed by the parameters
...@@ -161,8 +167,6 @@ class Optimizer(object): ...@@ -161,8 +167,6 @@ class Optimizer(object):
startup_program=startup_program) startup_program=startup_program)
self._create_accumulators(loss.block, self._create_accumulators(loss.block,
[p[0] for p in parameters_and_grads]) [p[0] for p in parameters_and_grads])
# Create any necessary tensors
self._initialize_tensors(loss.block)
optimize_ops = [] optimize_ops = []
for param_and_grad in parameters_and_grads: for param_and_grad in parameters_and_grads:
...@@ -214,27 +218,16 @@ class SGDOptimizer(Optimizer): ...@@ -214,27 +218,16 @@ class SGDOptimizer(Optimizer):
self.type = "sgd" self.type = "sgd"
self._learning_rate = learning_rate self._learning_rate = learning_rate
def _initialize_tensors(self, block):
lr_shape = [1]
# create a variable for learning_rate
self._lr = self.helper.create_global_variable(
name=unique_name("learning_rate"),
dtype='float32',
shape=lr_shape,
lod_level=1,
persistable=True)
self.helper.set_variable_initializer(
var=self._lr, initializer=ConstantInitializer(self._learning_rate))
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
# create the optimize op # create the optimize op
sgd_op = block.append_op( sgd_op = block.append_op(
type=self.type, type=self.type,
inputs={ inputs={
"Param": param_and_grad[0], "Param": param_and_grad[0],
"Grad": param_and_grad[1], "Grad": param_and_grad[1],
"LearningRate": self._lr "LearningRate": self._create_param_lr(param_and_grad)
}, },
outputs={"ParamOut": param_and_grad[0]}) outputs={"ParamOut": param_and_grad[0]})
...@@ -259,19 +252,6 @@ class MomentumOptimizer(Optimizer): ...@@ -259,19 +252,6 @@ class MomentumOptimizer(Optimizer):
self._momentum = momentum self._momentum = momentum
self._use_nesterov = bool(use_nesterov) self._use_nesterov = bool(use_nesterov)
def _initialize_tensors(self, block):
assert isinstance(block, framework.Block)
lr_shape = [1]
# create a variable for learning_rate
self._lr = self.helper.create_global_variable(
name=unique_name("learning_rate"),
dtype='float32',
shape=lr_shape,
lod_level=1,
persistable=True)
self.helper.set_variable_initializer(
var=self._lr, initializer=ConstantInitializer(self._learning_rate))
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
...@@ -290,7 +270,7 @@ class MomentumOptimizer(Optimizer): ...@@ -290,7 +270,7 @@ class MomentumOptimizer(Optimizer):
"Param": param_and_grad[0], "Param": param_and_grad[0],
"Grad": param_and_grad[1], "Grad": param_and_grad[1],
"Velocity": velocity_acc, "Velocity": velocity_acc,
"LearningRate": self._lr "LearningRate": self._create_param_lr(param_and_grad)
}, },
outputs={ outputs={
"ParamOut": param_and_grad[0], "ParamOut": param_and_grad[0],
...@@ -315,18 +295,6 @@ class AdagradOptimizer(Optimizer): ...@@ -315,18 +295,6 @@ class AdagradOptimizer(Optimizer):
self._learning_rate = learning_rate self._learning_rate = learning_rate
self._epsilon = epsilon self._epsilon = epsilon
def _initialize_tensors(self, block):
lr_shape = [1]
# create a variable for learning_rate
self._lr = self.helper.create_global_variable(
name=unique_name("learning_rate"),
dtype='float32',
shape=lr_shape,
lod_level=1,
persistable=True)
self.helper.set_variable_initializer(
var=self._lr, initializer=ConstantInitializer(self._learning_rate))
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
...@@ -346,7 +314,7 @@ class AdagradOptimizer(Optimizer): ...@@ -346,7 +314,7 @@ class AdagradOptimizer(Optimizer):
"Param": param_and_grad[0], "Param": param_and_grad[0],
"Grad": param_and_grad[1], "Grad": param_and_grad[1],
"Moment": moment_acc, "Moment": moment_acc,
"LearningRate": self._lr "LearningRate": self._create_param_lr(param_and_grad)
}, },
outputs={"ParamOut": param_and_grad[0], outputs={"ParamOut": param_and_grad[0],
"MomentOut": moment_acc}, "MomentOut": moment_acc},
...@@ -378,18 +346,6 @@ class AdamOptimizer(Optimizer): ...@@ -378,18 +346,6 @@ class AdamOptimizer(Optimizer):
self._beta2 = beta2 self._beta2 = beta2
self._epsilon = epsilon self._epsilon = epsilon
def _initialize_tensors(self, block):
lr_shape = [1]
# create a variable for learning_rate
self._lr = self.helper.create_global_variable(
name=unique_name("learning_rate"),
dtype='float32',
shape=lr_shape,
lod_level=1,
persistable=True)
self.helper.set_variable_initializer(
var=self._lr, initializer=ConstantInitializer(self._learning_rate))
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
...@@ -433,7 +389,7 @@ class AdamOptimizer(Optimizer): ...@@ -433,7 +389,7 @@ class AdamOptimizer(Optimizer):
inputs={ inputs={
"Param": param_and_grad[0], "Param": param_and_grad[0],
"Grad": param_and_grad[1], "Grad": param_and_grad[1],
"LearningRate": self._lr, "LearningRate": self._create_param_lr(param_and_grad),
"Moment1": moment1, "Moment1": moment1,
"Moment2": moment2, "Moment2": moment2,
"Beta1Pow": self._beta1_pow_acc, "Beta1Pow": self._beta1_pow_acc,
...@@ -495,18 +451,6 @@ class AdamaxOptimizer(Optimizer): ...@@ -495,18 +451,6 @@ class AdamaxOptimizer(Optimizer):
self._beta2 = beta2 self._beta2 = beta2
self._epsilon = epsilon self._epsilon = epsilon
def _initialize_tensors(self, block):
lr_shape = [1]
# create a variable for learning_rate
self._lr = self.helper.create_global_variable(
name=unique_name("learning_rate"),
dtype='float32',
shape=lr_shape,
lod_level=1,
persistable=True)
self.helper.set_variable_initializer(
var=self._lr, initializer=ConstantInitializer(self._learning_rate))
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
# Create beta1 power accumulator tensor # Create beta1 power accumulator tensor
beta_shape = [1] beta_shape = [1]
...@@ -536,7 +480,7 @@ class AdamaxOptimizer(Optimizer): ...@@ -536,7 +480,7 @@ class AdamaxOptimizer(Optimizer):
inputs={ inputs={
"Param": param_and_grad[0], "Param": param_and_grad[0],
"Grad": param_and_grad[1], "Grad": param_and_grad[1],
"LearningRate": self._lr, "LearningRate": self._create_param_lr(param_and_grad),
"Moment": moment, "Moment": moment,
"InfNorm": inf_norm, "InfNorm": inf_norm,
"Beta1Pow": self._beta1_pow_acc "Beta1Pow": self._beta1_pow_acc
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册