diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 75ee40fa9ca94cdd84ee7acbb62d6e652ac7fa33..8b6a9b00eee1ab7904b433cc804394733d5f798a 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -123,7 +123,7 @@ class Optimizer(object): """ pass - def _finish_update(self, block): + def _finish_update(self, block, parameters): """Finish any custom updates needed before completing an optimization step @@ -132,7 +132,7 @@ class Optimizer(object): parameters: list of parameter variables for the optimizer Returns: - list of finish ops or None + None """ pass @@ -236,7 +236,8 @@ class Optimizer(object): # Get custom finish ops for subclasses # FIXME: Need to fix this once we figure out how to handle dependencies - self._finish_update(loss.block) + self._finish_update(loss.block, + [p[0] for p in parameters_and_grads]) end = len(global_block.ops) return global_block.slice_ops(start, end) @@ -486,6 +487,8 @@ class AdamOptimizer(Optimizer): """ _moment1_acc_str = "moment1" _moment2_acc_str = "moment2" + _beta1_pow_acc_str = "beta1_pow_acc" + _beta2_pow_acc_str = "beta2_pow_acc" def __init__(self, learning_rate=0.001, @@ -507,32 +510,22 @@ class AdamOptimizer(Optimizer): def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) - main_block = block.program.global_block() - # Create beta1 and beta2 power tensors - beta_shape = [1] - self._beta1_pow_acc = self.helper.create_global_variable( - name=unique_name.generate('beta1_pow_acc'), - dtype='float32' if self._dtype == None else self._dtype, - shape=beta_shape, - lod_level=0, - persistable=True) - self.helper.set_variable_initializer( - self._beta1_pow_acc, initializer=Constant(self._beta1)) - - self._beta2_pow_acc = self.helper.create_global_variable( - name=unique_name.generate('beta2_pow_acc'), - dtype='float32' if self._dtype == None else self._dtype, - shape=beta_shape, - lod_level=0, - persistable=True) - - self.helper.set_variable_initializer( - self._beta2_pow_acc, initializer=Constant(self._beta2)) - # Create accumulator tensors for first and second moments for p in parameters: self._add_accumulator(self._moment1_acc_str, p) self._add_accumulator(self._moment2_acc_str, p) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + dtype='float32', + fill_value=self._beta1, + shape=[1]) + self._add_accumulator( + name=self._beta2_pow_acc_str, + param=p, + dtype='float32', + fill_value=self._beta2, + shape=[1]) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -541,6 +534,11 @@ class AdamOptimizer(Optimizer): param_and_grad[0]) moment2 = self._get_accumulator(self._moment2_acc_str, param_and_grad[0]) + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param_and_grad[0]) + # create the adam optimize op adam_op = block.append_op( type=self.type, @@ -550,8 +548,8 @@ class AdamOptimizer(Optimizer): "LearningRate": self._create_param_lr(param_and_grad), "Moment1": moment1, "Moment2": moment2, - "Beta1Pow": self._beta1_pow_acc, - "Beta2Pow": self._beta2_pow_acc + "Beta1Pow": beta1_pow_acc, + "Beta2Pow": beta2_pow_acc }, outputs={ "ParamOut": param_and_grad[0], @@ -566,24 +564,27 @@ class AdamOptimizer(Optimizer): return adam_op - def _finish_update(self, block): + def _finish_update(self, block, parameters): """Update Beta1 and Beta2 Power accumulators """ assert isinstance(block, framework.Block) main_block = block.program.global_block() - scale_beta1 = main_block.append_op( - type="scale", - inputs={"X": self._beta1_pow_acc}, - outputs={"Out": self._beta1_pow_acc}, - attrs={"scale": self._beta1}) - - scale_beta2 = main_block.append_op( - type="scale", - inputs={"X": self._beta2_pow_acc}, - outputs={"Out": self._beta2_pow_acc}, - attrs={"scale": self._beta2}) - - return [scale_beta1, scale_beta2] + for param in parameters: + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param) + main_block.append_op( + type="scale", + inputs={"X": beta1_pow_acc}, + outputs={"Out": beta1_pow_acc}, + attrs={"scale": self._beta1}) + + main_block.append_op( + type="scale", + inputs={"X": beta2_pow_acc}, + outputs={"Out": beta2_pow_acc}, + attrs={"scale": self._beta2}) class AdamaxOptimizer(Optimizer): @@ -626,6 +627,7 @@ class AdamaxOptimizer(Optimizer): """ _moment_acc_str = "moment" _inf_norm_acc_str = "inf_norm" + _beta1_pow_acc_str = "beta1_pow_acc" def __init__(self, learning_rate=0.001, @@ -645,21 +647,16 @@ class AdamaxOptimizer(Optimizer): self._epsilon = epsilon def _create_accumulators(self, block, parameters): - # Create beta1 power accumulator tensor - beta_shape = [1] - self._beta1_pow_acc = self.helper.create_global_variable( - name=unique_name.generate('beta1_pow_acc'), - dtype='float32' if self._dtype == None else self._dtype, - shape=beta_shape, - lod_level=0, - persistable=True) - self.helper.set_variable_initializer( - self._beta1_pow_acc, initializer=Constant(self._beta1)) - # Create accumulator tensors for first moment and infinity norm for p in parameters: self._add_accumulator(self._moment_acc_str, p) self._add_accumulator(self._inf_norm_acc_str, p) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + dtype='float32', + fill_value=self._beta1, + shape=[1]) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -667,6 +664,8 @@ class AdamaxOptimizer(Optimizer): moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0]) inf_norm = self._get_accumulator(self._inf_norm_acc_str, param_and_grad[0]) + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) # create the adamax optimize op adamax_op = block.append_op( type=self.type, @@ -676,7 +675,7 @@ class AdamaxOptimizer(Optimizer): "LearningRate": self._create_param_lr(param_and_grad), "Moment": moment, "InfNorm": inf_norm, - "Beta1Pow": self._beta1_pow_acc + "Beta1Pow": beta1_pow_acc }, outputs={ "ParamOut": param_and_grad[0], @@ -691,18 +690,19 @@ class AdamaxOptimizer(Optimizer): return adamax_op - def _finish_update(self, block): + def _finish_update(self, block, parameters): """Update Beta1 Power accumulator """ assert isinstance(block, framework.Block) main_block = block.program.global_block() - scale_beta1 = main_block.append_op( - type="scale", - inputs={"X": self._beta1_pow_acc}, - outputs={"Out": self._beta1_pow_acc}, - attrs={"scale": self._beta1}) - - return [scale_beta1] + for param in parameters: + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param) + main_block.append_op( + type="scale", + inputs={"X": beta1_pow_acc}, + outputs={"Out": beta1_pow_acc}, + attrs={"scale": self._beta1}) class DecayedAdagradOptimizer(Optimizer):