Commit 3d8077e9 authored by Qiao Longfei

update optimizer

Parent fbcdb29d
@@ -40,30 +40,6 @@ __all__ = [
 ]
-def _process_distribute_lookuptable(program, param_grads, learning_rate):
-    table_name = find_distributed_lookup_table(program)
-    table_param = None
-    table_grad = None
-    new_param_grads = []
-    for p, g in param_grads:
-        if p.name == table_name:
-            if table_param is not None:
-                raise RuntimeError(
-                    "multi dist table var found, only support one now!")
-            table_param = p
-            table_grad = g
-        else:
-            new_param_grads.append((p, g))
-    sgd_op = None
-    if table_param is not None:
-        with table_param.block.program._optimized_guard(
-                [table_param, table_grad]), framework.name_scope("optimizer"):
-            sgd_optimizer = SGD(learning_rate)
-            sgd_op = sgd_optimizer._append_optimize_op(table_param.block, (
-                table_param, table_grad))
-    return new_param_grads, (table_param, table_grad), sgd_op
 class Optimizer(object):
     """Optimizer Base class.
@@ -111,7 +87,7 @@ class Optimizer(object):
             name=unique_name.generate("learning_rate"),
             shape=[1],
             value=float(self._learning_rate),
-            dtype='float32' if self._dtype == None else self._dtype,
+            dtype='float32' if self._dtype is None else self._dtype,
             persistable=True)

     def _global_learning_rate(self, program=None):
@@ -251,7 +227,6 @@ class Optimizer(object):
         self.helper = LayerHelper(self.__class__.__name__)
         self._create_accumulators(loss.block,
                                   [p[0] for p in parameters_and_grads])
-        self._create_global_learning_rate()

         optimize_ops = []
         for param_and_grad in parameters_and_grads:
@@ -271,6 +246,40 @@ class Optimizer(object):
         end = len(global_block.ops)
         return global_block._slice_ops(start, end)
+    def _process_distribute_lookuptable(self, param_grads, loss,
+                                        startup_program):
+        program = loss.block.program
+        table_name = find_distributed_lookup_table(program)
+        table_param = None
+        table_grad = None
+        new_param_grads = []
+        for p, g in param_grads:
+            if p.name == table_name:
+                if table_param is not None:
+                    raise RuntimeError(
+                        "multi dist table var found, only support one now!")
+                table_param = p
+                table_grad = g
+            else:
+                new_param_grads.append((p, g))
+        sgd_op = None
+        if table_param is not None:
+            with program_guard(program, startup_program):
+                param_and_grad = [table_param, table_grad]
+                with table_param.block.program._optimized_guard(param_and_grad), \
+                        framework.name_scope("optimizer"):
+                    # create the optimize op
+                    sgd_op = loss.block.append_op(
+                        type='sgd',
+                        inputs={
+                            "Param": table_param,
+                            "Grad": table_grad,
+                            "LearningRate":
+                            self._create_param_lr(param_and_grad)
+                        },
+                        outputs={"ParamOut": param_and_grad[0]})
+        return new_param_grads, (table_param, table_grad), sgd_op
     def minimize(self,
                  loss,
                  startup_program=None,
@@ -281,26 +290,29 @@ class Optimizer(object):
         This method combines interface `append_backward()` and
         `create_optimization_pass()` into one.
         """
-        params_grads = append_backward(loss, parameter_list, no_grad_set,
-                                       [error_clip_callback])
-        params_grads = sorted(params_grads, key=lambda x: x[0].name)
-        params_grads, table_param_and_grad, table_optimize_op = \
-            _process_distribute_lookuptable(loss.block.program, params_grads, self._learning_rate)
-        params_grads = append_gradient_clip_ops(params_grads)
-        # Add regularization if any
-        params_grads = append_regularization_ops(params_grads,
-                                                 self.regularization)
-        optimize_ops = self._create_optimization_pass(params_grads, loss,
-                                                      startup_program)
-        if table_optimize_op is not None:
-            optimize_ops.append(table_optimize_op)
-            params_grads.append(table_param_and_grad)
-        return optimize_ops, params_grads
+        with program_guard(loss.block.program, startup_program):
+            self._create_global_learning_rate()
+            params_grads = append_backward(loss, parameter_list, no_grad_set,
+                                           [error_clip_callback])
+            params_grads = sorted(params_grads, key=lambda x: x[0].name)
+            params_grads, table_param_and_grad, table_optimize_op = \
+                self._process_distribute_lookuptable(params_grads, loss, startup_program)
+            params_grads = append_gradient_clip_ops(params_grads)
+            # Add regularization if any
+            params_grads = append_regularization_ops(params_grads,
+                                                     self.regularization)
+            optimize_ops = self._create_optimization_pass(params_grads, loss,
+                                                          startup_program)
+            if table_optimize_op is not None:
+                optimize_ops.append(table_optimize_op)
+                params_grads.append(table_param_and_grad)
+            return optimize_ops, params_grads
 class SGDOptimizer(Optimizer):
......
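Note: user code drives the refactored optimizer exactly as before; `minimize()` still combines `append_backward()` and `_create_optimization_pass()`, only now the learning-rate variable and the optional distributed-lookup-table SGD op are created under `program_guard(loss.block.program, startup_program)`. A minimal usage sketch under that assumption; the toy network below is illustrative only and not part of this commit:

import paddle.fluid as fluid

# Illustrative toy network; any fluid program with a scalar loss would do.
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)

sgd = fluid.optimizer.SGD(learning_rate=0.01)
# minimize() appends the backward pass, gradient clipping, regularization
# and the optimize ops; with this change those ops are added while the
# default main/startup programs are guarded by program_guard.
optimize_ops, params_grads = sgd.minimize(avg_cost)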
@@ -17,3 +17,4 @@ from __future__ import print_function
 from .program_utils import *
 from .ufind import *
 from .checkport import *
+from .distribute_lookuptable_utils import *
@@ -12,9 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import paddle.fluid.optimizer as optimizer
-import paddle.fluid.framework as framework
 LOOKUP_TABLE_TYPE = "lookup_table"
......
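The new `_process_distribute_lookuptable` method relies on `find_distributed_lookup_table(program)` from `distribute_lookuptable_utils`, whose body is not shown in this diff. A hedged sketch of what such a helper presumably does, assuming it scans the global block for `lookup_table` ops marked `is_distributed` (the attribute name and the "W" input are inferred, not confirmed by this diff):

LOOKUP_TABLE_TYPE = "lookup_table"


def find_distributed_lookup_table(program):
    # Return the name of the single distributed lookup-table parameter,
    # or None if the program contains no distributed lookup_table op.
    table_name = None
    for op in program.global_block().ops:
        if op.type == LOOKUP_TABLE_TYPE and op.attr('is_distributed'):
            candidate = op.input("W")[0]
            if table_name is None:
                table_name = candidate
            elif table_name != candidate:
                raise RuntimeError(
                    "all distributed lookup_table ops should share one table")
    return table_name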