提交 461b3e28 编写于 作者: W wangxiao1021

update optimizer

上级 c62683d0
...@@ -19,7 +19,7 @@ import numpy as np ...@@ -19,7 +19,7 @@ import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
from hapi.model import set_device, Model, CrossEntropy, Input from hapi.model import set_device, Model, CrossEntropy, Input
from hapi.configure import Config from hapi.configure import Config
from hapi.text.senta import SentaProcessor, Optimizer from hapi.text.senta import SentaProcessor
from hapi.metrics import Accuracy from hapi.metrics import Accuracy
from models import CNN, BOW, GRU, BiGRU from models import CNN, BOW, GRU, BiGRU
import json import json
...@@ -78,11 +78,7 @@ def train(): ...@@ -78,11 +78,7 @@ def train():
model = BiGRU( args.vocab_size, args.batch_size, model = BiGRU( args.vocab_size, args.batch_size,
args.padding_size) args.padding_size)
optimizer = Optimizer( optimizer = fluid.optimizer.Adagrad(learning_rate=args.lr, parameter_list=model.parameters())
num_train_steps=max_train_steps,
model_cls=model,
learning_rate=args.lr,
parameter_list=model.parameters())
inputs = [Input([None, None], 'int64', name='doc')] inputs = [Input([None, None], 'int64', name='doc')]
labels = [Input([None, 1], 'int64', name='label')] labels = [Input([None, 1], 'int64', name='label')]
......
...@@ -13,4 +13,3 @@ ...@@ -13,4 +13,3 @@
# limitations under the License. # limitations under the License.
from hapi.text.senta.data_processer import SentaProcessor from hapi.text.senta.data_processer import SentaProcessor
from hapi.text.senta.optimization import Optimizer as Optimizer
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
class ConstantLR(LearningRateDecay):
    """Learning-rate schedule that yields the same value on every step.

    Args:
        learning_rate: The value returned by :meth:`step`.
        begin: Forwarded unchanged to ``LearningRateDecay.__init__``.
        step: Forwarded unchanged to ``LearningRateDecay.__init__``.
        dtype: Forwarded unchanged to ``LearningRateDecay.__init__``.
    """

    def __init__(self, learning_rate, begin=0, step=1, dtype='float32'):
        super(ConstantLR, self).__init__(begin=begin, step=step, dtype=dtype)
        self.learning_rate = learning_rate

    def step(self):
        """Return the configured learning rate, untouched by the step count."""
        return self.learning_rate
class LinearDecay(LearningRateDecay):
    """Linear warmup followed by polynomial decay of the learning rate.

    While ``step_num < warmup_steps`` the rate is
    ``learning_rate * step_num / warmup_steps``.  Afterwards it is::

        (learning_rate - end_learning_rate)
            * (1 - step / decay_steps) ** power + end_learning_rate

    which is a straight line when ``power`` is 1.0.

    Args:
        learning_rate: Peak learning rate reached at the end of warmup.
        warmup_steps: Number of steps of linear warmup.
        decay_steps: Number of steps over which the rate decays.
        end_learning_rate: Value the decay converges to.
        power: Exponent of the polynomial decay.
        cycle: When true, ``decay_steps`` is stretched by
            ``ceil(step_num / decay_steps)`` instead of clamping the step.
        begin: Forwarded unchanged to ``LearningRateDecay.__init__``.
        step: Forwarded unchanged to ``LearningRateDecay.__init__``.
        dtype: Forwarded unchanged to ``LearningRateDecay.__init__``.
    """

    def __init__(self,
                 learning_rate,
                 warmup_steps,
                 decay_steps,
                 end_learning_rate=0.0001,
                 power=1.0,
                 cycle=False,
                 begin=0,
                 step=1,
                 dtype='float32'):
        super(LinearDecay, self).__init__(begin, step, dtype)
        self.learning_rate = learning_rate
        self.warmup_steps = warmup_steps
        self.decay_steps = decay_steps
        self.end_learning_rate = end_learning_rate
        self.power = power
        self.cycle = cycle

    def step(self):
        """Compute the learning rate for the current ``self.step_num``.

        ``self.step_num`` is not set in this class; presumably it is
        maintained by ``LearningRateDecay`` -- confirm against the base class.
        """
        # Warmup phase: scale the peak rate by the fraction of warmup done.
        if self.step_num < self.warmup_steps:
            warmup_lr = self.learning_rate * (self.step_num /
                                              self.warmup_steps)
            return self.create_lr_var(warmup_lr)

        current_step = self.step_num
        horizon = self.decay_steps
        if self.cycle:
            # Stretch the horizon to the next multiple of decay_steps.
            periods = fluid.layers.ceil(
                self.create_lr_var(current_step / float(self.decay_steps)))
            if current_step == 0:
                periods = self.create_lr_var(1.0)
            horizon = self.decay_steps * periods
        else:
            # Clamp so the rate stays at end_learning_rate past decay_steps.
            if current_step < self.decay_steps:
                clamped = current_step
            else:
                clamped = self.decay_steps
            current_step = self.create_lr_var(clamped)

        # The same decay formula is used for the cycling and clamped modes.
        lr_span = self.learning_rate - self.end_learning_rate
        remaining = (1 - current_step / horizon) ** self.power
        decayed_lr = lr_span * remaining + self.end_learning_rate
        return decayed_lr
class Optimizer(object):
    """Adam optimizer with optional LR scheduling and decoupled weight decay.

    Args:
        num_train_steps: Total number of training steps; used as the decay
            horizon of the ``'linear_warmup_decay'`` schedule.
        learning_rate: Base learning rate.
        model_cls: Object exposing ``parameters()``; its parameters are
            snapshotted before each update when weight decay is enabled.
        weight_decay: Decay coefficient; values ``<= 0`` disable decay.
        warmup_steps: Number of warmup steps.  When ``0`` a constant
            learning rate is used and ``scheduler`` is ignored.
        scheduler: ``'noam_decay'`` or ``'linear_warmup_decay'``.
        loss_scaling: Stored on the instance; not used by the code in this
            class.
        parameter_list: Parameters handed to ``fluid.optimizer.Adam``.

    Raises:
        ValueError: If ``warmup_steps > 0`` and ``scheduler`` is not one of
            the supported names.
    """

    def __init__(self,
                 num_train_steps,
                 learning_rate,
                 model_cls,
                 weight_decay=0,
                 warmup_steps=0,
                 scheduler='linear_warmup_decay',
                 loss_scaling=1.0,
                 parameter_list=None):
        self.warmup_steps = warmup_steps
        self.num_train_steps = num_train_steps
        self.learning_rate = learning_rate
        self.model_cls = model_cls
        self.weight_decay = weight_decay
        self.scheduler = scheduler
        self.loss_scaling = loss_scaling
        self.parameter_list = parameter_list
        self.scheduled_lr = 0.0
        self.optimizer = self.lr_schedule()

    def lr_schedule(self):
        """Build the LR schedule, store it in ``self.scheduled_lr`` and
        return an Adam optimizer driven by it."""
        if self.warmup_steps > 0:
            if self.scheduler == 'noam_decay':
                self.scheduled_lr = fluid.dygraph.NoamDecay(
                    1 / (self.warmup_steps * (self.learning_rate**2)),
                    self.warmup_steps)
            elif self.scheduler == 'linear_warmup_decay':
                self.scheduled_lr = LinearDecay(self.learning_rate,
                                                self.warmup_steps,
                                                self.num_train_steps, 0.0)
            else:
                raise ValueError("Unknown learning rate scheduler, should be "
                                 "'noam_decay' or 'linear_warmup_decay'")
        else:
            self.scheduled_lr = ConstantLR(self.learning_rate)

        # Both the scheduled and the constant case use the same optimizer.
        return fluid.optimizer.Adam(
            learning_rate=self.scheduled_lr,
            parameter_list=self.parameter_list)

    def exclude_from_weight_decay(self, name):
        """Return True for layer-norm and bias parameters, which are not
        decayed."""
        if "layer_norm" in name:
            return True
        return name.endswith(("_bias", "_b", ".b_0"))

    def state_dict(self):
        """Return the wrapped optimizer's state dict."""
        return self.optimizer.state_dict()

    def set_dict(self, state_dict):
        """Load ``state_dict`` into the wrapped optimizer."""
        return self.optimizer.set_dict(state_dict)

    def get_opti_var_name_list(self):
        """Delegate to the wrapped optimizer's ``get_opti_var_name_list``."""
        return self.optimizer.get_opti_var_name_list()

    def current_step_lr(self):
        """Delegate to the wrapped optimizer's ``current_step_lr``."""
        return self.optimizer.current_step_lr()

    def minimize(self, loss, use_data_parallel=False, model=None):
        """Run backward, apply the Adam update, then apply weight decay.

        Args:
            loss: Loss variable; ``loss.backward()`` is called on it.
            use_data_parallel: When true, ``model.scale_loss`` and
                ``model.apply_collective_grads`` are invoked around backward.
            model: Data-parallel model wrapper; required when
                ``use_data_parallel`` is true.

        Raises:
            ValueError: If ``use_data_parallel`` is true and ``model`` is None.
        """
        if use_data_parallel:
            # Validate before the first use of ``model`` rather than after.
            if model is None:
                raise ValueError(
                    "model must be provided when use_data_parallel is True")
            loss = model.scale_loss(loss)
        loss.backward()

        decay_enabled = self.weight_decay > 0
        pre_update = {}
        if decay_enabled:
            # Decay is computed from the values *before* the Adam step.
            for param in self.model_cls.parameters():
                pre_update[param.name] = param.numpy().copy()

        if use_data_parallel:
            model.apply_collective_grads()

        _, param_grads = self.optimizer.minimize(loss)
        if not decay_enabled:
            return

        # Evaluate the schedule once; it yields either a plain number or a
        # framework variable that must be converted to numpy.
        lr = self.scheduled_lr.step()
        if hasattr(lr, "numpy"):
            lr = lr.numpy()

        for param, _ in param_grads:
            if self.exclude_from_weight_decay(param.name):
                continue
            current = param.numpy()
            decayed = current - pre_update[param.name] * self.weight_decay * lr
            # Write the result back into the parameter; rebinding the loop
            # variable would leave the model unchanged.
            param.set_value(decayed.astype(current.dtype, copy=False))
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册