Unverified · Commit 6c87d487 · Authored by Zhou Wei · Committed by GitHub

fix grad_clip in dygraph mode, grad_clip strategy has been upgraded since Paddle2.0 (#4541)

Parent 53723856
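Every file changed below applies the same migration: since Paddle 2.0, a GradientClipByGlobalNorm object is passed to the optimizer's grad_clip argument at construction time, and minimize() no longer accepts a grad_clip keyword. As a rough, minimal sketch of the new-style dygraph usage (not taken from this commit; the toy Linear layer, random input, learning rate, and clip_norm value are illustrative placeholders):

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    # Toy model and data, used only to illustrate the API shape.
    model = fluid.dygraph.Linear(4, 1)
    grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)

    # Paddle >= 2.0 style: the clip object is attached to the optimizer.
    sgd = fluid.optimizer.SGDOptimizer(
        learning_rate=0.01,
        parameter_list=model.parameters(),
        grad_clip=grad_clip)

    x = fluid.dygraph.to_variable(np.random.rand(8, 4).astype('float32'))
    loss = fluid.layers.reduce_mean(model(x))
    loss.backward()
    sgd.minimize(loss)          # no grad_clip argument here any more
    model.clear_gradients()

The diff below performs this move (clip object into the optimizer constructor, out of minimize()) in each of the affected models.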
@@ -361,10 +361,12 @@ def train_ptb_lm():
                 max(i + 1 - epoch_start_decay, 0.0))
             lr_arr.append(new_lr)
 
+        grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
         sgd = AdagradOptimizer(
             parameter_list=ptb_model.parameters(),
             learning_rate=fluid.layers.piecewise_decay(
-                boundaries=bd, values=lr_arr))
+                boundaries=bd, values=lr_arr),
+            grad_clip=grad_clip)
 
         print("parameters:--------------------------------")
         for para in ptb_model.parameters():
@@ -408,7 +410,6 @@ def train_ptb_lm():
         if args.ce:
             print("kpis\ttest_ppl\t%0.3f" % ppl[0])
 
-        grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
         for epoch_id in range(max_epoch):
             ptb_model.train()
             total_loss = 0.0
@@ -434,7 +435,7 @@ def train_ptb_lm():
                 init_hidden = last_hidden
                 dy_loss.backward()
-                sgd.minimize(dy_loss, grad_clip=grad_clip)
+                sgd.minimize(dy_loss)
                 ptb_model.clear_gradients()
                 total_loss += out_loss
                 iters += num_steps
......
@@ -73,8 +73,10 @@ def train(args):
     else:
         learning_rate = LR
-    optimizer = fluid.optimizer.Adam(learning_rate=learning_rate, parameter_list=ocr_attention.parameters())
+    grad_clip = fluid.clip.GradientClipByGlobalNorm(args.gradient_clip)
+    optimizer = fluid.optimizer.Adam(
+        learning_rate=learning_rate, parameter_list=ocr_attention.parameters(), grad_clip=grad_clip)
 
     train_reader = data_reader.data_reader(
         args.batch_size,
@@ -122,7 +124,7 @@ def train(args):
             total_loss += avg_loss.numpy()
             avg_loss.backward()
-            optimizer.minimize(avg_loss, grad_clip=grad_clip)
+            optimizer.minimize(avg_loss)
             ocr_attention.clear_gradients()
             if batch_id > 0 and batch_id % args.log_period == 0:
......
@@ -332,8 +332,11 @@ def train_ptb_lm():
                 max(i + 1 - epoch_start_decay, 0.0))
             lr_arr.append(new_lr)
 
-        sgd = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
-            boundaries=bd, values=lr_arr), parameter_list=ptb_model.parameters())
+        grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
+        sgd = SGDOptimizer(
+            learning_rate=fluid.layers.piecewise_decay(boundaries=bd, values=lr_arr),
+            parameter_list=ptb_model.parameters(),
+            grad_clip=grad_clip)
 
         def eval(model, data):
             print("begin to eval")
@@ -371,7 +374,6 @@ def train_ptb_lm():
         ce_time = []
         ce_ppl = []
 
-        grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
         for epoch_id in range(max_epoch):
             ptb_model.train()
             total_loss = 0.0
@@ -402,7 +404,7 @@ def train_ptb_lm():
                 out_loss = dy_loss.numpy()
 
                 dy_loss.backward()
-                sgd.minimize(dy_loss, grad_clip=grad_clip)
+                sgd.minimize(dy_loss)
                 ptb_model.clear_gradients()
                 total_loss += out_loss
......
@@ -88,9 +88,9 @@ def main():
     lr = args.learning_rate
     opt_type = args.optimizer
     if opt_type == "sgd":
-        optimizer = fluid.optimizer.SGD(lr, parameter_list=model.parameters())
+        optimizer = fluid.optimizer.SGD(lr, parameter_list=model.parameters(), grad_clip = gloabl_norm_clip)
     elif opt_type == "adam":
-        optimizer = fluid.optimizer.Adam(lr, parameter_list=model.parameters())
+        optimizer = fluid.optimizer.Adam(lr, parameter_list=model.parameters(), grad_clip = gloabl_norm_clip)
     else:
         print("only support [sgd|adam]")
         raise Exception("opt type not support")
@@ -161,7 +161,7 @@ def main():
             loss = model(input_data_feed)
             # print(loss.numpy()[0])
             loss.backward()
-            optimizer.minimize(loss, grad_clip = gloabl_norm_clip)
+            optimizer.minimize(loss)
             model.clear_gradients()
             total_loss += loss * batch_size
             batch_end_time = time.time()
......