Unverified · Commit 6c87d487, authored by Zhou Wei, committed by GitHub

Fix grad_clip in dygraph mode; the grad_clip strategy has been upgraded since Paddle 2.0 (#4541)

Parent: 53723856
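Every file touched below makes the same change: the GradientClipByGlobalNorm object is now passed to the optimizer constructor through its grad_clip argument, and the grad_clip keyword is dropped from optimizer.minimize(). The following minimal dygraph sketch shows the new pattern; the layer, learning rate, and clipping threshold are placeholder values for illustration and are not taken from the diff.

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    model = fluid.dygraph.Linear(4, 2)   # placeholder model
    max_grad_norm = 5.0                  # placeholder clipping threshold

    # Since Paddle 2.0: build the clip object once and hand it to the optimizer.
    grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
    sgd = fluid.optimizer.SGDOptimizer(
        learning_rate=0.01,
        parameter_list=model.parameters(),
        grad_clip=grad_clip)

    x = fluid.dygraph.to_variable(np.random.rand(8, 4).astype("float32"))
    loss = fluid.layers.reduce_mean(model(x))
    loss.backward()
    # Before Paddle 2.0 this call took grad_clip=...; now clipping happens
    # inside minimize() because the optimizer owns the clip object.
    sgd.minimize(loss)
    model.clear_gradients()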
@@ -361,10 +361,12 @@ def train_ptb_lm():
                 max(i + 1 - epoch_start_decay, 0.0))
             lr_arr.append(new_lr)
 
+        grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
         sgd = AdagradOptimizer(
             parameter_list=ptb_model.parameters(),
             learning_rate=fluid.layers.piecewise_decay(
-                boundaries=bd, values=lr_arr))
+                boundaries=bd, values=lr_arr),
+            grad_clip=grad_clip)
 
         print("parameters:--------------------------------")
         for para in ptb_model.parameters():
@@ -408,7 +410,6 @@ def train_ptb_lm():
             if args.ce:
                 print("kpis\ttest_ppl\t%0.3f" % ppl[0])
 
-        grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
         for epoch_id in range(max_epoch):
             ptb_model.train()
             total_loss = 0.0
@@ -434,7 +435,7 @@ def train_ptb_lm():
                 init_hidden = last_hidden
 
                 dy_loss.backward()
-                sgd.minimize(dy_loss, grad_clip=grad_clip)
+                sgd.minimize(dy_loss)
                 ptb_model.clear_gradients()
                 total_loss += out_loss
                 iters += num_steps
...
@@ -73,8 +73,10 @@ def train(args):
     else:
         learning_rate = LR
 
-    optimizer = fluid.optimizer.Adam(learning_rate=learning_rate, parameter_list=ocr_attention.parameters())
     grad_clip = fluid.clip.GradientClipByGlobalNorm(args.gradient_clip)
+    optimizer = fluid.optimizer.Adam(
+        learning_rate=learning_rate, parameter_list=ocr_attention.parameters(), grad_clip=grad_clip)
 
     train_reader = data_reader.data_reader(
         args.batch_size,
@@ -122,7 +124,7 @@ def train(args):
             total_loss += avg_loss.numpy()
 
             avg_loss.backward()
-            optimizer.minimize(avg_loss, grad_clip=grad_clip)
+            optimizer.minimize(avg_loss)
             ocr_attention.clear_gradients()
 
             if batch_id > 0 and batch_id % args.log_period == 0:
...
@@ -332,8 +332,11 @@ def train_ptb_lm():
                 max(i + 1 - epoch_start_decay, 0.0))
             lr_arr.append(new_lr)
 
-        sgd = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
-            boundaries=bd, values=lr_arr), parameter_list=ptb_model.parameters())
+        grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
+        sgd = SGDOptimizer(
+            learning_rate=fluid.layers.piecewise_decay(boundaries=bd, values=lr_arr),
+            parameter_list=ptb_model.parameters(),
+            grad_clip=grad_clip)
 
         def eval(model, data):
             print("begin to eval")
@@ -371,7 +374,6 @@ def train_ptb_lm():
         ce_time = []
         ce_ppl = []
 
-        grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
         for epoch_id in range(max_epoch):
             ptb_model.train()
             total_loss = 0.0
@@ -402,7 +404,7 @@ def train_ptb_lm():
                 out_loss = dy_loss.numpy()
 
                 dy_loss.backward()
-                sgd.minimize(dy_loss, grad_clip=grad_clip)
+                sgd.minimize(dy_loss)
                 ptb_model.clear_gradients()
                 total_loss += out_loss
...
@@ -88,9 +88,9 @@ def main():
     lr = args.learning_rate
     opt_type = args.optimizer
     if opt_type == "sgd":
-        optimizer = fluid.optimizer.SGD(lr, parameter_list=model.parameters())
+        optimizer = fluid.optimizer.SGD(lr, parameter_list=model.parameters(), grad_clip = gloabl_norm_clip)
     elif opt_type == "adam":
-        optimizer = fluid.optimizer.Adam(lr, parameter_list=model.parameters())
+        optimizer = fluid.optimizer.Adam(lr, parameter_list=model.parameters(), grad_clip = gloabl_norm_clip)
     else:
         print("only support [sgd|adam]")
         raise Exception("opt type not support")
@@ -161,7 +161,7 @@ def main():
             loss = model(input_data_feed)
             # print(loss.numpy()[0])
             loss.backward()
-            optimizer.minimize(loss, grad_clip = gloabl_norm_clip)
+            optimizer.minimize(loss)
             model.clear_gradients()
             total_loss += loss * batch_size
             batch_end_time = time.time()
...