Commit 255f2d28 authored by xixiaoyao

delete fp16

Parent 696ef944
@@ -31,6 +31,5 @@ max_seq_len: 512
 use_ema: True
 ema_decay: 0.9999
 random_seed: 0
-use_fp16: False
 loss_scaling: 1.0
@@ -275,14 +275,14 @@ def train(multitask_config):
         exe,
         args.pretrain_model_path,
         main_program=startup_prog,
-        use_fp16=args.use_fp16)
+        use_fp16=False)
     if args.checkpoint_path:
         if os.path.exists(args.checkpoint_path):
             init_checkpoint(
                 exe,
                 args.checkpoint_path,
                 main_program=startup_prog,
-                use_fp16=args.use_fp16)
+                use_fp16=False)
         else:
             os.makedirs(args.checkpoint_path)
@@ -294,7 +294,7 @@ def train(multitask_config):
             exe,
             args.checkpoint_path,
             main_program=test_prog,
-            use_fp16=args.use_fp16)
+            use_fp16=False)
     if args.do_train:
         print('start training...')
...
@@ -19,8 +19,6 @@ from __future__ import print_function
 import numpy as np
 import paddle.fluid as fluid
-from utils.fp16 import create_master_params_grads, master_param_to_train_param

 def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
     """ Applies linear warmup of learning rate from 0 and decay to 0."""
@@ -73,8 +71,6 @@ def optimization(loss, programs, args):
     clip_norm_thres = 1.0
     # When using mixed precision training, scale the gradient clip threshold
     # by loss_scaling
-    if args.use_fp16 and args.loss_scaling > 1.0:
-        clip_norm_thres *= args.loss_scaling
     fluid.clip.set_gradient_clip(
         clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm_thres))
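The removed branch existed because loss scaling multiplies every gradient by `loss_scaling`, so the global-norm threshold must grow by the same factor for clipping to act identically on the underlying gradients. A small NumPy illustration of that equivalence (the helper below is a sketch, not the `fluid.clip` implementation):

import numpy as np

def clip_by_global_norm(grads, clip_norm):
    """Rescale gradients so their joint L2 norm is at most clip_norm
    (a NumPy stand-in for GradientClipByGlobalNorm)."""
    global_norm = np.sqrt(sum(np.sum(g * g) for g in grads))
    scale = min(1.0, clip_norm / max(global_norm, 1e-12))
    return [g * scale for g in grads]

loss_scaling = 128.0
grads = [np.random.randn(3, 3), np.random.randn(5)]

# Clipping the true gradients at 1.0 is equivalent to clipping the scaled
# gradients at 1.0 * loss_scaling -- which is why the deleted lines
# multiplied clip_norm_thres by args.loss_scaling.
clipped = clip_by_global_norm(grads, clip_norm=1.0)
clipped_scaled = clip_by_global_norm([g * loss_scaling for g in grads],
                                     clip_norm=1.0 * loss_scaling)
assert all(np.allclose(a * loss_scaling, b)
           for a, b in zip(clipped, clipped_scaled))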
@@ -89,44 +85,19 @@ def optimization(loss, programs, args):
     param_list = dict()

-    if args.use_fp16:
-        param_grads = optimizer.backward(loss)
-        master_param_grads = create_master_params_grads(
-            param_grads, train_program, startup_prog, args.loss_scaling)
-
-        for param, _ in master_param_grads:
-            param_list[param.name] = param * 1.0
-            param_list[param.name].stop_gradient = True
-
-        optimizer.apply_gradients(master_param_grads)
-
-        if args.weight_decay > 0:
-            for param, grad in master_param_grads:
-                if exclude_from_weight_decay(param.name.rstrip(".master")):
-                    continue
-                with param.block.program._optimized_guard(
-                    [param, grad]), fluid.framework.name_scope("weight_decay"):
-                    updated_param = param - param_list[
-                        param.name] * weight_decay * scheduled_lr
-                    fluid.layers.assign(output=param, input=updated_param)
-
-        master_param_to_train_param(master_param_grads, param_grads,
-                                    train_program)
-    else:
-        for param in train_program.global_block().all_parameters():
-            param_list[param.name] = param * 1.0
-            param_list[param.name].stop_gradient = True
-
-        _, param_grads = optimizer.minimize(loss)
-
-        if args.weight_decay > 0:
-            for param, grad in param_grads:
-                if exclude_from_weight_decay(param.name):
-                    continue
-                with param.block.program._optimized_guard(
-                    [param, grad]), fluid.framework.name_scope("weight_decay"):
-                    updated_param = param - param_list[
-                        param.name] * args.weight_decay * scheduled_lr
-                    fluid.layers.assign(output=param, input=updated_param)
+    for param in train_program.global_block().all_parameters():
+        param_list[param.name] = param * 1.0
+        param_list[param.name].stop_gradient = True
+
+    _, param_grads = optimizer.minimize(loss)
+
+    if args.weight_decay > 0:
+        for param, grad in param_grads:
+            if exclude_from_weight_decay(param.name):
+                continue
+            with param.block.program._optimized_guard(
+                [param, grad]), fluid.framework.name_scope("weight_decay"):
+                updated_param = param - param_list[
+                    param.name] * args.weight_decay * scheduled_lr
+                fluid.layers.assign(output=param, input=updated_param)
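The surviving branch implements decoupled weight decay by hand: it snapshots every parameter before the optimizer step, lets the optimizer update the parameters, then subtracts `weight_decay * scheduled_lr` times the snapshot from each non-excluded parameter. A plain-NumPy sketch of that pattern, with SGD standing in for the real optimizer (function and argument names here are illustrative only):

import numpy as np

def train_step_with_weight_decay(params, grads, scheduled_lr, weight_decay,
                                 exclude=lambda name: False):
    """One step of the pattern above: snapshot the parameters, run the
    optimizer update (plain SGD as a stand-in), then subtract
    weight_decay * scheduled_lr * snapshot from each non-excluded param."""
    # param_list[param.name] = param * 1.0  (detached copy of the old value)
    snapshots = {name: p.copy() for name, p in params.items()}
    # _, param_grads = optimizer.minimize(loss)
    for name in params:
        params[name] = params[name] - scheduled_lr * grads[name]
    # the weight-decay loop, skipping excluded parameters
    for name in params:
        if exclude(name):
            continue
        params[name] = params[name] - snapshots[name] * weight_decay * scheduled_lr
    return params

params = {"w": np.ones(3), "layer_norm_scale": np.ones(3)}
grads = {"w": np.full(3, 0.5), "layer_norm_scale": np.full(3, 0.5)}
train_step_with_weight_decay(params, grads, scheduled_lr=0.1, weight_decay=0.01,
                             exclude=lambda n: "layer_norm" in n)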
@@ -25,9 +25,6 @@ def compute_loss(output_tensors, args=None):
         logits=logits, label=labels, return_softmax=True)
     loss = fluid.layers.mean(x=ce_loss)
-    if args.use_fp16 and args.loss_scaling > 1.0:
-        loss *= args.loss_scaling
-
     return loss
...
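For context, the deleted lines implemented static loss scaling: multiplying the loss by `loss_scaling` scales every gradient by the same factor so that small fp16 gradients do not underflow, and the gradients are divided back by the same factor before the parameter update. A self-contained NumPy sketch of the classification loss with that optional scaling (assumed helper names, not this repository's API):

import numpy as np

def softmax_cross_entropy(logits, labels):
    """Row-wise softmax cross-entropy (what
    fluid.layers.softmax_with_cross_entropy computes per example)."""
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -log_probs[np.arange(len(labels)), labels]

def classification_loss(logits, labels, loss_scaling=1.0):
    """Mean cross-entropy; the deleted branch multiplied it by loss_scaling
    so the gradients of the scaled loss stay representable in fp16, to be
    divided back by loss_scaling before the update."""
    loss = softmax_cross_entropy(logits, labels).mean()
    if loss_scaling > 1.0:
        loss = loss * loss_scaling
    return loss

logits = np.random.randn(4, 3)
labels = np.array([0, 2, 1, 1])
print(classification_loss(logits, labels))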
@@ -42,7 +42,7 @@ def create_model(reader_input, base_model=None, is_training=True, args=None):
     _hidden_act = config['hidden_act']
     _word_emb_name = "word_embedding"
-    _dtype = "float16" if args.use_fp16 else "float32"
+    _dtype = "float32"
     _param_initializer = fluid.initializer.TruncatedNormal(
         scale=config['initializer_range'])
...
@@ -35,8 +35,6 @@ def compute_loss(output_tensors, args=None):
     start_loss = _compute_single_loss(start_logits, start_positions)
     end_loss = _compute_single_loss(end_logits, end_positions)
     total_loss = (start_loss + end_loss) / 2.0
-    if args.use_fp16 and args.loss_scaling > 1.0:
-        total_loss = total_loss * args.loss_scaling
     return total_loss
...
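The reading-comprehension loss above is the average of two cross-entropies, one over the start-position logits and one over the end-position logits; the deleted lines only rescaled that value for fp16. A NumPy sketch of the unscaled computation (illustrative names, not the repository's `_compute_single_loss`):

import numpy as np

def mean_cross_entropy(logits, positions):
    """Mean softmax cross-entropy over a batch of position labels."""
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -log_probs[np.arange(len(positions)), positions].mean()

def span_loss(start_logits, end_logits, start_positions, end_positions):
    """total_loss = (start_loss + end_loss) / 2.0, as in the hunk above."""
    start_loss = mean_cross_entropy(start_logits, start_positions)
    end_loss = mean_cross_entropy(end_logits, end_positions)
    return (start_loss + end_loss) / 2.0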