Commit 6015332e authored by ouyangyu

reduce mean

Parent 3583a5f4
@@ -86,8 +86,7 @@ def TrainNet():
     else:
         loss = flow.nn.sparse_softmax_cross_entropy_with_logits(labels, logits, name="softmax_loss")
-        if not args.use_fp16:
-            loss = flow.math.reduce_mean(loss)
+        loss = flow.math.reduce_mean(loss)
         predictions = flow.nn.softmax(logits)
         outputs = {"loss": loss, "predictions": predictions, "labels": labels}
......
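Note on the hunk above: the fp16 guard around reduce_mean is dropped, so the per-example softmax cross-entropy is always averaged to a scalar before it reaches the optimizer. A minimal sketch of the resulting loss path, reusing only calls visible in the hunk (labels, logits and args come from the surrounding TrainNet body):

    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
        labels, logits, name="softmax_loss")
    # reduce_mean now runs for both fp32 and fp16 jobs, so the optimizer
    # always receives a scalar loss instead of a per-example vector.
    loss = flow.math.reduce_mean(loss)
    predictions = flow.nn.softmax(logits)
    outputs = {"loss": loss, "predictions": predictions, "labels": labels}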
@@ -101,9 +101,11 @@ def set_up_optimizer(loss, args):
             grad_clipping = grad_clipping
         ).minimize(loss)
     elif args.optimizer=='adam':
+        loss_scale_policy = None
+        if args.use_fp16:
+            loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale(increment_period=2000);
         if args.wd > 0 and args.wd < 1.0 :
             print("Optimizer: AdamW")
-            loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale(increment_period=20);
             flow.optimizer.AdamW(
                 lr_scheduler = lr_scheduler,
                 weight_decay = args.wd,
@@ -116,7 +118,8 @@ def set_up_optimizer(loss, args):
             print("Optimizer: Adam")
             flow.optimizer.Adam(lr_scheduler=lr_scheduler,
                     grad_clipping=grad_clipping,
-                    epsilon=args.epsilon
+                    epsilon=args.epsilon,
+                    loss_scale_policy=loss_scale_policy
             ).minimize(loss)
     elif args.optimizer=='rmsprop':
         print("Optimizer: RMSProp")
......
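The two hunks above rework loss scaling in set_up_optimizer: the dynamic loss-scale policy is now created only when args.use_fp16 is set (with increment_period raised from 20 to 2000), and it is passed explicitly to the Adam constructor. A sketch of the resulting Adam branch, assuming lr_scheduler and grad_clipping are built earlier in the function as in the unchanged code:

    loss_scale_policy = None
    if args.use_fp16:
        # Dynamic loss scaling only matters for mixed-precision training;
        # the scale is now grown every 2000 steps instead of every 20.
        loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale(
            increment_period=2000)

    flow.optimizer.Adam(
        lr_scheduler=lr_scheduler,
        grad_clipping=grad_clipping,
        epsilon=args.epsilon,
        loss_scale_policy=loss_scale_policy,  # None leaves fp32 runs unscaled
    ).minimize(loss)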
@@ -82,9 +82,8 @@ def PreTrain(
         initializer_range=initializer_range,
     )
     with flow.scope.namespace("cls-loss"):
-        if not use_fp16:
-            lm_loss = flow.math.reduce_mean(lm_loss)
-            ns_loss = flow.math.reduce_mean(ns_loss)
+        lm_loss = flow.math.reduce_mean(lm_loss)
+        ns_loss = flow.math.reduce_mean(ns_loss)
         total_loss = lm_loss + ns_loss
     return total_loss, lm_loss, ns_loss
......
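As in TrainNet, the pre-training losses are now reduced to scalars unconditionally instead of only for fp32 jobs. A sketch of the post-change "cls-loss" scope (lm_loss and ns_loss are the masked-LM and next-sentence losses produced earlier in PreTrain):

    with flow.scope.namespace("cls-loss"):
        # Both partial losses are averaged regardless of use_fp16,
        # so total_loss is always a scalar.
        lm_loss = flow.math.reduce_mean(lm_loss)
        ns_loss = flow.math.reduce_mean(ns_loss)
        total_loss = lm_loss + ns_loss
    return total_loss, lm_loss, ns_loss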
@@ -166,7 +166,9 @@ def CreateOptimizer(args):
     lr_warmup = flow.optimizer.warmup.linear(warmup_batches, 0)
     lr_scheduler = flow.optimizer.PolynomialSchduler(args.learning_rate, args.iter_num, 0.0,
                                                      warmup=lr_warmup)
-    loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale(increment_period=20);
+    loss_scale_policy = None
+    if args.use_fp16:
+        loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale(increment_period=2000);
     return flow.optimizer.AdamW(lr_scheduler, epsilon=1e-6, weight_decay=args.weight_decay_rate,
                                 weight_decay_excludes=["bias", "LayerNorm", "layer_norm"],
                                 grad_clipping=flow.optimizer.grad_clipping.by_global_norm(1.0),
......
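CreateOptimizer gets the same treatment: the loss-scale policy becomes conditional on args.use_fp16 with a 2000-step increment period. A sketch of the function after the change; PolynomialSchduler is spelled as in the diff, and passing loss_scale_policy into the AdamW call is an assumption, since that argument sits in the truncated part of the hunk:

    lr_warmup = flow.optimizer.warmup.linear(warmup_batches, 0)
    lr_scheduler = flow.optimizer.PolynomialSchduler(
        args.learning_rate, args.iter_num, 0.0, warmup=lr_warmup)

    loss_scale_policy = None
    if args.use_fp16:
        # Only mixed-precision runs get dynamic loss scaling.
        loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale(
            increment_period=2000)

    return flow.optimizer.AdamW(
        lr_scheduler, epsilon=1e-6, weight_decay=args.weight_decay_rate,
        weight_decay_excludes=["bias", "LayerNorm", "layer_norm"],
        grad_clipping=flow.optimizer.grad_clipping.by_global_norm(1.0),
        loss_scale_policy=loss_scale_policy)  # assumed to be threaded through here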