diff --git a/Classification/cnns/of_cnn_train_val.py b/Classification/cnns/of_cnn_train_val.py
index 31a924cc879d6f001e998ce8edbc830ddfaba95a..ee968e403f6e269a856b086bb9f7f976f1ab0eb2 100755
--- a/Classification/cnns/of_cnn_train_val.py
+++ b/Classification/cnns/of_cnn_train_val.py
@@ -86,8 +86,7 @@ def TrainNet():
     else:
         loss = flow.nn.sparse_softmax_cross_entropy_with_logits(labels, logits,
                                                                 name="softmax_loss")
-    if not args.use_fp16:
-        loss = flow.math.reduce_mean(loss)
+    loss = flow.math.reduce_mean(loss)
 
     predictions = flow.nn.softmax(logits)
     outputs = {"loss": loss, "predictions": predictions, "labels": labels}
diff --git a/Classification/cnns/optimizer_util.py b/Classification/cnns/optimizer_util.py
index 55814b9185d3b09a939d859e0b7fadc7ce0f32ca..6d6812505cd9d7179d034d19e0848cdc49c13487 100755
--- a/Classification/cnns/optimizer_util.py
+++ b/Classification/cnns/optimizer_util.py
@@ -101,9 +101,11 @@ def set_up_optimizer(loss, args):
             grad_clipping = grad_clipping
         ).minimize(loss)
     elif args.optimizer=='adam':
+        loss_scale_policy = None
+        if args.use_fp16:
+            loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale(increment_period=2000);
         if args.wd > 0 and args.wd < 1.0 :
             print("Optimizer: AdamW")
-            loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale(increment_period=20);
             flow.optimizer.AdamW(
                 lr_scheduler = lr_scheduler,
                 weight_decay = args.wd,
@@ -116,7 +118,8 @@
             print("Optimizer: Adam")
             flow.optimizer.Adam(lr_scheduler=lr_scheduler,
                                 grad_clipping=grad_clipping,
-                                epsilon=args.epsilon
+                                epsilon=args.epsilon,
+                                loss_scale_policy=loss_scale_policy
             ).minimize(loss)
     elif args.optimizer=='rmsprop':
         print("Optimizer: RMSProp")
diff --git a/LanguageModeling/BERT/pretrain.py b/LanguageModeling/BERT/pretrain.py
index 7b212a87465399d1ee357f1ca8a1e59af1d69569..bc983e87acd81b4446d8cbde06514d6ad31e7ab3 100755
--- a/LanguageModeling/BERT/pretrain.py
+++ b/LanguageModeling/BERT/pretrain.py
@@ -82,9 +82,8 @@ def PreTrain(
         initializer_range=initializer_range,
     )
     with flow.scope.namespace("cls-loss"):
-        if not use_fp16:
-            lm_loss = flow.math.reduce_mean(lm_loss)
-            ns_loss = flow.math.reduce_mean(ns_loss)
+        lm_loss = flow.math.reduce_mean(lm_loss)
+        ns_loss = flow.math.reduce_mean(ns_loss)
         total_loss = lm_loss + ns_loss
     return total_loss, lm_loss, ns_loss
 
diff --git a/LanguageModeling/BERT/util.py b/LanguageModeling/BERT/util.py
index 325d5ffbfce59e11c9a48679d1edb63ecda6d370..5401eb2aaefa8c6a456ba86d9b1f15632266e460 100755
--- a/LanguageModeling/BERT/util.py
+++ b/LanguageModeling/BERT/util.py
@@ -166,7 +166,9 @@ def CreateOptimizer(args):
     lr_warmup = flow.optimizer.warmup.linear(warmup_batches, 0)
     lr_scheduler = flow.optimizer.PolynomialSchduler(args.learning_rate, args.iter_num, 0.0,
                                                      warmup=lr_warmup)
-    loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale(increment_period=20);
+    loss_scale_policy = None
+    if args.use_fp16:
+        loss_scale_policy = flow.optimizer.loss_scale.dynamic_loss_scale(increment_period=2000);
     return flow.optimizer.AdamW(lr_scheduler, epsilon=1e-6, weight_decay=args.weight_decay_rate,
                                 weight_decay_excludes=["bias", "LayerNorm", "layer_norm"],
                                 grad_clipping=flow.optimizer.grad_clipping.by_global_norm(1.0),
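
Note: the conditional loss-scale setup added at all three optimizer call sites above, as a minimal standalone sketch. The make_loss_scale_policy helper and its use_fp16 argument are illustrative and not part of the patch; the flow.optimizer calls are the ones that appear in the diff.

import oneflow as flow

def make_loss_scale_policy(use_fp16):
    # Dynamic loss scaling is only meaningful for mixed-precision (FP16) runs;
    # FP32 training gets loss_scale_policy=None and behaves as before.
    if use_fp16:
        return flow.optimizer.loss_scale.dynamic_loss_scale(increment_period=2000)
    return None

# Usage inside the optimizer setup, e.g.:
#   flow.optimizer.Adam(lr_scheduler=lr_scheduler,
#                       grad_clipping=grad_clipping,
#                       epsilon=args.epsilon,
#                       loss_scale_policy=make_loss_scale_policy(args.use_fp16)
#                      ).minimize(loss)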