Commit 0ae082d3 authored by mir-of

format

Parent 38fe8151
@@ -5,6 +5,7 @@ from __future__ import print_function
import oneflow as flow
from optimizer_util import get_optimizer
def _default_config(args):
config = flow.function_config()
config.default_distribute_strategy(flow.distribute.consistent_strategy())
@@ -13,11 +14,12 @@ def _default_config(args):
config.enable_auto_mixed_precision(True)
return config
def get_train_config(args):
train_config = _default_config(args)
train_config.train.primary_lr(args.learning_rate)
train_config.disable_all_reduce_sequence(False)
#train_config.cudnn_conv_enable_pseudo_half(True)
# train_config.cudnn_conv_enable_pseudo_half(True)
train_config.all_reduce_group_min_mbyte(8)
train_config.all_reduce_group_num(128)
# train_config.all_reduce_lazy_ratio(0)
@@ -28,12 +30,12 @@ def get_train_config(args):
if args.use_boxing_v2:
train_config.use_boxing_v2(True)
train_config.prune_parallel_cast_ops(True)
train_config.train.model_update_conf(get_optimizer(args))
train_config.enable_inplace(True)
return train_config
def get_val_config(args):
return _default_config(args)
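Note: these two factories are the glue between this file and the job definitions changed later in this commit. A minimal sketch of the binding, using only calls that appear in this diff (args is assumed to come from configs.get_parser().parse_args(), as in the scripts below):

    import oneflow as flow
    from job_function_util import get_train_config, get_val_config

    # The function_config is fixed at decoration time: the training job
    # carries the optimizer and primary_lr settings, the inference job only
    # the defaults from _default_config(args).
    @flow.function(get_train_config(args))
    def TrainNet():
        ...

    @flow.function(get_val_config(args))
    def InferenceNet():
        ...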
@@ -3,26 +3,24 @@ from __future__ import division
from __future__ import print_function
import os
import time
import math
import numpy as np
import config as configs
parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)
import oneflow as flow
from util import Snapshot, Summary, InitNodes, Metric
import ofrecord_util
import config as configs
from util import Snapshot, Summary, InitNodes, Metric
from job_function_util import get_train_config, get_val_config
import oneflow as flow
import alexnet_model
import vgg_model
import resnet_model
import inception_model
import resnet_model
import vgg_model
import alexnet_model
parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)
total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device
@@ -36,7 +34,7 @@ model_dict = {
"resnet50": resnet_model.resnet50,
"vgg16": vgg_model.vgg16,
"alexnet": alexnet_model.alexnet,
"inceptionv3":inception_model.inceptionv3,
"inceptionv3": inception_model.inceptionv3,
}
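Model selection is a plain dict lookup keyed by the --model argument; a short illustration built from the lines in this hunk:

    # e.g. --model inceptionv3 resolves to inception_model.inceptionv3;
    # need_transpose is disabled when the new dataloader already yields
    # images in the layout the models expect.
    model_fn = model_dict[args.model]
    logits = model_fn(images, need_transpose=not args.use_new_dataloader)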
@@ -47,6 +45,7 @@ if args.use_boxing_v2:
flow.config.collective_boxing.nccl_fusion_threshold_mb(8)
flow.config.collective_boxing.nccl_fusion_all_reduce_use_buffer(False)
@flow.function(get_train_config(args))
def TrainNet():
if args.train_data_dir:
@@ -61,12 +60,14 @@ def TrainNet():
print("Loading synthetic data.")
(labels, images) = ofrecord_util.load_synthetic(args)
logits = model_dict[args.model](images, need_transpose=not args.use_new_dataloader)
loss = flow.nn.sparse_softmax_cross_entropy_with_logits(labels, logits, name="softmax_loss")
logits = model_dict[args.model](
images, need_transpose=not args.use_new_dataloader)
loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
labels, logits, name="softmax_loss")
loss = flow.math.reduce_mean(loss)
flow.losses.add_loss(loss)
predictions = flow.nn.softmax(logits)
outputs = {"loss": loss, "predictions":predictions, "labels": labels}
outputs = {"loss": loss, "predictions": predictions, "labels": labels}
return outputs
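The dict returned here is what the training loop's callback receives; a hedged sketch of the consumption (the real callback is metric.metric_cb in main() below; debug_cb is a hypothetical stand-in):

    # TrainNet() enqueues one training step; async_get registers a callback
    # that is handed the {"loss", "predictions", "labels"} outputs above.
    def debug_cb(outputs):
        print("step finished, keys:", list(outputs.keys()))

    TrainNet().async_get(debug_cb)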
@@ -83,9 +84,10 @@ def InferenceNet():
print("Loading synthetic data.")
(labels, images) = ofrecord_util.load_synthetic(args)
logits = model_dict[args.model](images, need_transpose=not args.use_new_dataloader)
logits = model_dict[args.model](
images, need_transpose=not args.use_new_dataloader)
predictions = flow.nn.softmax(logits)
outputs = {"predictions":predictions, "labels": labels}
outputs = {"predictions": predictions, "labels": labels}
return outputs
@@ -104,9 +106,7 @@ def main():
batch_size=train_batch_size, loss_key='loss')
for i in range(epoch_size):
TrainNet().async_get(metric.metric_cb(epoch, i))
# if i > 40:#debug
# break
#break
if args.val_data_dir:
metric = Metric(desc='validation', calculate_batches=num_val_steps, summary=summary,
save_summary_steps=num_val_steps, batch_size=val_batch_size)
@@ -3,23 +3,24 @@ from __future__ import division
from __future__ import print_function
import os
import time
import math
import numpy as np
import oneflow as flow
import ofrecord_util
import config as configs
from util import Snapshot, Summary, InitNodes, Metric
from job_function_util import get_val_config
import alexnet_model
import inception_model
import resnet_model
import vgg_model
parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)
from util import Snapshot, Summary, InitNodes, Metric
import ofrecord_util
from job_function_util import get_train_config, get_val_config
import oneflow as flow
#import vgg_model
import resnet_model
#import alexnet_model
total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device
@@ -31,13 +32,15 @@ num_val_steps = int(args.num_val_examples / val_batch_size)
model_dict = {
"resnet50": resnet_model.resnet50,
#"vgg16": vgg_model.vgg16,
#"alexnet": alexnet_model.alexnet,
"vgg16": vgg_model.vgg16,
"alexnet": alexnet_model.alexnet,
"inceptionv3": inception_model.inceptionv3,
}
flow.config.gpu_device_num(args.gpu_num_per_node)
flow.config.enable_debug_mode(True)
@flow.function(get_val_config(args))
def InferenceNet():
if args.val_data_dir:
@@ -50,7 +53,7 @@ def InferenceNet():
logits = model_dict[args.model](images)
predictions = flow.nn.softmax(logits)
outputs = {"predictions":predictions, "labels": labels}
outputs = {"predictions": predictions, "labels": labels}
return outputs
@@ -64,7 +67,8 @@ def main():
summary = Summary(args.log_dir, args)
for epoch in range(args.num_epochs):
model_load_dir = os.path.join(args.model_load_dir, 'snapshot_epoch_{}'.format(epoch))
model_load_dir = os.path.join(
args.model_load_dir, 'snapshot_epoch_{}'.format(epoch))
snapshot = Snapshot(args.model_save_dir, model_load_dir)
metric = Metric(desc='validation', calculate_batches=num_val_steps, summary=summary,
save_summary_steps=num_val_steps, batch_size=val_batch_size)
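The hunk ends before the validation loop itself; by analogy with the training script's epoch loop above, the assumed driver looks like:

    # Assumed shape, mirroring TrainNet().async_get(...) in the train
    # script; the actual loop sits outside this hunk.
    for i in range(num_val_steps):
        InferenceNet().async_get(metric.metric_cb(epoch, i))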
@@ -4,17 +4,19 @@ from __future__ import print_function
import math
def add_optimizer_args(parser):
group = parser.add_argument_group('optimizer parameters',
'entire group applies only to optimizer parameters')
group.add_argument("--optimizer", type=str, default="momentum-cosine-decay",
help="sgd, adam, momentum, momentum-cosine-decay")
#group.add_argument("--weight_decay_rate", type=float, default=1.0/32768, help="weight decay")
# group.add_argument("--weight_decay_rate", type=float, default=1.0/32768, help="weight decay")
group.add_argument("--learning_rate", type=float, default=0.256)
group.add_argument('--warmup-epochs', type=int, default=5,
help='the epochs to ramp-up lr to scaled large-batch value')
return parser
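A minimal standalone use of this helper (in the repo the parser comes from config.get_parser(), which is assumed to call add_optimizer_args; plain argparse stands in here):

    import argparse

    parser = argparse.ArgumentParser()
    parser = add_optimizer_args(parser)
    args = parser.parse_args([])
    # Defaults declared above: momentum-cosine-decay, lr 0.256, 5 warmup epochs.
    print(args.optimizer, args.learning_rate, args.warmup_epochs)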
def get_optimizer(args):
total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device
@@ -29,18 +31,18 @@ def get_optimizer(args):
"momentum-decay": {
"momentum_conf": {"beta": 0.9},
"learning_rate_decay": {
"polynomial_conf": {"decay_batches": 300000, "end_learning_rate": 0.0001,},
"polynomial_conf": {"decay_batches": 300000, "end_learning_rate": 0.0001, },
},
},
"momentum-cosine-decay": {
"momentum_conf": {"beta": 0.875},
"warmup_conf": {"linear_conf": {"warmup_batches":num_warmup_batches, "start_multiplier":0}},
"warmup_conf": {"linear_conf": {"warmup_batches": num_warmup_batches, "start_multiplier": 0}},
"learning_rate_decay": {"cosine_conf": {"decay_batches": decay_batches}},
#"weight_decay_conf": {
# "weight_decay_conf": {
# "weight_decay_rate": args.weight_decay_rate,
# #"excludes": {"pattern": ['', '']},
# "includes": {"pattern": ['weight']},
#}
# }
},
}
return optimizer_dict[args.optimizer]
\ No newline at end of file
return optimizer_dict[args.optimizer]
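The selected entry is a plain dict of update-rule settings; job_function_util.py in this same commit installs it on the training config:

    # For --optimizer momentum-cosine-decay the dict carries momentum_conf,
    # a linear warmup_conf and a cosine learning_rate_decay, and is wired in
    # via (see job_function_util.py above):
    train_config.train.model_update_conf(get_optimizer(args))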