Commit 215f9e46, authored by T tsai


Merge branch 'bert_with_xla' of https://github.com/Oneflow-Inc/OneFlow-Benchmark into add_xla_option
......@@ -129,25 +129,49 @@ rm -rf ./output/snapshots/*
# training with mini-imagenet
DATA_ROOT=data/mini-imagenet/ofrecord
python3 of_cnn_train_val.py \
--train_data_dir=$DATA_ROOT/train \
--num_examples=50 \
--train_data_part_num=1 \
--val_data_dir=$DATA_ROOT/validation \
--num_val_examples=50 \
--val_data_part_num=1 \
--num_nodes=1 \
--gpu_num_per_node=1 \
--optimizer="sgd" \
--momentum=0.875 \
--learning_rate=0.001 \
--loss_print_every_n_iter=1 \
--batch_size_per_device=16 \
--val_batch_size_per_device=10 \
--num_epoch=10 \
--model="resnet50"
```
Running this script trains a classification model on the mini-ImageNet dataset, which contains only 50 goldfish images; you can then use the model to classify goldfish pictures.
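For a sense of scale, here is a quick back-of-the-envelope sketch (added for illustration, not part of the original scripts) of how few iterations this tiny run actually takes:

```python
# Rough iteration count for the mini-imagenet run above (illustrative only).
import math

num_examples = 50
global_batch_size = 16 * 1 * 1   # batch_size_per_device * gpu_num_per_node * num_nodes
num_epoch = 10

iters_per_epoch = math.ceil(num_examples / global_batch_size)  # 4
total_iters = iters_per_epoch * num_epoch                      # 40
print(iters_per_epoch, total_iters)
```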
After training finishes, you can also modify the evaluate.sh script to evaluate the model:
```shell
#!/bin/bash
# Evaluate with mini-imagenet
DATA_ROOT=data/mini-imagenet/ofrecord
MODEL_LOAD_DIR="output/snapshots/model_save-20200907130848/snapshot_epoch_9"
python3 of_cnn_evaluate.py \
--num_epochs=3 \
--num_val_examples=50 \
--model_load_dir=$MODEL_LOAD_DIR \
--val_data_dir=$DATA_ROOT/validation \
--val_data_part_num=1 \
--num_nodes=1 \
--gpu_num_per_node=1 \
--model_update="momentum" \
--learning_rate=0.001 \
--loss_print_every_n_iter=1 \
--batch_size_per_device=16 \
--val_batch_size_per_device=10 \
--num_epoch=10 \
--model="resnet50"
```
Congratulations, you now have a decent goldfish classification model. Want to try training your own classifier on the full ImageNet dataset?
Don't worry: if you need to train on the full ImageNet2012 dataset, see the ResNet section below. There we focus on the classic ResNet50 network, explain how to train it on the full ImageNet2012 dataset with OneFlow, and provide an implementation **benchmarked against Nvidia's MXNet version**.
......@@ -189,50 +213,47 @@ cd OneFlow-Benchmark/Classification/cnns
rm -rf core.*
rm -rf ./output/snapshots/*
# training with imagenet
DATA_ROOT=/datasets/ImageNet/ofrecord
LOG_FOLDER=../logs
mkdir -p $LOG_FOLDER
LOGFILE=$LOG_FOLDER/resnet_training.log
python3 of_cnn_train_val.py \
--train_data_dir=$DATA_ROOT/train \
--train_data_part_num=256 \
--val_data_dir=$DATA_ROOT/validation \
--val_data_part_num=256 \
--num_nodes=1 \
--gpu_num_per_node=4 \
--optimizer="sgd" \
--momentum=0.875 \
--label_smoothing=0.1 \
--learning_rate=0.256 \
--loss_print_every_n_iter=100 \
--batch_size_per_device=64 \
--val_batch_size_per_device=50 \
--num_epoch=90 \
--model="resnet50" 2>&1 | tee ${LOGFILE}
echo "Writting log to ${LOGFILE}"
```
**Parameter description** (partial)
- --train_data_dir  path to the ImageNet2012 training set folder (OFRecord format)
- --train_data_part_num  number of OFRecord shards used for training
- --val_data_dir  path to the ImageNet2012 validation set folder (OFRecord format)
- --val_data_part_num  number of OFRecord shards used for validation
- --num_nodes  number of machine nodes used for training
- --gpu_num_per_node  number of GPUs used on each node
- --optimizer  optimizer to use; defaults to sgd
- --label_smoothing  label smoothing factor
- --learning_rate  initial learning rate
- --loss_print_every_n_iter  interval, in iterations, between loss printouts
- --batch_size_per_device  per-GPU batch size during training (combined with the node and GPU counts this gives the global batch size; see the sketch below)
- --val_batch_size_per_device  per-GPU batch size during validation
- --num_epoch  total number of training epochs
- --model  model to train; options: resnet50, vgg, alexnet, inceptionv3
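As a quick sanity check (a sketch added here, not part of the original document), the per-device batch size, GPU count, and node count combine into the global batch size, which in turn determines how many iterations one ImageNet epoch takes:

```python
# Derive the global batch size and iterations per epoch from the flags above
# (illustrative only; 1,281,167 is the ImageNet2012 training-set size).
import math

num_nodes = 1
gpu_num_per_node = 4
batch_size_per_device = 64
num_examples = 1281167

global_batch_size = num_nodes * gpu_num_per_node * batch_size_per_device  # 256
iters_per_epoch = math.ceil(num_examples / global_batch_size)             # 5005
print(global_batch_size, iters_per_epoch)
```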
Then run the following from the command line:
......@@ -586,8 +607,8 @@ python3 of_cnn_train_val.py \
--val_data_part_num=256 \
--num_nodes=1 \
--gpu_num_per_node=1 \
--model_update="momentum" \
--mom=0.9 \
--optimizer="sgd" \
--momentum=0.9 \
--learning_rate=0.01 \
--loss_print_every_n_iter=100 \
--batch_size_per_device=512 \
......@@ -611,8 +632,8 @@ python3 cnn_benchmark/of_cnn_train_val.py \
--val_data_part_num=256 \
--num_nodes=1 \
--gpu_num_per_node=4 \
--model_update="momentum" \
--mom=0.9 \
--optimizer="sgd" \
--momentum=0.9 \
--learning_rate=0.01 \
--loss_print_every_n_iter=10 \
--batch_size_per_device=128 \
......@@ -635,7 +656,7 @@ python3 of_cnn_train_val.py \
--val_data_part_num=256 \
--num_nodes=1 \
--gpu_num_per_node=1 \
--model_update="rmsprop" \
--optimizer="rmsprop" \
--epsilon=1 \
--decay_rate=0.9 \
--learning_rate=0.045 \
......
......@@ -102,7 +102,7 @@ def get_parser(parser=None):
default='NHWC', help="NCHW or NHWC")
parser.add_argument('--image-shape', type=int_list, default=[3, 224, 224],
help='the image shape feed into the network')
parser.add_argument('--label-smoothing', type=float, default=0.1, help='label smoothing factor')
parser.add_argument('--label_smoothing', type=float, default=0.1, help='label smoothing factor')
# snapshot
parser.add_argument("--model_save_dir", type=str,
......
......@@ -15,7 +15,6 @@ limitations under the License.
"""
import oneflow as flow
from optimizer_util import gen_model_update_conf
def _default_config(args):
......@@ -31,12 +30,10 @@ def _default_config(args):
def get_train_config(args):
train_config = _default_config(args)
train_config.train.primary_lr(args.learning_rate)
train_config.cudnn_conv_heuristic_search_algo(False)
train_config.prune_parallel_cast_ops(True)
train_config.train.model_update_conf(gen_model_update_conf(args))
train_config.enable_inplace(True)
return train_config
......
......@@ -17,6 +17,7 @@ import os
import math
import oneflow as flow
import ofrecord_util
import optimizer_util
import config as configs
from util import Snapshot, Summary, InitNodes, Metric
from job_function_util import get_train_config, get_val_config
......@@ -56,7 +57,6 @@ flow.config.gpu_device_num(args.gpu_num_per_node)
def label_smoothing(labels, classes, eta, dtype):
assert classes > 0
assert eta >= 0.0 and eta < 1.0
return flow.one_hot(labels, depth=classes, dtype=dtype,
on_value=1 - eta + eta / classes, off_value=eta/classes)
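# Worked example (added for clarity, not in the original file): with
# classes=1000 and eta=0.1, a label is smoothed to a one-hot row whose "on"
# entry is 1 - 0.1 + 0.1/1000 = 0.9001 and whose 999 "off" entries are each
# 0.1/1000 = 0.0001, so the smoothed row still sums to 1.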
......@@ -84,6 +84,9 @@ def TrainNet():
flow.losses.add_loss(loss)
predictions = flow.nn.softmax(logits)
outputs = {"loss": loss, "predictions": predictions, "labels": labels}
# set up warmup, learning rate and optimizer
optimizer_util.set_up_optimizer(loss, args)
return outputs
......
......@@ -13,104 +13,121 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import oneflow as flow
import math
import pprint
def add_optimizer_args(parser):
group = parser.add_argument_group('optimizer parameters',
'entire group applies only to optimizer parameters')
group.add_argument("--model_update", type=str, default="sgd", help="sgd, adam, momentum, rmsprop")
group.add_argument("--optimizer", type=str, default="sgd", help="sgd, adam, rmsprop")
group.add_argument("--learning_rate", type=float, default=0.256)
group.add_argument("--wd", type=float, default=1.0/32768, help="weight decay")
group.add_argument("--mom", type=float, default=0.875, help="momentum")
group.add_argument("--momentum", type=float, default=0.875, help="momentum")
group.add_argument('--lr_decay', type=str, default='cosine', help='cosine, step, polynomial, exponential, None')
group.add_argument('--lr_decay_rate', type=float, default=0.94, help='exponential learning rate decay rate')
group.add_argument('--lr_decay_epochs', type=int, default=2, help='exponential learning rate decay every n epochs')
group.add_argument('--warmup_epochs', type=int, default=5,
help='the epochs to ramp-up lr to scaled large-batch value')
help='the epochs to warm-up lr to scaled large-batch value')
group.add_argument('--decay_rate', type=float, default=0.9, help='decay rate of RMSProp')
group.add_argument('--epsilon', type=float, default=1.0, help='epsilon')
group.add_argument('--gradient_clipping', type=float, default=0.0, help='gradient clipping')
return parser
def gen_model_update_conf(args):
def set_up_optimizer(loss, args):
total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device
epoch_size = math.ceil(args.num_examples / train_batch_size)
num_train_batches = epoch_size * args.num_epochs
num_warmup_batches = epoch_size * args.warmup_epochs
decay_batches = num_train_batches - num_warmup_batches
lr_decay_rate = args.lr_decay_rate
decay_rate = args.decay_rate
epsilon = args.epsilon
clipping_threshold = args.gradient_clipping
exponential_decay_batches = epoch_size * args.lr_decay_epochs
model_update_conf = {}
# basic model update
if args.model_update == 'sgd':
model_update_conf["naive_conf"] = {}
elif args.model_update == 'adam':
model_update_conf["adam_conf"] = {"beta1": 0.9}
elif args.model_update == 'momentum':
assert args.mom < 1.0
assert args.mom > 0.0
model_update_conf["momentum_conf"] = {"beta": args.mom}
elif args.model_update == 'rmsprop':
model_update_conf["rmsprop_conf"] = {"decay_rate": decay_rate, "epsilon": epsilon}
else:
assert False
batches_per_epoch = math.ceil(args.num_examples / train_batch_size)
warmup_batches = batches_per_epoch * args.warmup_epochs
num_train_batches = batches_per_epoch * args.num_epochs
decay_batches = num_train_batches - warmup_batches
exponential_decay_batches = batches_per_epoch * args.lr_decay_epochs
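# Worked example (illustrative numbers, not from the original file):
# num_examples=1281167, global batch 256, num_epochs=90, warmup_epochs=5 gives
#   batches_per_epoch = ceil(1281167 / 256) = 5005
#   warmup_batches    = 5005 * 5  = 25025
#   num_train_batches = 5005 * 90 = 450450
#   decay_batches     = 450450 - 25025 = 425425  (length of the decay schedule)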
# learning rate warmup
if args.warmup_epochs > 0: #linear warmup only
model_update_conf['warmup_conf'] = {"linear_conf": {
"warmup_batches": num_warmup_batches,
"start_multiplier": 0,
}}
# set up warmup strategy
warmup = flow.optimizer.warmup.linear(warmup_batches, 0) if warmup_batches > 0 else None
# set up grad_clipping
grad_clipping = flow.optimizer.grad_clipping.by_global_norm(args.gradient_clipping) if args.gradient_clipping > 0.0 else None
# learning rate decay
# set up learning rate scheduler
if args.lr_decay == 'cosine':
model_update_conf['learning_rate_decay'] = {"cosine_conf": {"decay_batches": decay_batches}}
# CosineScheduler
lr_scheduler = flow.optimizer.CosineScheduler(
base_lr=args.learning_rate,
steps = decay_batches,
warmup=warmup
)
elif args.lr_decay == 'step':
boundaries = [x * epoch_size for x in [30, 60, 80]]
scales = [1, 0.1, 0.01, 0.001]
model_update_conf['learning_rate_decay'] = {"piecewise_scaling_conf": {
"boundaries": boundaries,
"scales":scales,
}}
# PiecewiseScalingScheduler
lr_scheduler = flow.optimizer.PiecewiseScalingScheduler(
base_lr=args.learning_rate,
boundaries=[30, 60, 80],
scale=[0.1, 0.01, 0.001],
warmup=warmup
)
elif args.lr_decay == 'polynomial':
model_update_conf['learning_rate_decay'] = {"polynomial_conf": {
"decay_batches": decay_batches,
"end_learning_rate": 0.00001,
}}
# PolynomialSchduler
lr_scheduler = flow.optimizer.PolynomialSchduler(
base_lr=args.learning_rate,
steps=decay_batches,
end_learning_rate=0.00001,
power=1.0,
cycle=False,
warmup=warmup
)
elif args.lr_decay == 'exponential':
model_update_conf['learning_rate_decay'] = {"exponential_conf": {
"decay_batches": exponential_decay_batches,
"decay_rate": lr_decay_rate,
}}
# gradient_clipping
if args.gradient_clipping > 0:
model_update_conf['clip_conf'] = {"clip_by_global_norm": {
"clip_norm": clipping_threshold
}}
# weight decay
if args.wd > 0:
assert args.wd < 1.0
model_update_conf['weight_decay_conf'] = {
"weight_decay_rate": args.wd,
"excludes": {"pattern": ['_bn-']}
}
# ExponentialScheduler
lr_scheduler = flow.optimizer.ExponentialScheduler(
base_lr=args.learning_rate,
steps=exponential_decay_batches,
decay_rate=args.lr_decay_rate,
staircase=False,
warmup=warmup
)
else:
lr_scheduler = flow.optimizer.PiecewiseScalingScheduler(
base_lr=args.learning_rate,
boundaries=[args.num_epochs],
scale=[1.0],
warmup=warmup
)
pprint.pprint(model_update_conf)
return model_update_conf
# set up optimizer
if args.optimizer=='sgd':
print("Optimizer: SGD")
flow.optimizer.SGD(lr_scheduler,
momentum=args.momentum if args.momentum>0 else None,
grad_clipping = grad_clipping
).minimize(loss)
elif args.optimizer=='adam':
if args.wd > 0 and args.wd < 1.0 :
print("Optimizer: AdamW")
flow.optimizer.AdamW(
lr_scheduler = lr_scheduler,
weight_decay = args.wd,
weight_decay_excludes='_bn-',
grad_clipping = grad_clipping,
epsilon=args.epsilon
).minimize(loss)
else:
print("Optimizer: Adam")
flow.optimizer.Adam(lr_scheduler=lr_scheduler,
grad_clipping=grad_clipping,
epsilon=args.epsilon
).minimize(loss)
elif args.optimizer=='rmsprop':
print("Optimizer: RMSProp")
flow.optimizer.RMSProp(lr_scheduler=lr_scheduler,
decay_rate=args.decay_rate,
epsilon=args.epsilon
).minimize(loss)
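# Usage sketch (hypothetical flag values, for illustration only): a training
# job defines its loss and then calls
#   optimizer_util.set_up_optimizer(loss, args)
# with flags such as --optimizer=sgd --momentum=0.875 --lr_decay=cosine
# --warmup_epochs=5 --learning_rate=0.256, which builds the warmup plus cosine
# schedule above and attaches the SGD optimizer to the loss.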
if __name__ == '__main__':
import config as configs
parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)
\ No newline at end of file
......@@ -2,13 +2,15 @@
rm -rf core.*
rm -rf ./output/snapshots/*
# training with synthetic data
python3 of_cnn_train_val.py \
--num_examples=50 \
--num_val_examples=50 \
--num_nodes=1 \
--gpu_num_per_node=1 \
--model_update="momentum" \
--optimizer="sgd" \
--momentum=0.875 \
--learning_rate=0.001 \
--loss_print_every_n_iter=1 \
--batch_size_per_device=16 \
......@@ -16,50 +18,3 @@ python3 of_cnn_train_val.py \
--num_epoch=10 \
--model="resnet50"
# # training with mini-imagenet
# DATA_ROOT=data/mini-imagenet/ofrecord
# python3 of_cnn_train_val.py \
# --train_data_dir=$DATA_ROOT/train \
# --num_examples=50 \
# --train_data_part_num=1 \
# --val_data_dir=$DATA_ROOT/validation \
# --num_val_examples=50 \
# --val_data_part_num=1 \
# --num_nodes=1 \
# --gpu_num_per_node=1 \
# --model_update="momentum" \
# --learning_rate=0.001 \
# --loss_print_every_n_iter=1 \
# --batch_size_per_device=16 \
# --val_batch_size_per_device=10 \
# --num_epoch=10 \
# --model="resnet50"
# # training with imagenet
# DATA_ROOT=/datasets/ImageNet/ofrecord
# LOG_FOLDER=../logs
# mkdir -p $LOG_FOLDER
# LOGFILE=$LOG_FOLDER/resnet_training.log
# python3 of_cnn_train_val.py \
# --train_data_dir=$DATA_ROOT/train \
# --train_data_part_num=256 \
# --val_data_dir=$DATA_ROOT/validation \
# --val_data_part_num=256 \
# --num_nodes=1 \
# --gpu_num_per_node=4 \
# --model_update="momentum" \
# --learning_rate=0.256 \
# --loss_print_every_n_iter=100 \
# --batch_size_per_device=64 \
# --val_batch_size_per_device=50 \
# --num_epoch=90 \
# --model="resnet50" 2>&1 | tee ${LOGFILE}
# echo "Writting log to ${LOGFILE}"
......@@ -48,14 +48,16 @@ def get_parser(parser=None):
help='node/machine number for training')
parser.add_argument('--node_ips', type=str_list, default=['192.168.1.13', '192.168.1.14'],
help='nodes ip list for training, divided by ",", length >= num_nodes')
# train
parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate")
parser.add_argument("--weight_decay_rate", type=float, default=0.01, help="weight decay rate")
parser.add_argument("--warmup_proportion", type=float, default=0.1)
parser.add_argument('--use_fp16', type=str2bool, nargs='?', default='False', const=True,
help='use fp16 or not')
parser.add_argument('--use_xla', type=str2bool, nargs='?', const=True,
help='whether to use XLA')
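# Note (added for clarity): because these flags use nargs='?' with const=True,
# passing the bare flag (e.g. `--use_xla`) enables the feature, while omitting
# it keeps the default value.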
# log and restore/save
parser.add_argument("--loss_print_every_n_iter", type=int, default=10, required=False,
help="print loss every n iteration")
......@@ -68,7 +70,7 @@ def get_parser(parser=None):
help="save model snapshot for last iteration")
parser.add_argument("--model_load_dir", type=str, default=None, help="model load directory")
parser.add_argument("--log_dir", type=str, default="./output", help="log info save directory")
# bert backbone
parser.add_argument('--do_lower_case', type=str2bool, nargs='?', const=True, default='True')
parser.add_argument("--seq_length", type=int, default=512)
......@@ -81,7 +83,7 @@ def get_parser(parser=None):
parser.add_argument("--attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--hidden_size_per_head", type=int, default=64)
return parser
......
......@@ -131,7 +131,7 @@ class Metric(object):
self.metric_dict[key] = 0.0
self.metric_dict['throughput'] = 0.0
self.num_samples = 0.0
def update_and_save(self, key, value, step, **kwargs):
self.metric_dict[key] = value
if self.save_summary:
......@@ -164,14 +164,16 @@ class Metric(object):
def CreateOptimizer(args):
warmup_batches = int(args.iter_num * args.warmup_proportion)
lr_warmup = flow.optimizer.warmup.linear(warmup_batches, 0)
lr_scheduler = flow.optimizer.PolynomialSchduler(args.learning_rate, args.iter_num, 0.0,
warmup=lr_warmup)
return flow.optimizer.AdamW(lr_scheduler, epsilon=1e-6, weight_decay=args.weight_decay_rate,
weight_decay_excludes=["bias", "LayerNorm", "layer_norm"],
grad_clipping=flow.optimizer.grad_clipping.by_global_norm(1.0))
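# Worked example (illustrative values, not from this diff): with
# iter_num=100000 and warmup_proportion=0.1, the learning rate ramps up
# linearly over the first 10000 batches and then decays polynomially from
# args.learning_rate to 0.0 over the full 100000 iterations.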
def GetFunctionConfig(args):
config = flow.function_config()
config.enable_auto_mixed_precision(args.use_fp16)
if args.use_xla:
config.use_xla_jit(True)
return config
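# Sketch of how this config is typically consumed (assumed usage, not shown in
# this diff): the returned function_config is passed when defining a job, e.g.
#   @flow.global_function(type="train", function_config=GetFunctionConfig(args))
#   def PretrainJob():
#       ...
# so that mixed precision and XLA JIT apply to that job.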
numpy>=1.17.2
pandas>=1.0.4
pillow>=7.2.0
\ No newline at end of file