未验证 提交 d1f0ee1b 编写于 作者: X xiaomuchongwhs 提交者: GitHub

Merge pull request #41 from Oneflow-Inc/hongshen

train VGG-16
......@@ -606,4 +606,33 @@ python3 cnn_benchmark/of_cnn_train_val.py \
```
The top1 accuracy and the top5 accuracy are 54.762% and 78.1914%, respectively, for our oneflow model after 90 epochs of training.
For reference, the top1 accuracy and the top5 accuracy are 54.6% and 78.33%, respectively for the model from the tensorflow benchmarks after 90 epochs of training.
\ No newline at end of file
For reference, the top1 accuracy and the top5 accuracy are 54.6% and 78.33%, respectively for the model from the tensorflow benchmarks after 90 epochs of training.
#### 训练 VGG-16
```
export ENABLE_USER_OP=True
rm -rf core.*
rm -rf ./output/snapshots/*
DATA_ROOT=/dataset/ImageNet/ofrecord
#Please change this to your data root.
python3 cnn_benchmark/of_cnn_train_val.py \
--train_data_dir=$DATA_ROOT/train \
--val_data_dir=$DATA_ROOT/validation \
--train_data_part_num=256 \
--val_data_part_num=256 \
--num_nodes=1 \
--gpu_num_per_node=4 \
--model_update="momentum" \
--mom=0.9 \
--learning_rate=0.01 \
--loss_print_every_n_iter=10 \
--batch_size_per_device=128 \
--val_batch_size_per_device=128 \
--num_epoch=90 \
--use_fp16=false \
--use_boxing_v2=false \
--model="vgg" \
```
The top1 accuracy and the top5 accuracy are 69.3359% and 89.1370%, respectively, for our oneflow model after 90 epochs of training.
For reference, the top1 accuracy and the top5 accuracy are 71.5% and 89.9%, respectively for the model from the tensorflow benchmarks after 90 epochs of training.
......@@ -19,14 +19,8 @@ def get_train_config(args):
train_config = _default_config(args)
train_config.train.primary_lr(args.learning_rate)
train_config.disable_all_reduce_sequence(False)
# train_config.cudnn_conv_enable_pseudo_half(True)
train_config.all_reduce_group_min_mbyte(8)
train_config.all_reduce_group_num(128)
# train_config.all_reduce_lazy_ratio(0)
# train_config.enable_nccl_hierarchical_all_reduce(True)
# train_config.cudnn_buf_limit_mbyte(2048)
# train_config.concurrency_width(2)
if args.use_boxing_v2:
train_config.use_boxing_v2(True)
......
......@@ -15,6 +15,8 @@ import resnet_model
import vgg_model
import alexnet_model
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
parser = configs.get_parser()
args = parser.parse_args()
......
......@@ -3,6 +3,7 @@ from __future__ import division
from __future__ import print_function
import math
import pprint
def add_optimizer_args(parser):
group = parser.add_argument_group('optimizer parameters',
......@@ -59,16 +60,7 @@ def gen_model_update_conf(args):
"decay_batches": decay_batches,
"end_learning_rate": 0.00001,
}}
# weight decay
# if args.wd > 0:
# assert args.wd < 1.0
# model_update_conf['weight_decay_conf'] = {
# "weight_decay_rate": args.wd,
# "excludes": {"pattern": ['_bn-']}
# }
import pprint
pprint.pprint(model_update_conf)
return model_update_conf
......
......@@ -17,6 +17,8 @@ def _batch_norm(inputs, name=None, trainable=True):
name=name,
)
def _get_regularizer():
    """Build the L2 regularizer shared by the layers in this file.

    The result is passed as the weight/kernel and bias regularizer of the
    convolution and dense layers defined alongside this helper.

    Returns:
        The object produced by ``flow.regularizers.l2`` for a coefficient
        of 0.00005.
    """
    l2_coefficient = 0.00005
    return flow.regularizers.l2(l2_coefficient)
def conv2d_layer(
name,
......@@ -29,12 +31,16 @@ def conv2d_layer(
dilation_rate=1,
activation="Relu",
use_bias=True,
weight_initializer=flow.variance_scaling_initializer(2, 'fan_out', 'random_normal',
data_format="NCHW"),
weight_initializer=flow.variance_scaling_initializer(2, 'fan_out', 'random_normal', data_format="NCHW"),
bias_initializer=flow.zeros_initializer(),
weight_regularizer=_get_regularizer(), # weight_decay
bias_regularizer=_get_regularizer(),
bn=True,
):
):
weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
print("weight_shape:{}".format(weight_shape))
weight = flow.get_variable(
name + "_weight",
shape=weight_shape,
......@@ -59,7 +65,7 @@ def conv2d_layer(
output = _batch_norm(output, name + "_bn")
output = flow.nn.relu(output)
else:
output = flow.nn.relu(output)
output = flow.nn.relu(output)
else:
raise NotImplementedError
......@@ -85,12 +91,12 @@ def _conv_block(in_blob, index, filters, conv_times):
return conv_block
def vgg16bn(images, trainable=True, need_transpose=False, training=True, wd=1.0 / 32768, channel_last=False):
def vgg16bn(images, trainable=True, need_transpose=False, training=True, wd=1.0/32768):
if need_transpose:
images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2])
conv1 = _conv_block(images, 0, 64, 2)
pool1 = flow.nn.max_pool2d(conv1[-1], 2, 2, "VALID", "NCHW", name="pool1")
conv2 = _conv_block(pool1, 2, 128, 2)
pool2 = flow.nn.max_pool2d(conv2[-1], 2, 2, "VALID", "NCHW", name="pool2")
......@@ -117,6 +123,8 @@ def vgg16bn(images, trainable=True, need_transpose=False, training=True, wd=1.0
use_bias=True,
kernel_initializer=_get_kernel_initializer(),
bias_initializer=_get_bias_initializer(),
kernel_regularizer=_get_regularizer(), # weight_decay
bias_regularizer=_get_regularizer(),
trainable=trainable,
name="dense0",
)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册