Oneflow-Inc / OneFlow-Benchmark
Commit 0ae082d3
Authored June 02, 2020 by mir-of
Commit message: format
Parent: 38fe8151
Showing 4 changed files with 53 additions and 45 deletions (+53 -45)
cnn_e2e/job_function_util.py    +5  -3
cnn_e2e/of_cnn_train_val.py     +21 -21
cnn_e2e/of_cnn_val.py           +19 -15
cnn_e2e/optimizer_util.py       +8  -6
cnn_e2e/job_function_util.py

```diff
@@ -5,6 +5,7 @@ from __future__ import print_function
 import oneflow as flow

 from optimizer_util import get_optimizer


 def _default_config(args):
     config = flow.function_config()
     config.default_distribute_strategy(flow.distribute.consistent_strategy())
@@ -13,11 +14,12 @@ def _default_config(args):
         config.enable_auto_mixed_precision(True)
     return config


 def get_train_config(args):
     train_config = _default_config(args)
     train_config.train.primary_lr(args.learning_rate)
     train_config.disable_all_reduce_sequence(False)
-    #train_config.cudnn_conv_enable_pseudo_half(True)
+    # train_config.cudnn_conv_enable_pseudo_half(True)
     train_config.all_reduce_group_min_mbyte(8)
     train_config.all_reduce_group_num(128)
     # train_config.all_reduce_lazy_ratio(0)
@@ -28,12 +30,12 @@ def get_train_config(args):
     if args.use_boxing_v2:
         train_config.use_boxing_v2(True)
         train_config.prune_parallel_cast_ops(True)

     train_config.train.model_update_conf(get_optimizer(args))
     train_config.enable_inplace(True)
     return train_config


 def get_val_config(args):
     return _default_config(args)
```
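Both helpers exist to be handed to `@flow.function` decorators; the scripts below never use them for anything else. A minimal sketch of that wiring, assuming the legacy `oneflow` API this repo targets and that `cnn_e2e/` is on the import path (the body stubs are placeholders, not code from the commit):

```python
# Sketch only: how job_function_util's helpers are consumed (cf. of_cnn_train_val.py below).
# Assumes the legacy oneflow API (flow.function / flow.function_config) and that
# config.get_parser() can supply every option these helpers read.
import oneflow as flow

import config as configs
from job_function_util import get_train_config, get_val_config

args = configs.get_parser().parse_args()


@flow.function(get_train_config(args))   # training job: lr, all-reduce, optimizer conf
def TrainNet():
    pass                                  # model + loss + flow.losses.add_loss(...)


@flow.function(get_val_config(args))      # validation job: only the shared defaults
def InferenceNet():
    pass                                  # forward pass + flow.nn.softmax(...)
```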
cnn_e2e/of_cnn_train_val.py

```diff
@@ -3,26 +3,24 @@ from __future__ import division
 from __future__ import print_function

 import os
 import time
 import math
 import numpy as np

-import config as configs
-parser = configs.get_parser()
-args = parser.parse_args()
-configs.print_args(args)
-
-import oneflow as flow
-from util import Snapshot, Summary, InitNodes, Metric
 import ofrecord_util
+import config as configs
+from util import Snapshot, Summary, InitNodes, Metric
 from job_function_util import get_train_config, get_val_config
+import oneflow as flow
-import alexnet_model
-import vgg_model
-import resnet_model
 import inception_model
+import resnet_model
+import vgg_model
+import alexnet_model
+
+parser = configs.get_parser()
+args = parser.parse_args()
+configs.print_args(args)

 total_device_num = args.num_nodes * args.gpu_num_per_node
 train_batch_size = total_device_num * args.batch_size_per_device
@@ -36,7 +34,7 @@ model_dict = {
     "resnet50": resnet_model.resnet50,
     "vgg16": vgg_model.vgg16,
     "alexnet": alexnet_model.alexnet,
     "inceptionv3": inception_model.inceptionv3,
 }
@@ -47,6 +45,7 @@ if args.use_boxing_v2:
     flow.config.collective_boxing.nccl_fusion_threshold_mb(8)
     flow.config.collective_boxing.nccl_fusion_all_reduce_use_buffer(False)


 @flow.function(get_train_config(args))
 def TrainNet():
     if args.train_data_dir:
@@ -61,12 +60,14 @@ def TrainNet():
         print("Loading synthetic data.")
         (labels, images) = ofrecord_util.load_synthetic(args)

     logits = model_dict[args.model](images, need_transpose=not args.use_new_dataloader)
     loss = flow.nn.sparse_softmax_cross_entropy_with_logits(labels, logits, name="softmax_loss")
     loss = flow.math.reduce_mean(loss)
     flow.losses.add_loss(loss)
     predictions = flow.nn.softmax(logits)
     outputs = {"loss": loss, "predictions": predictions, "labels": labels}
     return outputs
@@ -83,9 +84,10 @@ def InferenceNet():
         print("Loading synthetic data.")
         (labels, images) = ofrecord_util.load_synthetic(args)

     logits = model_dict[args.model](images, need_transpose=not args.use_new_dataloader)
     predictions = flow.nn.softmax(logits)
     outputs = {"predictions": predictions, "labels": labels}
     return outputs
@@ -104,9 +106,7 @@ def main():
                         batch_size=train_batch_size, loss_key='loss')
         for i in range(epoch_size):
             TrainNet().async_get(metric.metric_cb(epoch, i))
             # if i > 40:#debug
             # break
             #break
         if args.val_data_dir:
             metric = Metric(desc='validation', calculate_batches=num_val_steps,
                             summary=summary, save_summary_steps=num_val_steps,
                             batch_size=val_batch_size)
```
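The global-batch arithmetic at the top of the script is easy to misread. A small worked example with illustrative numbers (2 nodes, 4 GPUs per node, 64 images per device are assumptions, not values from the commit; `epoch_size` here is a plausible definition, its exact formula is outside this diff):

```python
# Worked example of of_cnn_train_val.py's batch-size arithmetic.
# All concrete numbers below are illustrative, not taken from the commit.
num_nodes = 2
gpu_num_per_node = 4
batch_size_per_device = 64
num_examples = 1281167                    # e.g. ImageNet-1k training images

total_device_num = num_nodes * gpu_num_per_node                # 8 devices
train_batch_size = total_device_num * batch_size_per_device    # 512 images per step
epoch_size = num_examples // train_batch_size                  # 2502 steps per epoch

print(total_device_num, train_batch_size, epoch_size)
```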
cnn_e2e/of_cnn_val.py

```diff
@@ -3,23 +3,24 @@ from __future__ import division
 from __future__ import print_function

 import os
 import time
 import math
 import numpy as np

-import oneflow as flow
-import ofrecord_util
 import config as configs
-from util import Snapshot, Summary, InitNodes, Metric
-from job_function_util import get_val_config
-import alexnet_model
-import resnet_model
-import vgg_model

 parser = configs.get_parser()
 args = parser.parse_args()
 configs.print_args(args)

+from util import Snapshot, Summary, InitNodes, Metric
+import ofrecord_util
+from job_function_util import get_train_config, get_val_config
+import oneflow as flow
+#import vgg_model
+import resnet_model
+#import alexnet_model

 total_device_num = args.num_nodes * args.gpu_num_per_node
 train_batch_size = total_device_num * args.batch_size_per_device
@@ -31,13 +32,15 @@ num_val_steps = int(args.num_val_examples / val_batch_size)
 model_dict = {
     "resnet50": resnet_model.resnet50,
-    #"vgg16": vgg_model.vgg16,
-    #"alexnet": alexnet_model.alexnet,
+    "vgg16": vgg_model.vgg16,
+    "alexnet": alexnet_model.alexnet,
+    "inceptionv3": inception_model.inceptionv3,
 }

 flow.config.gpu_device_num(args.gpu_num_per_node)
 flow.config.enable_debug_mode(True)


 @flow.function(get_val_config(args))
 def InferenceNet():
     if args.val_data_dir:
@@ -50,7 +53,7 @@ def InferenceNet():
     logits = model_dict[args.model](images)
     predictions = flow.nn.softmax(logits)
     outputs = {"predictions": predictions, "labels": labels}
     return outputs
@@ -64,7 +67,8 @@ def main():
     summary = Summary(args.log_dir, args)
     for epoch in range(args.num_epochs):
         model_load_dir = os.path.join(args.model_load_dir, 'snapshot_epoch_{}'.format(epoch))
         snapshot = Snapshot(args.model_save_dir, model_load_dir)
         metric = Metric(desc='validation', calculate_batches=num_val_steps,
                         summary=summary, save_summary_steps=num_val_steps,
                         batch_size=val_batch_size)
```
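`main()` above validates one snapshot per epoch by rebuilding the load path each iteration. A tiny sketch of the directory convention it relies on (`output/models` is an illustrative stand-in for `args.model_load_dir`):

```python
# Sketch of the per-epoch snapshot paths iterated by of_cnn_val.py's main().
# 'output/models' and num_epochs=3 are illustrative values only.
import os

model_load_dir = 'output/models'
num_epochs = 3

for epoch in range(num_epochs):
    snapshot_dir = os.path.join(model_load_dir, 'snapshot_epoch_{}'.format(epoch))
    print(snapshot_dir)   # output/models/snapshot_epoch_0 ... snapshot_epoch_2
```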
cnn_e2e/optimizer_util.py

```diff
@@ -4,17 +4,19 @@ from __future__ import print_function
 import math


 def add_optimizer_args(parser):
     group = parser.add_argument_group('optimizer parameters', 'entire group applies only to optimizer parameters')
     group.add_argument("--optimizer", type=str, default="momentum-cosine-decay", help="sgd, adam, momentum, momentum-cosine-decay")
-    #group.add_argument("--weight_decay_rate", type=float, default=1.0/32768, help="weight decay")
+    # group.add_argument("--weight_decay_rate", type=float, default=1.0/32768, help="weight decay")
     group.add_argument("--learning_rate", type=float, default=0.256)
     group.add_argument('--warmup-epochs', type=int, default=5, help='the epochs to ramp-up lr to scaled large-batch value')
     return parser


 def get_optimizer(args):
     total_device_num = args.num_nodes * args.gpu_num_per_node
     train_batch_size = total_device_num * args.batch_size_per_device
@@ -29,18 +31,18 @@ def get_optimizer(args):
         "momentum-decay": {
             "momentum_conf": {"beta": 0.9},
             "learning_rate_decay": {
-                "polynomial_conf": {"decay_batches": 300000, "end_learning_rate": 0.0001,},
+                "polynomial_conf": {
+                    "decay_batches": 300000,
+                    "end_learning_rate": 0.0001,
+                },
             },
         },
         "momentum-cosine-decay": {
             "momentum_conf": {"beta": 0.875},
             "warmup_conf": {"linear_conf": {"warmup_batches": num_warmup_batches, "start_multiplier": 0}},
             "learning_rate_decay": {"cosine_conf": {"decay_batches": decay_batches}},
-            #"weight_decay_conf": {
+            # "weight_decay_conf": {
             #     "weight_decay_rate": args.weight_decay_rate,
             #     #"excludes": {"pattern": ['', '']},
             #     "includes": {"pattern": ['weight']},
-            #}
+            # }
         },
     }
-    return optimizer_dict[args.optimizer]
\ No newline at end of file
+    return optimizer_dict[args.optimizer]
```
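`get_optimizer` returns a plain dict that `get_train_config` passes to `train_config.train.model_update_conf(...)` (see `job_function_util.py` above). The `num_warmup_batches` / `decay_batches` values are computed in the part of `get_optimizer` not shown in this hunk; the sketch below is a plausible reconstruction of that arithmetic with illustrative numbers, not the commit's exact code:

```python
# Plausible reconstruction of the schedule arithmetic behind the
# "momentum-cosine-decay" entry; formulas and numbers here are assumptions.
import math

num_nodes, gpu_num_per_node, batch_size_per_device = 1, 4, 64
num_examples = 1281167               # ImageNet-1k, as an example
num_epochs, warmup_epochs = 90, 5    # --warmup-epochs default shown above is 5

total_device_num = num_nodes * gpu_num_per_node
train_batch_size = total_device_num * batch_size_per_device
batches_per_epoch = math.ceil(num_examples / train_batch_size)

num_warmup_batches = batches_per_epoch * warmup_epochs
decay_batches = batches_per_epoch * (num_epochs - warmup_epochs)

model_update_conf = {
    "momentum_conf": {"beta": 0.875},
    "warmup_conf": {"linear_conf": {"warmup_batches": num_warmup_batches,
                                    "start_multiplier": 0}},
    "learning_rate_decay": {"cosine_conf": {"decay_batches": decay_batches}},
}
print(model_update_conf)
```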