Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Oneflow-Inc
OneFlow-Benchmark
提交
5462fe30
O
OneFlow-Benchmark
项目概览
Oneflow-Inc
/
OneFlow-Benchmark
上一次同步 接近 3 年
通知
1
Star
92
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
OneFlow-Benchmark
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
5462fe30
编写于
2月 05, 2020
作者:
S
ShawnXuan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
support iteration
上级
a88bc643
变更
4
展开全部
隐藏空白更改
内联
并排
Showing
4 changed files
with
498 additions
and
89 deletions
+498
-89
cnn_benchmark/config.py
cnn_benchmark/config.py
+16
-3
cnn_benchmark/dali.py
cnn_benchmark/dali.py
+425
-40
cnn_benchmark/of_cnn_train_val.py
cnn_benchmark/of_cnn_train_val.py
+43
-45
cnn_benchmark/util.py
cnn_benchmark/util.py
+14
-1
未找到文件。
cnn_benchmark/config.py
浏览文件 @
5462fe30
...
...
@@ -18,8 +18,10 @@ def get_parser(parser=None):
# resouce
parser
.
add_argument
(
"--gpu_num_per_node"
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
"--node_num"
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
"--node_list"
,
type
=
str
,
default
=
None
,
help
=
"nodes' IP address, split by comma"
)
parser
.
add_argument
(
'--num_nodes'
,
type
=
int
,
default
=
1
,
help
=
'node/machine number for training'
)
parser
.
add_argument
(
'--node_ips'
,
type
=
str_list
,
default
=
[
'192.168.1.15'
,
'192.168.1.16'
],
help
=
'nodes ip list for training, devided by ",", length >= num_nodes'
)
parser
.
add_argument
(
"--model"
,
type
=
str
,
default
=
"vgg16"
,
help
=
"vgg16 or resnet50"
)
...
...
@@ -35,6 +37,17 @@ def get_parser(parser=None):
parser
.
add_argument
(
"--image_size"
,
type
=
int
,
default
=
224
,
help
=
"image size"
)
#Todo, remove
# from mxnet
parser
.
add_argument
(
'--num_epochs'
,
type
=
int
,
default
=
90
,
help
=
'number of epochs'
)
parser
.
add_argument
(
'--lr'
,
type
=
float
,
default
=
0.1
,
help
=
'initial learning rate'
)
parser
.
add_argument
(
'--lr-schedule'
,
choices
=
(
'multistep'
,
'cosine'
),
default
=
'cosine'
,
help
=
'learning rate schedule'
)
parser
.
add_argument
(
'--lr-factor'
,
type
=
float
,
default
=
0.256
,
help
=
'the ratio to reduce lr on each step'
)
parser
.
add_argument
(
'--lr-steps'
,
type
=
float_list
,
default
=
[],
help
=
'the epochs to reduce the lr, e.g. 30,60'
)
parser
.
add_argument
(
'--warmup-epochs'
,
type
=
int
,
default
=
5
,
help
=
'the epochs to ramp-up lr to scaled large-batch value'
)
parser
.
add_argument
(
"--input_layout"
,
type
=
str
,
default
=
'NHWC'
,
help
=
"NCHW or NHWC"
)
parser
.
add_argument
(
'--image-shape'
,
type
=
int_list
,
default
=
[
3
,
224
,
224
],
help
=
'the image shape feed into the network'
)
...
...
@@ -64,7 +77,7 @@ def get_parser(parser=None):
# validation
parser
.
add_argument
(
"--val_step_num"
,
type
=
int
,
default
=
10
,
help
=
"total validation step number"
)
parser
.
add_argument
(
"--val_batch_size_per_device"
,
type
=
int
,
default
=
8
)
parser
.
add_argument
(
"--val_batch_size_per_device"
,
type
=
int
,
default
=
100
)
parser
.
add_argument
(
"--val_data_dir"
,
type
=
str
,
default
=
None
,
help
=
"validation dataset directory"
)
parser
.
add_argument
(
"--val_data_part_num"
,
type
=
int
,
default
=
32
,
help
=
"validation data part number"
)
...
...
cnn_benchmark/dali.py
浏览文件 @
5462fe30
此差异已折叠。
点击以展开。
cnn_benchmark/of_cnn_train_val.py
浏览文件 @
5462fe30
...
...
@@ -5,6 +5,7 @@ from __future__ import print_function
import
os
import
time
import
numpy
as
np
import
logging
import
oneflow
as
flow
...
...
@@ -14,14 +15,13 @@ import resnet_model
import
alexnet_model
import
config
as
configs
from
util
import
Snapshot
,
Summary
,
print_args
,
make_lr
from
dali
import
get_rec_
pipe
from
util
import
Snapshot
,
Summary
,
print_args
,
make_lr
,
nodes_init
from
dali
import
get_rec_
iter
parser
=
configs
.
get_parser
()
#args = parser.parse_known_args()[0]
args
=
parser
.
parse_args
()
print
(
args
)
summary
=
Summary
(
args
.
log_dir
,
args
)
...
...
@@ -47,7 +47,7 @@ optimizer_dict = {
# "warmup_conf": {"linear_conf": {"warmup_batches":10000, "start_multiplier":0}},
total_device_num
=
args
.
n
ode_num
*
args
.
gpu_num_per_node
total_device_num
=
args
.
n
um_nodes
*
args
.
gpu_num_per_node
train_batch_size
=
total_device_num
*
args
.
batch_size_per_device
val_batch_size
=
total_device_num
*
args
.
val_batch_size_per_device
(
H
,
W
,
C
)
=
(
args
.
image_size
,
args
.
image_size
,
3
)
...
...
@@ -148,54 +148,54 @@ def InferenceNet():
return
(
softmax
,
labels
)
def train_callback(step):
    """Create the callback that records the training loss for one iteration.

    Args:
        step: One-based iteration number. It is used as the x-value of the
            ``'loss'`` summary entry; ``step - 1`` is what gets printed.

    Returns:
        A function taking the training outputs (indexed by ``'loss'``) that
        logs the mean loss and prints it every
        ``args.loss_print_every_n_iter`` iterations.
    """
    def callback(train_outputs):
        mean_loss = train_outputs['loss'].mean()
        summary.scalar('loss', mean_loss, step)
        # Learning-rate logging is currently disabled:
        #summary.scalar('learning_rate', train_outputs['lr'], step)

        iteration = step - 1
        if iteration % args.loss_print_every_n_iter != 0:
            return
        print("iter {}, loss: {:.6f}".format(iteration, mean_loss))

    return callback
def do_predictions(step, predict_step, predictions):
    """Accumulate top-1 accuracy over a validation pass and report it.

    Running counters are kept as attributes on the ``main`` function object
    (``main.correct`` / ``main.total``). They are reset on the first
    validation batch and reported on the last one.

    Args:
        step: Training iteration this validation pass belongs to; used as
            the x-value of the ``'top1_accuracy'`` summary and in the
            printed message.
        predict_step: Zero-based index of the validation batch. ``0`` resets
            the counters; ``args.val_step_num - 1`` triggers the report.
        predictions: Pair ``(scores, labels)``. ``scores.ndarray()`` is
            arg-maxed along axis 1 and compared element-wise with
            ``labels`` (presumably scores are (batch, num_classes) and
            labels are (batch,) -- TODO confirm against InferenceNet).

    Raises:
        AssertionError: If no samples were counted when the report is due.
    """
    classfications = np.argmax(predictions[0].ndarray(), axis=1)
    labels = predictions[1]

    if predict_step == 0:
        # First batch of a new validation pass: start from clean counters.
        main.correct = 0.0
        main.total = 0.0

    # Count every batch, including the first one. Previously the first batch
    # only reset the counters and its predictions were silently dropped.
    main.correct += np.sum(classfications == labels)
    main.total += len(labels)

    if predict_step + 1 == args.val_step_num:
        assert main.total > 0
        summary.scalar('top1_accuracy', main.correct / main.total, step)
        #summary.scalar('top1_correct', main.correct, step)
        #summary.scalar('total_val_images', main.total, step)
        print("iter {}, top 1 accuracy: {:.6f}".format(
            step, main.correct / main.total))
def predict_callback(step, predict_step):
    """Create the callback that handles the outputs of one validation batch.

    Args:
        step: Training iteration the validation pass belongs to.
        predict_step: Zero-based index of the validation batch.

    Returns:
        A one-argument function that forwards the received predictions,
        together with ``step`` and ``predict_step``, to ``do_predictions``.
    """
    def on_predictions(predictions):
        # Bind the two step counters captured from the enclosing call.
        do_predictions(step, predict_step, predictions)

    return on_predictions
def
main
():
print_args
(
args
)
def
train_callback
(
step
):
def
callback
(
train_outputs
):
loss
=
train_outputs
[
'loss'
].
mean
()
summary
.
scalar
(
'loss'
,
loss
,
step
)
#summary.scalar('learning_rate', train_outputs['lr'], step)
if
(
step
-
1
)
%
args
.
loss_print_every_n_iter
==
0
:
print
(
"iter {}, loss: {:.6f}"
.
format
(
step
-
1
,
loss
))
return
callback
def
do_predictions
(
step
,
predict_step
,
predictions
):
classfications
=
np
.
argmax
(
predictions
[
0
].
ndarray
(),
axis
=
1
)
labels
=
predictions
[
1
]
if
predict_step
==
0
:
main
.
correct
=
0.0
main
.
total
=
0.0
else
:
main
.
correct
+=
np
.
sum
(
classfications
==
labels
);
main
.
total
+=
len
(
labels
)
if
predict_step
+
1
==
args
.
val_step_num
:
assert
main
.
total
>
0
summary
.
scalar
(
'top1_accuracy'
,
main
.
correct
/
main
.
total
,
step
)
#summary.scalar('top1_correct', main.correct, step)
#summary.scalar('total_val_images', main.total, step)
print
(
"iter {}, top 1 accuracy: {:.6f}"
.
format
(
step
,
main
.
correct
/
main
.
total
))
def
predict_callback
(
step
,
predict_step
):
def
callback
(
predictions
):
do_predictions
(
step
,
predict_step
,
predictions
)
return
callback
nodes_init
(
args
)
flow
.
env
.
grpc_use_no_signal
()
flow
.
env
.
log_dir
(
args
.
log_dir
)
if
args
.
node_num
>
1
:
nodes
=
[]
for
n
in
args
.
node_list
.
strip
().
split
(
","
):
addr_dict
=
{}
addr_dict
[
"addr"
]
=
n
nodes
.
append
(
addr_dict
)
flow
.
env
.
machine
(
nodes
)
snapshot
=
Snapshot
(
args
.
model_save_dir
,
args
.
model_load_dir
)
epoch
=
0
for
epoch
in
range
(
args
.
num_epoch
):
logging
.
info
(
'Starting epoch {}'
.
format
(
epoch
))
train_pipe
,
val_pipe
=
get_rec_pipe
(
args
,
True
,
seed
=
epoch
)
exit
()
train_pipe
,
_
=
get_rec_pipe
(
args
,
True
)
for
step
in
range
(
args
.
train_step_num
):
# save model every n iter
images
,
labels
=
train_pipe
.
run
()
...
...
@@ -207,8 +207,6 @@ def main():
snapshot
.
save
(
step
)
#TrainNet().async_get(train_callback(step+1))
#print(images.as_cpu().as_array().shape)
#break
NumpyTrainNet
(
images
.
as_cpu
().
as_array
(),
labels
.
as_array
().
astype
(
np
.
int32
)).
async_get
(
train_callback
(
step
+
1
))
step
+=
1
...
...
cnn_benchmark/util.py
浏览文件 @
5462fe30
...
...
@@ -5,6 +5,18 @@ from datetime import datetime
import
oneflow
as
flow
def nodes_init(args):
    """Register the participating machines with OneFlow for multi-node runs.

    Does nothing when ``args.num_nodes`` is 1 or less. Otherwise the first
    ``args.num_nodes`` entries of ``args.node_ips`` are passed to
    ``flow.env.machine`` as ``{"addr": ip}`` dictionaries.

    Args:
        args: Parsed command-line arguments providing ``num_nodes`` (int)
            and ``node_ips`` (list of IP address strings).

    Raises:
        AssertionError: If fewer IP addresses than ``num_nodes`` are given.
    """
    if args.num_nodes > 1:
        assert args.num_nodes <= len(args.node_ips), \
            "node_ips must list at least num_nodes addresses"
        # node_ips is already a list, so no strip/split is needed, and it
        # may be longer than num_nodes, so only the machines actually used
        # are registered. The previous code read args.node_list, which is
        # not the option this function validates.
        nodes = [{"addr": ip} for ip in args.node_ips[:args.num_nodes]]
        flow.env.machine(nodes)
class
Snapshot
:
def
__init__
(
self
,
model_save_dir
,
model_load_dir
):
self
.
_model_save_dir
=
model_save_dir
...
...
@@ -24,6 +36,7 @@ class Snapshot:
print
(
"Saving model to {}."
.
format
(
snapshot_save_path
))
self
.
_check_point
.
save
(
snapshot_save_path
)
class
Summary
():
def
__init__
(
self
,
log_dir
,
config
):
self
.
_log_dir
=
log_dir
...
...
@@ -82,7 +95,7 @@ def make_lr(train_step_name, model_update_conf, primary_lr, secondary_lr=None):
def
print_args
(
args
):
print
(
"="
.
ljust
(
66
,
"="
))
print
(
"Running {}: num_gpu_per_node = {}, num_nodes = {}."
.
format
(
args
.
model
,
args
.
gpu_num_per_node
,
args
.
n
ode_num
))
args
.
model
,
args
.
gpu_num_per_node
,
args
.
n
um_nodes
))
print
(
"="
.
ljust
(
66
,
"="
))
for
arg
in
vars
(
args
):
print
(
"{} = {}"
.
format
(
arg
,
getattr
(
args
,
arg
)))
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录