Oneflow-Inc / OneFlow-Benchmark

Commit d23d33fc
Authored Apr 01, 2020 by ShawnXuan
Parent: 227fef9a

    modify scripts for dali 2 nodes

Showing 2 changed files with 22 additions and 66 deletions (+22, -66):

    cnn_e2e/dali_cnn_train_val.py   +16  -64
    dali_e2e.sh                      +6   -2
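At a glance: the Python change swaps the hand-rolled accuracy bookkeeping (acc_acc, train_callback, predict_callback, plus a module-level Summary and StopWatch) for the repository's shared Metric helper from util, and the shell script is reconfigured for a two-node DALI run. The per-iteration call sites change only in which callback they hand to async_get:

    # Before: one hand-rolled closure per phase
    TrainNet(images, labels).async_get(train_callback(epoch, i))
    InferenceNet(images, labels).async_get(predict_callback(epoch, i))

    # After: a per-phase Metric object builds the callback for both phases
    TrainNet(images, labels).async_get(metric.metric_cb(epoch, i))
    InferenceNet(images, labels).async_get(metric.metric_cb(epoch, i))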
cnn_e2e/dali_cnn_train_val.py

@@ -3,7 +3,6 @@ from __future__ import division
 from __future__ import print_function
 import os
-import time
 import math
 import numpy as np

@@ -12,7 +11,7 @@ parser = configs.get_parser()
 args = parser.parse_args()
 configs.print_args(args)
-from util import Snapshot, Summary, InitNodes, StopWatch
+from util import Snapshot, Summary, InitNodes, Metric
 from dali_util import get_rec_iter
 from job_function_util import get_train_config, get_val_config
 import oneflow as flow

@@ -25,10 +24,9 @@ total_device_num = args.num_nodes * args.gpu_num_per_node
 train_batch_size = total_device_num * args.batch_size_per_device
 val_batch_size = total_device_num * args.val_batch_size_per_device
 (C, H, W) = args.image_shape
+epoch_size = math.ceil(args.num_examples / train_batch_size)
 num_val_steps = args.num_val_examples / val_batch_size
-summary = Summary(args.log_dir, args)
-timer = StopWatch()

 model_dict = {
     "resnet50": resnet_model.resnet50,

@@ -45,10 +43,10 @@ def TrainNet(images=flow.FixedTensorDef((train_batch_size, H, W, C), dtype=flow.
              labels=flow.FixedTensorDef((train_batch_size, ), dtype=flow.int32)):
     logits = model_dict[args.model](images)
     loss = flow.nn.sparse_softmax_cross_entropy_with_logits(labels, logits, name="softmax_loss")
-    # loss = flow.math.reduce_mean(loss)
+    loss = flow.math.reduce_mean(loss)
     flow.losses.add_loss(loss)
-    softmax = flow.nn.softmax(logits)
-    outputs = {"loss": loss, "softmax": softmax, "labels": labels}
+    predictions = flow.nn.softmax(logits)
+    outputs = {"loss": loss, "predictions": predictions, "labels": labels}
     return outputs

@@ -56,87 +54,41 @@ def TrainNet(images=flow.FixedTensorDef((train_batch_size, H, W, C), dtype=flow.
 def InferenceNet(images=flow.FixedTensorDef((val_batch_size, H, W, C), dtype=flow.float),
                  labels=flow.FixedTensorDef((val_batch_size, ), dtype=flow.int32)):
     logits = model_dict[args.model](images)
-    softmax = flow.nn.softmax(logits)
-    outputs = {"softmax": softmax, "labels": labels}
+    predictions = flow.nn.softmax(logits)
+    outputs = {"predictions": predictions, "labels": labels}
     return outputs #(softmax, labels)

-def acc_acc(step, predictions):
-    classfications = np.argmax(predictions['softmax'].ndarray(), axis=1)
-    labels = predictions['labels'].reshape(-1)
-    if step == 0:
-        main.correct = 0.0
-        main.total = 0.0
-    else:
-        main.correct += np.sum(classfications == labels);
-        main.total += len(labels)
-
-def train_callback(epoch, step):
-    def callback(train_outputs):
-        acc_acc(step, train_outputs)
-        loss = train_outputs['loss'].mean()
-        summary.scalar('loss', loss, step)
-        #summary.scalar('learning_rate', train_outputs['lr'], step)
-        if (step-1) % args.loss_print_every_n_iter == 0:
-            throughput = args.loss_print_every_n_iter * train_batch_size / timer.split()
-            accuracy = main.correct/main.total
-            print("epoch {}, iter {}, loss: {:.6f}, accuracy: {:.6f}, samples/s: {:.3f}".format(epoch, step-1, loss, accuracy, throughput))
-            summary.scalar('train_accuracy', accuracy, step)
-            main.correct = 0.0
-            main.total = 0.0
-    return callback
-
-def do_predictions(epoch, predict_step, predictions):
-    acc_acc(predict_step, predictions)
-    if predict_step + 1 == num_val_steps:
-        assert main.total > 0
-        summary.scalar('top1_accuracy', main.correct/main.total, epoch)
-        #summary.scalar('top1_correct', main.correct, epoch)
-        #summary.scalar('total_val_images', main.total, epoch)
-        print("epoch {}, top 1 accuracy: {:.6f}, time: {:.2f}".format(epoch, main.correct/main.total, timer.split()))
-
-def predict_callback(epoch, predict_step):
-    def callback(predictions):
-        do_predictions(epoch, predict_step, predictions)
-    return callback
-
 def main():
     InitNodes(args)
     flow.env.grpc_use_no_signal()
     flow.env.log_dir(args.log_dir)

+    summary = Summary(args.log_dir, args)
     snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

     train_data_iter, val_data_iter = get_rec_iter(args, True)
-    timer.start()
     for epoch in range(args.num_epochs):
-        tic = time.time()
-        print('Starting epoch {} at {:.2f}'.format(epoch, tic))
+        metric = Metric(desc='train', calculate_batches=args.loss_print_every_n_iter,
+                        summary=summary, save_summary_steps=epoch_size,
+                        batch_size=train_batch_size, loss_key='loss')
         train_data_iter.reset()
         for i, batches in enumerate(train_data_iter):
             images, labels = batches
-            TrainNet(images, labels).async_get(train_callback(epoch, i))
+            TrainNet(images, labels).async_get(metric.metric_cb(epoch, i))
             # if i > 30:#debug
             # break
             #break
-        print('epoch {} training time: {:.2f}'.format(epoch, time.time() - tic))

         if args.data_val:
-            tic = time.time()
+            metric = Metric(desc='validation', calculate_batches=num_val_steps,
+                            summary=summary, save_summary_steps=num_val_steps,
+                            batch_size=val_batch_size)
             val_data_iter.reset()
             for i, batches in enumerate(val_data_iter):
                 images, labels = batches
-                InferenceNet(images, labels).async_get(predict_callback(epoch, i))
-                #acc_acc(i, InferenceNet(images, labels.astype(np.int32)).get())
-        summary.save()
-        snapshot.save('epoch_{}'.format(epoch))
+                InferenceNet(images, labels).async_get(metric.metric_cb(epoch, i))
+        snapshot.save('epoch_{}'.format(epoch+1))

 if __name__ == "__main__":
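This commit shows only how Metric is constructed and that metric_cb(epoch, step) returns the callback passed to async_get; the class itself lives in util.py, which this commit does not touch. Judging from the callbacks it replaces, it presumably accumulates top-1 correctness from the 'predictions'/'labels' outputs, averages the tensor named by loss_key, and reports throughput every calculate_batches steps. A minimal hypothetical sketch of that behavior, modeled on the deleted code (the real util.Metric may differ):

    import time
    import numpy as np

    class Metric(object):
        """Hypothetical sketch of the util.Metric interface used above."""

        def __init__(self, desc='train', calculate_batches=20, summary=None,
                     save_summary_steps=1000, batch_size=256, loss_key=None):
            self.desc = desc
            self.calculate_batches = calculate_batches
            self.summary = summary              # util.Summary, for scalar logging
            self.save_summary_steps = save_summary_steps
            self.batch_size = batch_size
            self.loss_key = loss_key            # e.g. 'loss' for training, None for validation
            self._reset()

        def _reset(self):
            self.correct = 0.0                  # running top-1 hits
            self.total = 0.0                    # running sample count
            self.loss_sum = 0.0                 # running mean-loss accumulator
            self.tic = time.time()

        def metric_cb(self, epoch, step):
            """Build the per-iteration callback handed to async_get()."""
            def callback(outputs):
                # .ndarray() mirrors the blob API the deleted acc_acc() used.
                preds = np.argmax(outputs['predictions'].ndarray(), axis=1)
                labels = outputs['labels'].ndarray().reshape(-1)
                self.correct += np.sum(preds == labels)
                self.total += len(labels)
                if self.loss_key is not None:
                    self.loss_sum += outputs[self.loss_key].ndarray().mean()
                if (step + 1) % self.calculate_batches == 0:
                    throughput = self.calculate_batches * self.batch_size / (time.time() - self.tic)
                    print('{}: epoch {}, iter {}, accuracy: {:.6f}, samples/s: {:.3f}'.format(
                        self.desc, epoch, step, self.correct / self.total, throughput))
                    self._reset()
            return callback

One wrinkle the commit carries over unchanged: num_val_steps = args.num_val_examples / val_batch_size is a float under Python 3 division, and it is handed to Metric as calculate_batches; the deleted code had the same property in its predict_step + 1 == num_val_steps check.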
dali_e2e.sh

@@ -1,5 +1,7 @@
 rm -rf core.*
-DATA_ROOT=/mnt/13_nfs/xuan/ImageNet/mxnet
+rm -rf output/snapshots/*
+#DATA_ROOT=/mnt/13_nfs/xuan/ImageNet/mxnet
+DATA_ROOT=/ssd/ImageNet/mxnet
 #DATA_ROOT=/dataset/imagenet-mxnet
 #python3 cnn_benchmark/of_cnn_train_val.py \
 #gdb --args \

@@ -9,11 +11,13 @@ DATA_ROOT=/mnt/13_nfs/xuan/ImageNet/mxnet
     --data_train_idx=$DATA_ROOT/train.idx \
     --data_val=$DATA_ROOT/val.rec \
    --data_val_idx=$DATA_ROOT/val.idx \
+    --num_nodes=2 \
+    --node_ips='11.11.1.12,11.11.1.14' \
     --gpu_num_per_node=4 \
     --optimizer="momentum-cosine-decay" \
     --learning_rate=0.256 \
     --loss_print_every_n_iter=20 \
-    --batch_size_per_device=64 \
+    --batch_size_per_device=32 \
     --val_batch_size_per_device=125 \
     --model="resnet50"
     #--use_fp16 true \
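The script change doubles the node count while holding the global batch size fixed: assuming the previous run defaulted to a single node (the --num_nodes and --node_ips flags are new here), 1 node x 4 GPUs x 64 per device and 2 nodes x 4 GPUs x 32 per device both give a global train batch of 256, which is presumably why --learning_rate=0.256 is left unchanged. As a quick check:

    # Global-batch bookkeeping for the old vs. new launch configuration
    # (assumes the old script ran with the default num_nodes=1).
    old_global_batch = 1 * 4 * 64   # num_nodes * gpu_num_per_node * batch_size_per_device
    new_global_batch = 2 * 4 * 32
    assert old_global_batch == new_global_batch == 256  # lr=0.256 still matches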