Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Oneflow-Inc
OneFlow-Benchmark
提交
a36664b0
O
OneFlow-Benchmark
项目概览
Oneflow-Inc
/
OneFlow-Benchmark
上一次同步 接近 3 年
通知
1
Star
92
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
OneFlow-Benchmark
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
a36664b0
编写于
3月 13, 2020
作者:
S
ShawnXuan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
prepare for ofrecord
上级
238e8449
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
168 addition
and
0 deletion
+168
-0
cnn_e2e/of_cnn_train_val.py
cnn_e2e/of_cnn_train_val.py
+143
-0
of_e2e.sh
of_e2e.sh
+25
-0
未找到文件。
cnn_e2e/of_cnn_train_val.py
0 → 100755
浏览文件 @
a36664b0
# Module bootstrap: parse CLI args first, because the OneFlow job functions
# defined later in this file read `args` at import time.
from
__future__
import
absolute_import
from __future__ import division
from __future__ import print_function

import os
import time
import math
import numpy as np

import config as configs
parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)

from util import Snapshot, Summary, InitNodes, StopWatch
from dali_util import get_rec_iter
from job_function_util import get_train_config, get_val_config
import oneflow as flow
#import vgg_model
import resnet_model
#import alexnet_model

# Global batch sizes: per-device size scaled by the total device count.
total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device
val_batch_size = total_device_num * args.val_batch_size_per_device
# Presumably image_shape is (channels, height, width); the tensor defs below
# build NHWC-ordered shapes from these -- TODO confirm against config.
(C, H, W) = args.image_shape
# NOTE(review): true division (`from __future__ import division`) makes this a
# float; do_predictions() compares it to an int step with `==`, which only
# fires when num_val_examples is divisible by val_batch_size -- confirm.
num_val_steps = args.num_val_examples / val_batch_size

# Metric logger and wall-clock timer shared by the callbacks below.
summary = Summary(args.log_dir, args)
timer = StopWatch()

# Model name -> constructor; only resnet50 is enabled in this revision.
model_dict = {
    "resnet50": resnet_model.resnet50,
    #"vgg16": vgg_model.vgg16,
    #"alexnet": alexnet_model.alexnet,
}

flow.config.gpu_device_num(args.gpu_num_per_node)
flow.config.enable_debug_mode(True)
@flow.function(get_train_config(args))
def TrainNet(images=flow.FixedTensorDef((train_batch_size, H, W, C), dtype=flow.float),
             labels=flow.FixedTensorDef((train_batch_size, ), dtype=flow.int32)):
    """Training job: forward pass, softmax cross-entropy loss, loss registration.

    Returns a dict with the per-example "loss", the "softmax" scores, and the
    input "labels" (echoed back so the async callback can score accuracy).
    """
    build_model = model_dict[args.model]
    logits = build_model(images)
    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
        labels, logits, name="softmax_loss")
    #loss = flow.math.reduce_mean(loss)
    flow.losses.add_loss(loss)
    return {
        "loss": loss,
        "softmax": flow.nn.softmax(logits),
        "labels": labels,
    }
@flow.function(get_val_config(args))
def InferenceNet(images=flow.FixedTensorDef((val_batch_size, H, W, C), dtype=flow.float),
                 labels=flow.FixedTensorDef((val_batch_size, ), dtype=flow.int32)):
    """Validation job: forward pass only.

    Returns a dict with the "softmax" scores and the echoed "labels" so the
    prediction callback can accumulate top-1 accuracy.
    """
    build_model = model_dict[args.model]
    probabilities = flow.nn.softmax(build_model(images))
    return {"softmax": probabilities, "labels": labels}
def acc_acc(step, predictions):
    """Accumulate running top-1 accuracy counters on the `main` function object.

    Args:
        step: 0-based step index within the current pass; step 0 resets the
            accumulators (`main.correct`, `main.total`).
        predictions: mapping with a 'softmax' entry exposing `.ndarray()`
            (per-example class scores) and a 'labels' entry that supports
            `.reshape(-1)` -- presumably OneFlow blob results; confirm.
    """
    classifications = np.argmax(predictions['softmax'].ndarray(), axis=1)
    labels = predictions['labels'].reshape(-1)
    if step == 0:
        # New pass: reset the accumulators stored as attributes on `main`
        # (module-level mutable state shared with the callbacks).
        main.correct = 0.0
        main.total = 0.0
    # Bug fix: the original only accumulated in the `else` branch, so the
    # step-0 batch was counted in neither `correct` nor `total` and the first
    # batch of every pass was silently excluded from reported accuracy.
    main.correct += np.sum(classifications == labels)
    main.total += len(labels)
def train_callback(epoch, step):
    """Build the async callback that consumes one training step's outputs.

    The returned closure accumulates accuracy via acc_acc(), logs the loss,
    and every `args.loss_print_every_n_iter` steps prints/logs throughput and
    the running accuracy before resetting the accumulators.
    """
    def callback(train_outputs):
        acc_acc(step, train_outputs)
        loss = train_outputs['loss'].mean()
        summary.scalar('loss', loss, step)
        #summary.scalar('learning_rate', train_outputs['lr'], step)
        if (step - 1) % args.loss_print_every_n_iter != 0:
            return
        # Periodic report: samples/s over the interval since the last split.
        throughput = args.loss_print_every_n_iter * train_batch_size / timer.split()
        accuracy = main.correct / main.total
        print("epoch {}, iter {}, loss: {:.6f}, accuracy: {:.6f}, samples/s: {:.3f}".format(
            epoch, step - 1, loss, accuracy, throughput))
        summary.scalar('train_accuracy', accuracy, step)
        main.correct = 0.0
        main.total = 0.0
    return callback
def do_predictions(epoch, predict_step, predictions):
    """Accumulate validation accuracy; on the final step, log and print top-1.

    `predict_step` is 0-based; the epoch summary fires when
    predict_step + 1 == num_val_steps.
    """
    acc_acc(predict_step, predictions)
    if predict_step + 1 != num_val_steps:
        return
    assert main.total > 0
    top1 = main.correct / main.total
    summary.scalar('top1_accuracy', top1, epoch)
    #summary.scalar('top1_correct', main.correct, epoch)
    #summary.scalar('total_val_images', main.total, epoch)
    print("epoch {}, top 1 accuracy: {:.6f}, time: {:.2f}".format(
        epoch, top1, timer.split()))
def predict_callback(epoch, predict_step):
    """Return a closure that forwards one validation step's outputs to
    do_predictions() with the epoch/step baked in."""
    return lambda predictions: do_predictions(epoch, predict_step, predictions)
def main():
    """Entry point: initialize the OneFlow environment, then run the
    train/validate loop for args.num_epochs epochs, saving a model snapshot
    after each epoch.
    """
    InitNodes(args)
    flow.env.grpc_use_no_signal()
    flow.env.log_dir(args.log_dir)

    # Snapshot handles both restoring (model_load_dir) and saving (model_save_dir).
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    # DALI-backed iterators over the train/val record sets (see dali_util).
    train_data_iter, val_data_iter = get_rec_iter(args, True)
    timer.start()
    for epoch in range(args.num_epochs):
        tic = time.time()
        print('Starting epoch {} at {:.2f}'.format(epoch, tic))
        train_data_iter.reset()
        for i, batches in enumerate(train_data_iter):
            images, labels = batches
            # Asynchronous execution: train_callback consumes this step's outputs.
            TrainNet(images, labels).async_get(train_callback(epoch, i))
            # if i > 30:#debug
            #     break
        #break
        print('epoch {} training time: {:.2f}'.format(epoch, time.time() - tic))
        if args.data_val:
            tic = time.time()
            val_data_iter.reset()
            for i, batches in enumerate(val_data_iter):
                images, labels = batches
                InferenceNet(images, labels).async_get(predict_callback(epoch, i))
                #acc_acc(i, InferenceNet(images, labels.astype(np.int32)).get())
        summary.save()
        snapshot.save('epoch_{}'.format(epoch + 1))


if __name__ == "__main__":
    main()
of_e2e.sh
0 → 100755
浏览文件 @
a36664b0
rm
-rf
core.
*
DATA_ROOT
=
/mnt/13_nfs/xuan/ImageNet/mxnet
#DATA_ROOT=/dataset/imagenet-mxnet
#python3 cnn_benchmark/of_cnn_train_val.py \
#gdb --args \
#nvprof -f -o resnet.nvvp \
python3 cnn_e2e/dali_cnn_train_val.py
\
--data_train
=
$DATA_ROOT
/train.rec
\
--data_train_idx
=
$DATA_ROOT
/train.idx
\
--data_val
=
$DATA_ROOT
/val.rec
\
--data_val_idx
=
$DATA_ROOT
/val.idx
\
--gpu_num_per_node
=
4
\
--optimizer
=
"momentum-cosine-decay"
\
--learning_rate
=
0.256
\
--loss_print_every_n_iter
=
20
\
--batch_size_per_device
=
64
\
--val_batch_size_per_device
=
125
\
--model
=
"resnet50"
#--use_fp16 true \
#--weight_l2=3.0517578125e-05 \
#--num_examples=1024 \
#--optimizer="momentum-decay" \
#--data_dir="/mnt/13_nfs/xuan/ImageNet/ofrecord/train"
#--data_dir="/mnt/dataset/xuan/ImageNet/ofrecord/train"
#--warmup_iter_num=10000 \
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录