Commit 96861e7d authored by: S ShawnXuan

rm dali files

Parent daed129e
......@@ -6,7 +6,6 @@ import argparse
from datetime import datetime
import logging
#from dali_util import add_dali_args
from optimizer_util import add_optimizer_args
from ofrecord_util import add_ofrecord_args
......@@ -22,7 +21,6 @@ def get_parser(parser=None):
parser = argparse.ArgumentParser("flags for cnn benchmark")
parser.add_argument("--dtype", type=str, default='float32', help="float16 float32")
parser.add_argument("--dataloader", type=str, default='oneflow', help="oneflow or dali")
# resouce
parser.add_argument("--gpu_num_per_node", type=int, default=1)
......
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import math
import numpy as np

import config as configs

# CLI flags must be parsed before the project imports below, which read `args`
# indirectly via the job configs built at import time.
parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)

from util import Snapshot, Summary, InitNodes, Metric
from dali_util import get_rec_iter
from job_function_util import get_train_config, get_val_config
import oneflow as flow
#import vgg_model
import resnet_model
#import alexnet_model

# Cluster-wide batch sizes derived from the per-device CLI flags.
total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device
val_batch_size = total_device_num * args.val_batch_size_per_device
(C, H, W) = args.image_shape
epoch_size = math.ceil(args.num_examples / train_batch_size)
# Fix: with `from __future__ import division` this quotient is a float, but it
# is consumed as a step count by Metric. Truncate to int, matching the sibling
# validation script, which computes int(args.num_val_examples / val_batch_size).
num_val_steps = int(args.num_val_examples / val_batch_size)

# Registry of selectable model constructors, keyed by the --model flag.
model_dict = {
    "resnet50": resnet_model.resnet50,
    #"vgg16": vgg_model.vgg16,
    #"alexnet": alexnet_model.alexnet,
}

flow.config.gpu_device_num(args.gpu_num_per_node)
flow.config.enable_debug_mode(True)
@flow.function(get_train_config(args))
def TrainNet(images=flow.FixedTensorDef((train_batch_size, H, W, C), dtype=flow.float),
             labels=flow.FixedTensorDef((train_batch_size, ), dtype=flow.int32)):
    """Training job: forward pass through the selected model, mean softmax
    cross-entropy loss (registered for the backward pass), and softmax
    predictions for metric reporting.
    """
    logits = model_dict[args.model](images)
    raw_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
        labels, logits, name="softmax_loss")
    mean_loss = flow.math.reduce_mean(raw_loss)
    flow.losses.add_loss(mean_loss)
    return {
        "loss": mean_loss,
        "predictions": flow.nn.softmax(logits),
        "labels": labels,
    }
@flow.function(get_val_config(args))
def InferenceNet(images=flow.FixedTensorDef((val_batch_size, H, W, C), dtype=flow.float),
                 labels=flow.FixedTensorDef((val_batch_size, ), dtype=flow.int32)):
    """Validation job: forward pass only; returns softmax predictions and labels."""
    probs = flow.nn.softmax(model_dict[args.model](images))
    return {"predictions": probs, "labels": labels}
def main():
    """Driver: initialize the OneFlow environment, then alternate DALI-fed
    training and (optional) validation for args.num_epochs epochs, saving a
    model snapshot after every epoch.
    """
    InitNodes(args)
    flow.env.grpc_use_no_signal()
    flow.env.log_dir(args.log_dir)

    summary = Summary(args.log_dir, args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
    train_data_iter, val_data_iter = get_rec_iter(args, True)

    for epoch in range(args.num_epochs):
        train_metric = Metric(
            desc='train', calculate_batches=args.loss_print_every_n_iter,
            summary=summary, save_summary_steps=epoch_size,
            batch_size=train_batch_size, loss_key='loss')
        train_data_iter.reset()
        for step, (images, labels) in enumerate(train_data_iter):
            # async_get: results are delivered to the metric callback off the
            # main thread so the input pipeline keeps feeding the device.
            TrainNet(images, labels).async_get(train_metric.metric_cb(epoch, step))

        if args.data_val:
            val_metric = Metric(
                desc='validation', calculate_batches=num_val_steps,
                summary=summary, save_summary_steps=num_val_steps,
                batch_size=val_batch_size)
            val_data_iter.reset()
            for step, (images, labels) in enumerate(val_data_iter):
                InferenceNet(images, labels).async_get(val_metric.metric_cb(epoch, step))

        snapshot.save('epoch_{}'.format(epoch))


if __name__ == "__main__":
    main()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import math
import numpy as np
import config as configs
# CLI flags must be parsed before the project imports below, which read `args`.
parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)
from util import Snapshot, Summary, InitNodes, Metric
from dali_util import get_rec_iter
from job_function_util import get_train_config, get_val_config
import oneflow as flow
#import vgg_model
import resnet_model
#import alexnet_model
# Cluster-wide batch sizes derived from the per-device CLI flags.
total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device
val_batch_size = total_device_num * args.val_batch_size_per_device
(C, H, W) = args.image_shape
# Floor division: a trailing partial validation batch is dropped.
num_val_steps = int(args.num_val_examples / val_batch_size)
# Registry of selectable model constructors, keyed by the --model flag.
model_dict = {
    "resnet50": resnet_model.resnet50,
    #"vgg16": vgg_model.vgg16,
    #"alexnet": alexnet_model.alexnet,
}
flow.config.gpu_device_num(args.gpu_num_per_node)
flow.config.enable_debug_mode(True)
@flow.function(get_val_config(args))
def InferenceNet(images=flow.FixedTensorDef((val_batch_size, H, W, C), dtype=flow.float),
                 labels=flow.FixedTensorDef((val_batch_size, ), dtype=flow.int32)):
    """Validation job: forward pass plus softmax; no loss, no backward."""
    logits = model_dict[args.model](images)
    return {"predictions": flow.nn.softmax(logits), "labels": labels}
def main():
    """Evaluate each per-epoch snapshot under args.model_load_dir against the
    DALI validation stream, persisting the accumulated summary.
    """
    InitNodes(args)
    assert args.model_load_dir, 'must have model load dir'
    flow.env.grpc_use_no_signal()
    flow.env.log_dir(args.log_dir)

    summary = Summary(args.log_dir, args)
    train_data_iter, val_data_iter = get_rec_iter(args, True)

    for epoch in range(args.num_epochs):
        # Load the checkpoint that training wrote for this epoch.
        epoch_dir = os.path.join(args.model_load_dir, 'snapshot_epoch_{}'.format(epoch))
        snapshot = Snapshot(args.model_save_dir, epoch_dir)

        metric = Metric(desc='validation', calculate_batches=num_val_steps,
                        summary=summary, save_summary_steps=num_val_steps,
                        batch_size=val_batch_size)
        val_data_iter.reset()
        for step, (images, labels) in enumerate(val_data_iter):
            # Reverse the channel axis (RGB<->BGR swap) -- presumably to match
            # the channel order the checkpoint was trained with; verify upstream.
            images = images[:, :, :, ::-1]
            InferenceNet(images, labels).async_get(metric.metric_cb(epoch, step))
        summary.save()


if __name__ == "__main__":
    main()
This diff has been collapsed.
import config as configs
# Shared CLI flags must be parsed before the project imports below read them.
parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)
from dali_util import get_rec_iter
import numpy as np
import ofrecord_util
from job_function_util import get_val_config
import oneflow as flow
from PIL import Image
flow.config.gpu_device_num(args.gpu_num_per_node)
flow.config.enable_debug_mode(True)
@flow.function(get_val_config(args))
def InferenceNet():
    """Pull one validation batch through OneFlow's native OFRecord loader."""
    labels, images = ofrecord_util.load_imagenet_for_validation(args)
    return images, labels
def save_bmp(array, filepath):
    """Write an image array to `filepath` via Pillow, printing its dtype first."""
    print(array.dtype)
    Image.fromarray(array).save(filepath)
if __name__ == '__main__':
    # Debug utility: dump DALI's pipeline output and OneFlow's native OFRecord
    # loader output side by side for manual comparison of the two data paths.
    train_data_iter, val_data_iter = get_rec_iter(args, True)
    #train_data_iter.reset()
    for i, batches in enumerate(val_data_iter):
        images, labels = batches
        print(labels)
        # NOTE(review): each iteration overwrites the same files, so only the
        # LAST batch survives -- confirm whether an early `break` was intended.
        np.save('output/dali_val_data.npy', images)
        save_bmp(images[0], 'output/dali_val.bmp')
    for i, batches in enumerate(train_data_iter):
        images, labels = batches
        print(labels)
        # Same overwrite-per-iteration pattern as the validation loop above.
        np.save('output/dali_train_data.npy', images)
    # One batch from OneFlow's own loader, for comparison with the DALI dumps.
    images, labels = InferenceNet().get()
    images = images.ndarray().astype(np.uint8)
    np.save('output/of_val_data.npy', images)
    save_bmp(images[0], 'output/of_val.bmp')
\ No newline at end of file
# Launch script: 2-node x 4-GPU DALI-fed ResNet50 training on the MXNet
# RecordIO copy of ImageNet.

# Clean up crash dumps and snapshots left by previous runs.
rm -rf core.*
rm -rf output/snapshots/*

# Dataset root (alternate locations kept for reference).
#DATA_ROOT=/mnt/13_nfs/xuan/ImageNet/mxnet
DATA_ROOT=/ssd/ImageNet/mxnet
#DATA_ROOT=/dataset/imagenet-mxnet

# Optional wrappers for debugging/profiling, kept for convenience.
#python3 cnn_benchmark/of_cnn_train_val.py \
#gdb --args \
#nvprof -f -o resnet.nvvp \
python3 cnn_e2e/dali_cnn_train_val.py \
    --data_train=$DATA_ROOT/train.rec \
    --data_train_idx=$DATA_ROOT/train.idx \
    --data_val=$DATA_ROOT/val.rec \
    --data_val_idx=$DATA_ROOT/val.idx \
    --num_nodes=2 \
    --node_ips='11.11.1.12,11.11.1.14' \
    --gpu_num_per_node=4 \
    --optimizer="momentum-cosine-decay" \
    --learning_rate=0.256 \
    --loss_print_every_n_iter=20 \
    --batch_size_per_device=32 \
    --val_batch_size_per_device=125 \
    --model="resnet50"

# Flags kept for reference / occasional use:
#--use_fp16 true \
#--weight_l2=3.0517578125e-05 \
#--num_examples=1024 \
#--optimizer="momentum-decay" \
#--data_dir="/mnt/13_nfs/xuan/ImageNet/ofrecord/train"
#--data_dir="/mnt/dataset/xuan/ImageNet/ofrecord/train"
#--warmup_iter_num=10000 \
# Launch script: single-node x 4-GPU validation of saved snapshots with the
# DALI validation pipeline.

# Clean up crash dumps left by previous runs.
rm -rf core.*

DATA_ROOT=/mnt/13_nfs/xuan/ImageNet/mxnet

# Optional wrappers for debugging/profiling, kept for convenience.
#gdb --args \
#nvprof -f -o resnet.nvvp \
python3 cnn_e2e/dali_cnn_val.py \
    --model_load_dir=output/models \
    --data_train=$DATA_ROOT/train.rec \
    --data_train_idx=$DATA_ROOT/train.idx \
    --data_val=$DATA_ROOT/val.rec \
    --data_val_idx=$DATA_ROOT/val.idx \
    --num_nodes=1 \
    --node_ips='11.11.1.13,11.11.1.14' \
    --gpu_num_per_node=4 \
    --optimizer="momentum-cosine-decay" \
    --learning_rate=0.256 \
    --loss_print_every_n_iter=20 \
    --batch_size_per_device=32 \
    --val_batch_size_per_device=125 \
    --model="resnet50"

# Flags kept for reference / occasional use:
#--use_fp16 true \
#--weight_l2=3.0517578125e-05 \
#--num_examples=1024 \
#--optimizer="momentum-decay" \
#--data_dir="/mnt/13_nfs/xuan/ImageNet/ofrecord/train"
#--data_dir="/mnt/dataset/xuan/ImageNet/ofrecord/train"
#--warmup_iter_num=10000 \
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register.