提交 1f53172c 编写于 作者: S ShawnXuan

fix

上级 28556b9c
......@@ -14,7 +14,7 @@
import numpy as np
import time
import logging
#import logging
import warnings
from nvidia import dali
from nvidia.dali.pipeline import Pipeline
......@@ -68,7 +68,8 @@ class HybridTrainPipe(Pipeline):
self.resize = ops.RandomResizedCrop(device=dali_resize_device, size=crop_shape)
self.cmnp = ops.CropMirrorNormalize(device=dali_resize_device, #"gpu",
#self.cmnp = ops.CropMirrorNormalize(device=dali_resize_device, #"gpu",
self.cmnp = ops.CropMirrorNormalize(device="gpu",
output_dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT,
output_layout=output_layout, crop=crop_shape, pad_output=pad_output,
image_type=types.RGB, mean=args.rgb_mean, std=args.rgb_std)
......@@ -103,7 +104,8 @@ class HybridValPipe(Pipeline):
host_memory_padding=nvjpeg_padding)
print(dali_device)
self.resize = ops.Resize(device=dali_device, resize_shorter=resize_shp) if resize_shp else None
self.cmnp = ops.CropMirrorNormalize(device=dali_device,#"gpu",
#self.cmnp = ops.CropMirrorNormalize(device=dali_device,#"gpu",
self.cmnp = ops.CropMirrorNormalize(device="gpu",
output_dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT,
output_layout=output_layout, crop=crop_shape, pad_output=pad_output,
image_type=types.RGB, mean=args.rgb_mean, std=args.rgb_std)
......@@ -274,7 +276,7 @@ class DALIGenericIterator(object):
with p._check_api_type_scope(types.PipelineAPIType.ITERATOR):
p.schedule_run()
else:
logging.warning("DALI iterator does not support resetting while epoch is not finished. Ignoring...")
print("DALI iterator does not support resetting while epoch is not finished. Ignoring...")
def get_rec_iter(args, dali_cpu=False, todo=True):
......
......@@ -6,7 +6,6 @@ import os
import time
import math
import numpy as np
import logging
import oneflow as flow
......@@ -170,25 +169,31 @@ def main():
train_data_iter, val_data_iter = get_rec_iter(args, True)
timer.start()
for epoch in range(args.num_epochs):
print('Starting epoch {}'.format(epoch))
tic = time.time()
print('Starting epoch {} at {:.2f}'.format(epoch, tic))
train_data_iter.reset()
for i, batches in enumerate(train_data_iter):
assert len(batches) == 1
images, labels = batches[0]
TrainNet(images, labels.astype(np.int32)).async_get(train_callback(epoch, i))
if i > 30:#debug
break
break
print(time.time() - tic)
# if i > 30:#debug
# break
#break
print('epoch {} training time: {:.2f}'.format(epoch, time.time() - tic))
if args.data_val:
tic = time.time()
val_data_iter.reset()
for i, batches in enumerate(val_data_iter):
assert len(batches) == 1
images, labels = batches[0]
InferenceNet(images, labels.astype(np.int32)).async_get(predict_callback(epoch, i))
print(time.time() - tic)
#InferenceNet(images, labels.astype(np.int32)).async_get(predict_callback(epoch, i))
acc_acc(i, InferenceNet(images, labels.astype(np.int32)).get())
assert main.total > 0
top1_accuracy = main.correct/main.total
summary.scalar('top1_accuracy', top1_accuracy, epoch)
print("epoch {}, top 1 accuracy: {:.6f}, val_time: {:.2f}".format(epoch, top1_accuracy,
time.time()-tic))
snapshot.save('epoch_{}'.format(epoch+1))
......
......@@ -39,7 +39,7 @@ def _batch_norm(inputs, name=None, trainable=True):
inputs=inputs,
axis=1,
momentum=0.9,#97,
epsilon=1e05,#1.001e-5,
epsilon=1e-05,#1.001e-5,
center=True,
scale=True,
trainable=trainable,
......
......@@ -12,9 +12,9 @@ DATA_ROOT=/dataset/imagenet-mxnet
--optimizer="momentum-cosine-decay" \
--weight_l2=3.0517578125e-05 \
--learning_rate=0.256 \
--loss_print_every_n_iter=10 \
--loss_print_every_n_iter=20 \
--batch_size_per_device=64 \
--val_batch_size_per_device=100 \
--val_batch_size_per_device=125 \
--model="resnet50"
#--weight_l2=3.0517578125e-05 \
#--num_examples=1024 \
......
......@@ -3,10 +3,10 @@ rm -rf core.*
#DATA_ROOT=/mnt/13_nfs/xuan/ImageNet
DATA_ROOT=/dataset/imagenet-mxnet
python cnn_benchmark/dali.py \
--data_train=$DATA_ROOT/mxnet/train.rec \
--data_train_idx=$DATA_ROOT/mxnet/train.idx \
--data_val=$DATA_ROOT/mxnet/val.rec \
--data_val_idx=$DATA_ROOT/mxnet/val.idx \
--data_train=$DATA_ROOT/train.rec \
--data_train_idx=$DATA_ROOT/train.idx \
--data_val=$DATA_ROOT/val.rec \
--data_val_idx=$DATA_ROOT/val.idx \
--val_batch_size_per_device=20 \
--gpu_num_per_node=4 \
--num_examples=1024 \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册