提交 1f53172c 编写于 作者: S ShawnXuan

fix

上级 28556b9c
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
import numpy as np import numpy as np
import time import time
import logging #import logging
import warnings import warnings
from nvidia import dali from nvidia import dali
from nvidia.dali.pipeline import Pipeline from nvidia.dali.pipeline import Pipeline
...@@ -68,7 +68,8 @@ class HybridTrainPipe(Pipeline): ...@@ -68,7 +68,8 @@ class HybridTrainPipe(Pipeline):
self.resize = ops.RandomResizedCrop(device=dali_resize_device, size=crop_shape) self.resize = ops.RandomResizedCrop(device=dali_resize_device, size=crop_shape)
self.cmnp = ops.CropMirrorNormalize(device=dali_resize_device, #"gpu", #self.cmnp = ops.CropMirrorNormalize(device=dali_resize_device, #"gpu",
self.cmnp = ops.CropMirrorNormalize(device="gpu",
output_dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT, output_dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT,
output_layout=output_layout, crop=crop_shape, pad_output=pad_output, output_layout=output_layout, crop=crop_shape, pad_output=pad_output,
image_type=types.RGB, mean=args.rgb_mean, std=args.rgb_std) image_type=types.RGB, mean=args.rgb_mean, std=args.rgb_std)
...@@ -103,7 +104,8 @@ class HybridValPipe(Pipeline): ...@@ -103,7 +104,8 @@ class HybridValPipe(Pipeline):
host_memory_padding=nvjpeg_padding) host_memory_padding=nvjpeg_padding)
print(dali_device) print(dali_device)
self.resize = ops.Resize(device=dali_device, resize_shorter=resize_shp) if resize_shp else None self.resize = ops.Resize(device=dali_device, resize_shorter=resize_shp) if resize_shp else None
self.cmnp = ops.CropMirrorNormalize(device=dali_device,#"gpu", #self.cmnp = ops.CropMirrorNormalize(device=dali_device,#"gpu",
self.cmnp = ops.CropMirrorNormalize(device="gpu",
output_dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT, output_dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT,
output_layout=output_layout, crop=crop_shape, pad_output=pad_output, output_layout=output_layout, crop=crop_shape, pad_output=pad_output,
image_type=types.RGB, mean=args.rgb_mean, std=args.rgb_std) image_type=types.RGB, mean=args.rgb_mean, std=args.rgb_std)
...@@ -274,7 +276,7 @@ class DALIGenericIterator(object): ...@@ -274,7 +276,7 @@ class DALIGenericIterator(object):
with p._check_api_type_scope(types.PipelineAPIType.ITERATOR): with p._check_api_type_scope(types.PipelineAPIType.ITERATOR):
p.schedule_run() p.schedule_run()
else: else:
logging.warning("DALI iterator does not support resetting while epoch is not finished. Ignoring...") print("DALI iterator does not support resetting while epoch is not finished. Ignoring...")
def get_rec_iter(args, dali_cpu=False, todo=True): def get_rec_iter(args, dali_cpu=False, todo=True):
......
...@@ -6,7 +6,6 @@ import os ...@@ -6,7 +6,6 @@ import os
import time import time
import math import math
import numpy as np import numpy as np
import logging
import oneflow as flow import oneflow as flow
...@@ -170,25 +169,31 @@ def main(): ...@@ -170,25 +169,31 @@ def main():
train_data_iter, val_data_iter = get_rec_iter(args, True) train_data_iter, val_data_iter = get_rec_iter(args, True)
timer.start() timer.start()
for epoch in range(args.num_epochs): for epoch in range(args.num_epochs):
print('Starting epoch {}'.format(epoch))
tic = time.time() tic = time.time()
print('Starting epoch {} at {:.2f}'.format(epoch, tic))
train_data_iter.reset() train_data_iter.reset()
for i, batches in enumerate(train_data_iter): for i, batches in enumerate(train_data_iter):
assert len(batches) == 1 assert len(batches) == 1
images, labels = batches[0] images, labels = batches[0]
TrainNet(images, labels.astype(np.int32)).async_get(train_callback(epoch, i)) TrainNet(images, labels.astype(np.int32)).async_get(train_callback(epoch, i))
if i > 30:#debug # if i > 30:#debug
break # break
break #break
print(time.time() - tic) print('epoch {} training time: {:.2f}'.format(epoch, time.time() - tic))
if args.data_val: if args.data_val:
tic = time.time() tic = time.time()
val_data_iter.reset() val_data_iter.reset()
for i, batches in enumerate(val_data_iter): for i, batches in enumerate(val_data_iter):
assert len(batches) == 1 assert len(batches) == 1
images, labels = batches[0] images, labels = batches[0]
InferenceNet(images, labels.astype(np.int32)).async_get(predict_callback(epoch, i)) #InferenceNet(images, labels.astype(np.int32)).async_get(predict_callback(epoch, i))
print(time.time() - tic) acc_acc(i, InferenceNet(images, labels.astype(np.int32)).get())
assert main.total > 0
top1_accuracy = main.correct/main.total
summary.scalar('top1_accuracy', top1_accuracy, epoch)
print("epoch {}, top 1 accuracy: {:.6f}, val_time: {:.2f}".format(epoch, top1_accuracy,
time.time()-tic))
snapshot.save('epoch_{}'.format(epoch+1)) snapshot.save('epoch_{}'.format(epoch+1))
......
...@@ -39,7 +39,7 @@ def _batch_norm(inputs, name=None, trainable=True): ...@@ -39,7 +39,7 @@ def _batch_norm(inputs, name=None, trainable=True):
inputs=inputs, inputs=inputs,
axis=1, axis=1,
momentum=0.9,#97, momentum=0.9,#97,
epsilon=1e05,#1.001e-5, epsilon=1e-05,#1.001e-5,
center=True, center=True,
scale=True, scale=True,
trainable=trainable, trainable=trainable,
......
...@@ -12,9 +12,9 @@ DATA_ROOT=/dataset/imagenet-mxnet ...@@ -12,9 +12,9 @@ DATA_ROOT=/dataset/imagenet-mxnet
--optimizer="momentum-cosine-decay" \ --optimizer="momentum-cosine-decay" \
--weight_l2=3.0517578125e-05 \ --weight_l2=3.0517578125e-05 \
--learning_rate=0.256 \ --learning_rate=0.256 \
--loss_print_every_n_iter=10 \ --loss_print_every_n_iter=20 \
--batch_size_per_device=64 \ --batch_size_per_device=64 \
--val_batch_size_per_device=100 \ --val_batch_size_per_device=125 \
--model="resnet50" --model="resnet50"
#--weight_l2=3.0517578125e-05 \ #--weight_l2=3.0517578125e-05 \
#--num_examples=1024 \ #--num_examples=1024 \
......
...@@ -3,10 +3,10 @@ rm -rf core.* ...@@ -3,10 +3,10 @@ rm -rf core.*
#DATA_ROOT=/mnt/13_nfs/xuan/ImageNet #DATA_ROOT=/mnt/13_nfs/xuan/ImageNet
DATA_ROOT=/dataset/imagenet-mxnet DATA_ROOT=/dataset/imagenet-mxnet
python cnn_benchmark/dali.py \ python cnn_benchmark/dali.py \
--data_train=$DATA_ROOT/mxnet/train.rec \ --data_train=$DATA_ROOT/train.rec \
--data_train_idx=$DATA_ROOT/mxnet/train.idx \ --data_train_idx=$DATA_ROOT/train.idx \
--data_val=$DATA_ROOT/mxnet/val.rec \ --data_val=$DATA_ROOT/val.rec \
--data_val_idx=$DATA_ROOT/mxnet/val.idx \ --data_val_idx=$DATA_ROOT/val.idx \
--val_batch_size_per_device=20 \ --val_batch_size_per_device=20 \
--gpu_num_per_node=4 \ --gpu_num_per_node=4 \
--num_examples=1024 \ --num_examples=1024 \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册