From 1f53172cad9759cb94767c2c5c626ba37c043fa9 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Sat, 8 Feb 2020 11:03:36 +0800 Subject: [PATCH] fix --- cnn_benchmark/dali.py | 10 ++++++---- cnn_benchmark/of_cnn_train_val.py | 21 +++++++++++++-------- cnn_benchmark/resnet_model.py | 2 +- run.sh | 4 ++-- test.sh | 8 ++++---- 5 files changed, 26 insertions(+), 19 deletions(-) diff --git a/cnn_benchmark/dali.py b/cnn_benchmark/dali.py index ef562d6..fdc00cc 100644 --- a/cnn_benchmark/dali.py +++ b/cnn_benchmark/dali.py @@ -14,7 +14,7 @@ import numpy as np import time -import logging +#import logging import warnings from nvidia import dali from nvidia.dali.pipeline import Pipeline @@ -68,7 +68,8 @@ class HybridTrainPipe(Pipeline): self.resize = ops.RandomResizedCrop(device=dali_resize_device, size=crop_shape) - self.cmnp = ops.CropMirrorNormalize(device=dali_resize_device, #"gpu", + #self.cmnp = ops.CropMirrorNormalize(device=dali_resize_device, #"gpu", + self.cmnp = ops.CropMirrorNormalize(device="gpu", output_dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT, output_layout=output_layout, crop=crop_shape, pad_output=pad_output, image_type=types.RGB, mean=args.rgb_mean, std=args.rgb_std) @@ -103,7 +104,8 @@ class HybridValPipe(Pipeline): host_memory_padding=nvjpeg_padding) print(dali_device) self.resize = ops.Resize(device=dali_device, resize_shorter=resize_shp) if resize_shp else None - self.cmnp = ops.CropMirrorNormalize(device=dali_device,#"gpu", + #self.cmnp = ops.CropMirrorNormalize(device=dali_device,#"gpu", + self.cmnp = ops.CropMirrorNormalize(device="gpu", output_dtype=types.FLOAT16 if dtype == 'float16' else types.FLOAT, output_layout=output_layout, crop=crop_shape, pad_output=pad_output, image_type=types.RGB, mean=args.rgb_mean, std=args.rgb_std) @@ -274,7 +276,7 @@ class DALIGenericIterator(object): with p._check_api_type_scope(types.PipelineAPIType.ITERATOR): p.schedule_run() else: - logging.warning("DALI iterator does not support resetting while epoch is not finished. Ignoring...") + print("DALI iterator does not support resetting while epoch is not finished. Ignoring...") def get_rec_iter(args, dali_cpu=False, todo=True): diff --git a/cnn_benchmark/of_cnn_train_val.py b/cnn_benchmark/of_cnn_train_val.py index 58bec3c..ffe9e34 100755 --- a/cnn_benchmark/of_cnn_train_val.py +++ b/cnn_benchmark/of_cnn_train_val.py @@ -6,7 +6,6 @@ import os import time import math import numpy as np -import logging import oneflow as flow @@ -170,25 +169,31 @@ def main(): train_data_iter, val_data_iter = get_rec_iter(args, True) timer.start() for epoch in range(args.num_epochs): - print('Starting epoch {}'.format(epoch)) tic = time.time() + print('Starting epoch {} at {:.2f}'.format(epoch, tic)) train_data_iter.reset() for i, batches in enumerate(train_data_iter): assert len(batches) == 1 images, labels = batches[0] TrainNet(images, labels.astype(np.int32)).async_get(train_callback(epoch, i)) - if i > 30:#debug - break - break - print(time.time() - tic) + # if i > 30:#debug + # break + #break + print('epoch {} training time: {:.2f}'.format(epoch, time.time() - tic)) if args.data_val: tic = time.time() val_data_iter.reset() for i, batches in enumerate(val_data_iter): assert len(batches) == 1 images, labels = batches[0] - InferenceNet(images, labels.astype(np.int32)).async_get(predict_callback(epoch, i)) - print(time.time() - tic) + #InferenceNet(images, labels.astype(np.int32)).async_get(predict_callback(epoch, i)) + acc_acc(i, InferenceNet(images, labels.astype(np.int32)).get()) + + assert main.total > 0 + top1_accuracy = main.correct/main.total + summary.scalar('top1_accuracy', top1_accuracy, epoch) + print("epoch {}, top 1 accuracy: {:.6f}, val_time: {:.2f}".format(epoch, top1_accuracy, + time.time()-tic)) snapshot.save('epoch_{}'.format(epoch+1)) diff --git a/cnn_benchmark/resnet_model.py b/cnn_benchmark/resnet_model.py index 46572d6..495a1d6 100755 --- a/cnn_benchmark/resnet_model.py +++ b/cnn_benchmark/resnet_model.py @@ -39,7 +39,7 @@ def _batch_norm(inputs, name=None, trainable=True): inputs=inputs, axis=1, momentum=0.9,#97, - epsilon=1e05,#1.001e-5, + epsilon=1e-05,#1.001e-5, center=True, scale=True, trainable=trainable, diff --git a/run.sh b/run.sh index 792292d..47c96e8 100755 --- a/run.sh +++ b/run.sh @@ -12,9 +12,9 @@ DATA_ROOT=/dataset/imagenet-mxnet --optimizer="momentum-cosine-decay" \ --weight_l2=3.0517578125e-05 \ --learning_rate=0.256 \ - --loss_print_every_n_iter=10 \ + --loss_print_every_n_iter=20 \ --batch_size_per_device=64 \ - --val_batch_size_per_device=100 \ + --val_batch_size_per_device=125 \ --model="resnet50" #--weight_l2=3.0517578125e-05 \ #--num_examples=1024 \ diff --git a/test.sh b/test.sh index 0e9bcd0..ddcfbe7 100755 --- a/test.sh +++ b/test.sh @@ -3,10 +3,10 @@ rm -rf core.* #DATA_ROOT=/mnt/13_nfs/xuan/ImageNet DATA_ROOT=/dataset/imagenet-mxnet python cnn_benchmark/dali.py \ - --data_train=$DATA_ROOT/mxnet/train.rec \ - --data_train_idx=$DATA_ROOT/mxnet/train.idx \ - --data_val=$DATA_ROOT/mxnet/val.rec \ - --data_val_idx=$DATA_ROOT/mxnet/val.idx \ + --data_train=$DATA_ROOT/train.rec \ + --data_train_idx=$DATA_ROOT/train.idx \ + --data_val=$DATA_ROOT/val.rec \ + --data_val_idx=$DATA_ROOT/val.idx \ --val_batch_size_per_device=20 \ --gpu_num_per_node=4 \ --num_examples=1024 \ -- GitLab