From 42eb23d3b6ba67849c091ad75abc918725c99102 Mon Sep 17 00:00:00 2001 From: Bai Yifan Date: Tue, 15 Dec 2020 10:13:22 +0800 Subject: [PATCH] Fix dygraph quant demo issues (#543) --- demo/dygraph/quant/README.md | 2 +- demo/dygraph/quant/train.py | 64 ++++++++++++++++++++++++++++-------- 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/demo/dygraph/quant/README.md b/demo/dygraph/quant/README.md index c8a5337b..c10068b4 100755 --- a/demo/dygraph/quant/README.md +++ b/demo/dygraph/quant/README.md @@ -82,7 +82,7 @@ quanter.save_quantized_model(net, 'save_dir', input_spec=[paddle.static.InputSpe # 单卡训练 python train.py --lr=0.001 --use_pact=True --num_epochs=30 --l2_decay=2e-5 --ls_epsilon=0.1 # 多卡训练,以0到3号卡为例 - python -m paddle.distributed.launch --gpus="0,1,2,3" train.py --lr=0.001 --use_pact=True --num_epochs=60 --l2_decay=2e-5 --ls_epsilon=0.1 + python -m paddle.distributed.launch --gpus="0,1,2,3" train.py --lr=0.001 --use_pact=True --num_epochs=30 --l2_decay=2e-5 --ls_epsilon=0.1 ``` diff --git a/demo/dygraph/quant/train.py b/demo/dygraph/quant/train.py index 8ab47a8c..8b02cc2a 100644 --- a/demo/dygraph/quant/train.py +++ b/demo/dygraph/quant/train.py @@ -43,7 +43,7 @@ _logger = get_logger(__name__, level=logging.INFO) parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable -add_arg('batch_size', int, 256, "Minibatch size.") +add_arg('batch_size', int, 256, "Single Card Minibatch size.") add_arg('use_gpu', bool, True, "Whether to use GPU or not.") add_arg('model', str, "mobilenet_v3", "The target model.") add_arg('pretrained_model', str, "MobileNetV3_large_x1_0_ssld_pretrained", "Whether to use pretrained model.") @@ -57,7 +57,7 @@ add_arg('num_epochs', int, 1, add_arg('total_images', int, 1281167, "The number of total training images.") add_arg('data', str, "imagenet", "Which data to use. 'mnist' or 'imagenet'") add_arg('log_period', int, 10, "Log period in batches.") -add_arg('model_save_dir', str, "./", "model save directory.") +add_arg('model_save_dir', str, "./output_models", "model save directory.") parser.add_argument('--step_epochs', nargs='+', type=int, default=[10, 20, 30], help="piecewise decay step") # yapf: enable @@ -110,12 +110,12 @@ def compress(args): if use_data_parallel: paddle.distributed.init_parallel_env() + pretrain = True if args.data == "imagenet" else False if args.model == "mobilenet_v1": - pretrain = True if args.data == "imagenet" else False - net = mobilenet_v1(pretrained=pretrained) + net = mobilenet_v1(pretrained=pretrain) elif args.model == "mobilenet_v3": net = MobileNetV3_large_x1_0() - if args.data == "imagenet": + if pretrain: load_dygraph_pretrain(net, args.pretrained_model, True) else: raise ValueError("{} is not supported.".format(args.model)) @@ -190,25 +190,43 @@ def compress(args): batch_id = 0 acc_top1_ns = [] acc_top5_ns = [] + + eval_reader_cost = 0.0 + eval_run_cost = 0.0 + total_samples = 0 + reader_start = time.time() for data in valid_loader(): + eval_reader_cost += time.time() - reader_start image = data[0] label = data[1] - start_time = time.time() + + eval_start = time.time() out = net(image) acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1) acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5) - end_time = time.time() + eval_run_cost += time.time() - eval_start + batch_size = image.shape[0] + total_samples += batch_size + if batch_id % args.log_period == 0: + log_period = 1 if batch_id == 0 else args.log_period _logger.info( - "Eval epoch[{}] batch[{}] - acc_top1: {:.6f}; acc_top5: {:.6f}; time: {:.3f}". + "Eval epoch[{}] batch[{}] - top1: {:.6f}; top5: {:.6f}; avg_reader_cost: {:.6f} s, avg_batch_cost: {:.6f} s, avg_samples: {}, avg_ips: {:.3f} images/s". format(epoch, batch_id, np.mean(acc_top1.numpy()), - np.mean(acc_top5.numpy()), end_time - start_time)) + np.mean(acc_top5.numpy()), eval_reader_cost / + log_period, (eval_reader_cost + eval_run_cost) / + log_period, total_samples / log_period, total_samples + / (eval_reader_cost + eval_run_cost))) + eval_reader_cost = 0.0 + eval_run_cost = 0.0 + total_samples = 0 acc_top1_ns.append(np.mean(acc_top1.numpy())) acc_top5_ns.append(np.mean(acc_top5.numpy())) batch_id += 1 + reader_start = time.time() _logger.info( "Final eval epoch[{}] - acc_top1: {:.6f}; acc_top5: {:.6f}".format( @@ -234,11 +252,18 @@ def compress(args): net.train() batch_id = 0 + + train_reader_cost = 0.0 + train_run_cost = 0.0 + total_samples = 0 + reader_start = time.time() for data in train_loader(): + train_reader_cost += time.time() - reader_start + image = data[0] label = data[1] - start_time = time.time() + train_start = time.time() out = net(image) avg_cost = cross_entropy(out, label, args.ls_epsilon) @@ -253,14 +278,25 @@ def compress(args): acc_top1_n = np.mean(acc_top1.numpy()) acc_top5_n = np.mean(acc_top5.numpy()) - end_time = time.time() + train_run_cost += time.time() - train_start + batch_size = image.shape[0] + total_samples += batch_size + if batch_id % args.log_period == 0: + log_period = 1 if batch_id == 0 else args.log_period _logger.info( - "epoch[{}]-batch[{}] lr: {:.6f} - loss: {:.6f}; acc_top1: {:.6f}; acc_top5: {:.6f}; time: {:.3f}". + "epoch[{}]-batch[{}] lr: {:.6f} - loss: {:.6f}; top1: {:.6f}; top5: {:.6f}; avg_reader_cost: {:.6f} s, avg_batch_cost: {:.6f} s, avg_samples: {}, avg_ips: {:.3f} images/s". format(epoch, batch_id, - lr.get_lr(), loss_n, acc_top1_n, acc_top5_n, end_time - - start_time)) + lr.get_lr(), loss_n, acc_top1_n, acc_top5_n, + train_reader_cost / log_period, ( + train_reader_cost + train_run_cost) / log_period, + total_samples / log_period, total_samples / ( + train_reader_cost + train_run_cost))) + train_reader_cost = 0.0 + train_run_cost = 0.0 + total_samples = 0 batch_id += 1 + reader_start = time.time() ############################################################################################################ # train loop -- GitLab