From 5469c08188eec67d84ae4ea3df28e02ae775bee9 Mon Sep 17 00:00:00 2001
From: dzhwinter
Date: Mon, 9 Apr 2018 12:39:19 +0800
Subject: [PATCH] "add auto feature" (#9760)

---
 benchmark/fluid/machine_translation.py  | 66 +++++++++++++-----
 benchmark/fluid/mnist.py                | 64 ++++++++++++------
 benchmark/fluid/resnet.py               | 38 ++++-------
 benchmark/fluid/run.sh                  | 70 +++++++++++++++++--
 benchmark/fluid/stacked_dynamic_lstm.py | 89 ++++++++++++++++---------
 benchmark/fluid/vgg.py                  | 16 +++--
 6 files changed, 236 insertions(+), 107 deletions(-)

diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/machine_translation.py
index cc31d098328..d7a421c1097 100644
--- a/benchmark/fluid/machine_translation.py
+++ b/benchmark/fluid/machine_translation.py
@@ -48,6 +48,13 @@ parser.add_argument(
     type=int,
     default=16,
     help="The sequence number of a mini-batch data. (default: %(default)d)")
+parser.add_argument(
+    '--skip_batch_num',
+    type=int,
+    default=5,
+    help='The number of initial mini-batches to skip as warm-up, for a better performance test')
+parser.add_argument(
+    '--iterations', type=int, default=80, help='The number of minibatches.')
 parser.add_argument(
     "--dict_size",
     type=int,
@@ -72,16 +79,21 @@ parser.add_argument(
     default=3,
     help="The width for beam searching. (default: %(default)d)")
 parser.add_argument(
-    "--use_gpu",
-    type=distutils.util.strtobool,
-    default=True,
-    help="Whether to use gpu. (default: %(default)d)")
+    '--device',
+    type=str,
+    default='GPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
 parser.add_argument(
     "--max_length",
     type=int,
     default=250,
     help="The maximum length of sequence when doing generation. "
     "(default: %(default)d)")
+parser.add_argument(
+    '--with_test',
+    action='store_true',
+    help='If set, evaluate on the test set during training.')


 def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
@@ -281,7 +293,7 @@ def train():
             paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
         batch_size=args.batch_size)

-    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
     exe = Executor(place)
     exe.run(framework.default_startup_program())
@@ -307,14 +319,20 @@ def train():

         return total_loss / count

+    iters, num_samples, start_time = 0, 0, time.time()
     for pass_id in xrange(args.pass_num):
-        pass_start_time = time.time()
-        words_seen = 0
+        train_accs = []
+        train_losses = []
         for batch_id, data in enumerate(train_batch_generator()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
             src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
-            words_seen += word_num
+            num_samples += word_num
             trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
-            words_seen += word_num
+            num_samples += word_num
             lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)

             fetch_outs = exe.run(framework.default_main_program(),
@@ -325,24 +343,36 @@ def train():
                                  },
                                  fetch_list=[avg_cost])

-            avg_cost_val = np.array(fetch_outs[0])
-            print('pass_id=%d, batch_id=%d, train_loss: %f' %
-                  (pass_id, batch_id, avg_cost_val))
+            iters += 1
+            loss = np.array(fetch_outs[0])
+            print(
+                "Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss)
+            )
-        pass_end_time = time.time()
-        test_loss = do_validation()
-        time_consumed = pass_end_time - pass_start_time
-        words_per_sec = words_seen / time_consumed
-        print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
-              (pass_id, test_loss, words_per_sec, time_consumed))
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        # evaluation
+        if args.with_test:
+            test_loss = do_validation()
+        exit(0)


 def infer():
     pass


+def print_arguments(args):
+    print('----------- seq2seq Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
 if __name__ == '__main__':
     args = parser.parse_args()
+    print_arguments(args)
     if args.infer_only:
         infer()
     else:
diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
index 7f7afaeb114..43866da9cb1 100644
--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@@ -35,6 +35,12 @@ def parse_args():
     parser = argparse.ArgumentParser("mnist model benchmark.")
     parser.add_argument(
         '--batch_size', type=int, default=128, help='The minibatch size.')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The number of initial mini-batches to skip as warm-up, for a better performance test'
+    )
     parser.add_argument(
         '--iterations', type=int, default=35, help='The number of minibatches.')
     parser.add_argument(
@@ -53,19 +59,14 @@ def parse_args():
         '--use_nvprof',
         action='store_true',
         help='If set, use nvprof for CUDA.')
+    parser.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, evaluate on the test set during training.')
     args = parser.parse_args()
     return args


-def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
-    print('----------- Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
 def cnn_model(data):
     conv_pool_1 = fluid.nets.simple_img_conv_pool(
         input=data,
@@ -161,16 +162,22 @@ def run_benchmark(model, args):
         paddle.dataset.mnist.train(), batch_size=args.batch_size)

     accuracy = fluid.average.WeightedAverage()
+    iters, num_samples, start_time = 0, 0, time.time()
     for pass_id in range(args.pass_num):
         accuracy.reset()
-        pass_start = time.time()
+        train_accs = []
+        train_losses = []
         for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
             img_data = np.array(
                 map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
             y_data = np.array(map(lambda x: x[1], data)).astype("int64")
             y_data = y_data.reshape([len(y_data), 1])

-            start = time.time()
             outs = exe.run(
                 fluid.default_main_program(),
                 feed={"pixel": img_data,
                       "label": y_data},
                 fetch_list=[avg_cost, batch_acc, batch_size_tensor]
             )  # The accuracy is the accumulation of batches, but not the current batch.
             accuracy.add(value=outs[1], weight=outs[2])
-            end = time.time()
+            iters += 1
+            num_samples += len(y_data)
             loss = np.array(outs[0])
             acc = np.array(outs[1])
-            print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
-                  (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
+            train_losses.append(loss)
+            train_accs.append(acc)
+            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
+                  (pass_id, iters, loss, acc))
+
+        print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
+              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed

-        pass_end = time.time()
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        # evaluation
+        if args.with_test:
+            test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
+                                     inference_program)
+        exit(0)

-        train_avg_acc = accuracy.eval()
-        test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
-                                 inference_program)
-        print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" %
-              (pass_id, train_avg_acc, test_avg_acc,
-               (pass_end - pass_start) / 1000))
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- mnist Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')


 if __name__ == '__main__':
diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py
index f0f1db979fa..1af5eaf6b46 100644
--- a/benchmark/fluid/resnet.py
+++ b/benchmark/fluid/resnet.py
@@ -87,15 +87,6 @@ def parse_args():
     return args


-def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
-    print('----------- Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
 def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
     conv1 = fluid.layers.conv2d(
         input=input,
@@ -279,32 +270,31 @@ def run_benchmark(model, args):
                              'label': label},
                      fetch_list=[avg_cost, batch_acc, batch_size_tensor])
             iters += 1
-            num_samples += label[0]
+            num_samples += len(label)
             accuracy.add(value=acc, weight=weight)
             train_losses.append(loss)
             train_accs.append(acc)
             print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
                   (pass_id, iters, loss, acc))

-        pass_train_acc = accuracy.eval()
-        # evaluation
-        if args.with_test:
-            pass_test_acc = test(exe)
-        train_elapsed = time.time() - start_time
         print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
               (pass_id, np.mean(train_losses), np.mean(train_accs)))
-
+        train_elapsed = time.time() - start_time
         examples_per_sec = num_samples / train_elapsed
-
-        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % (num_samples, train_elapsed, examples_per_sec))
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        # evaluation
+        if args.with_test:
+            pass_test_acc = test(exe)
+        exit(0)

-        if args.use_cprof:
-            pr.disable()
-            s = StringIO.StringIO()
-            sortby = 'cumulative'
-            ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
-            ps.print_stats()
-            print(s.getvalue())

+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- resnet Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')


 if __name__ == '__main__':
diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh
index 663e2efd539..f6dfd20bf2e 100644
--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
@@ -1,7 +1,9 @@
 #!/bin/bash
 # This script benchmarking the PaddlePaddle Fluid on
 # single thread single GPU.
-export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib
+
+#export FLAGS_fraction_of_gpu_memory_to_use=0.0
+export CUDNN_PATH=/paddle/cudnn_v5

 # disable openmp and mkl parallel
 #https://github.com/PaddlePaddle/Paddle/issues/7199
@@ -25,25 +27,79 @@ export CUDA_VISIBLE_DEVICES=0
 export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH

+# query only the GPU in use
+nohup stdbuf -oL nvidia-smi \
+      --id=${CUDA_VISIBLE_DEVICES} \
+      --query-gpu=timestamp \
+      --query-compute-apps=pid,process_name,used_memory \
+      --format=csv \
+      --filename=mem.log \
+      -l 1 &
+
+# mnist
+# mnist gpu mnist 128
+FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+   --device=GPU \
+   --batch_size=128 \
+   --skip_batch_num=5 \
+   --iterations=500 \
+   2>&1 | tee -a mnist_gpu_128.log

 # vgg16
-# cifar10 gpu cifar10 128
-FLAGS_benchmark=true python fluid/vgg.py \
+# vgg16 gpu cifar10 128
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg.py \
    --device=GPU \
    --batch_size=128 \
    --skip_batch_num=5 \
-   --iterations=30 \
-   2>&1 > vgg16_gpu_128.log
+   --iterations=30 \
+   2>&1 | tee -a vgg16_gpu_128.log
+
+# vgg16 gpu flowers 32
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg.py \
+   --device=GPU \
+   --batch_size=32 \
+   --data_set=flowers \
+   --skip_batch_num=5 \
+   --iterations=30 \
+   2>&1 | tee -a vgg16_gpu_flowers_32.log

 # resnet50
 # resnet50 gpu cifar10 128
-FLAGS_benchmark=true python fluid/resnet.py \
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet.py \
    --device=GPU \
    --batch_size=128 \
    --data_set=cifar10 \
    --model=resnet_cifar10 \
    --skip_batch_num=5 \
    --iterations=30 \
-   2>&1 > resnet50_gpu_128.log
+   2>&1 | tee -a resnet50_gpu_128.log
+
+# resnet50 gpu flowers 64
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet.py \
+   --device=GPU \
+   --batch_size=64 \
+   --data_set=flowers \
+   --model=resnet_imagenet \
+   --skip_batch_num=5 \
+   --iterations=30 \
+   2>&1 | tee -a resnet50_gpu_flowers_64.log

 # lstm
+# lstm gpu imdb 32 # tensorflow only supports batch=32
+FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+   --device=GPU \
+   --batch_size=32 \
+   --skip_batch_num=5 \
+   --iterations=30 \
+   --hidden_dim=512 \
+   --emb_dim=512 \
+   --crop_size=1500 \
+   2>&1 | tee -a lstm_gpu_32.log
+
+# seq2seq
+# seq2seq gpu wmt14 128
+FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+   --device=GPU \
+   --batch_size=128 \
+   --skip_batch_num=5 \
+   --iterations=30 \
+   2>&1 | tee -a seq2seq_gpu_128.log
diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/stacked_dynamic_lstm.py
index 4e063549e02..5fcbdd64af9 100644
--- a/benchmark/fluid/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/stacked_dynamic_lstm.py
@@ -37,6 +37,14 @@ def parse_args():
         type=int,
         default=32,
         help='The sequence number of a batch data. (default: %(default)d)')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The number of initial mini-batches to skip as warm-up, for a better performance test'
+    )
+    parser.add_argument(
+        '--iterations', type=int, default=80, help='The number of minibatches.')
     parser.add_argument(
         '--emb_dim',
         type=int,
@@ -64,6 +72,10 @@ def parse_args():
         default=int(os.environ.get('CROP_SIZE', '1500')),
         help='The max sentence length of input. Since this model use plain RNN,'
         ' Gradient could be explored if sentence is too long')
+    parser.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, evaluate on the test set during training.')
     args = parser.parse_args()
     return args

@@ -157,37 +169,43 @@ def main():
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())

-    def train_loop(pass_num, crop_size):
-        with profiler.profiler(args.device, 'total') as prof:
-            for pass_id in range(pass_num):
-                train_reader = batch(
-                    paddle.reader.shuffle(
-                        crop_sentence(imdb.train(word_dict), crop_size),
-                        buf_size=25000),
-                    batch_size=args.batch_size)
-                word_nums = 0
-                pass_start_time = time.time()
-                for batch_id, data in enumerate(train_reader()):
-                    tensor_words = to_lodtensor([x[0] for x in data], place)
-                    for x in data:
-                        word_nums += len(x[0])
-                    label = numpy.array([x[1] for x in data]).astype("int64")
-                    label = label.reshape((-1, 1))
-                    loss_np, acc, weight = exe.run(
-                        fluid.default_main_program(),
-                        feed={"words": tensor_words,
-                              "label": label},
-                        fetch_list=[loss, batch_acc, batch_size_tensor])
-                    print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" %
-                          (pass_id, batch_id, loss_np, acc))
-
-                pass_end_time = time.time()
-                time_consumed = pass_end_time - pass_start_time
-                words_per_sec = word_nums / time_consumed
-                print("pass_id=%d, sec/pass: %f, words/s: %f" %
-                      (pass_id, time_consumed, words_per_sec))
-
-    train_loop(args.pass_num, args.crop_size)
+    train_reader = batch(
+        paddle.reader.shuffle(
+            crop_sentence(imdb.train(word_dict), args.crop_size),
+            buf_size=25000),
+        batch_size=args.batch_size)
+
+    iters, num_samples, start_time = 0, 0, time.time()
+    for pass_id in range(args.pass_num):
+        train_accs = []
+        train_losses = []
+        for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
+            tensor_words = to_lodtensor([x[0] for x in data], place)
+            label = numpy.array([x[1] for x in data]).astype("int64")
+            label = label.reshape((-1, 1))
+            loss_np, acc, weight = exe.run(
+                fluid.default_main_program(),
+                feed={"words": tensor_words,
+                      "label": label},
+                fetch_list=[loss, batch_acc, batch_size_tensor])
+            iters += 1
+            for x in data:
+                num_samples += len(x[0])
+            print(
+                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                (pass_id, iters, loss_np, acc)
+            )  # The accuracy is accumulated across batches, not just the current batch.
+
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        exit(0)


 def to_lodtensor(data, place):
@@ -205,5 +223,14 @@ def to_lodtensor(data, place):
     return res


+def print_arguments(args):
+    print('----------- lstm Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
 if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
     main()
diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py
index 3bf78e4cf08..9d990eff62e 100644
--- a/benchmark/fluid/vgg.py
+++ b/benchmark/fluid/vgg.py
@@ -191,25 +191,29 @@ def main():
                 fetch_list=[avg_cost, batch_acc, batch_size_tensor])
             accuracy.add(value=acc, weight=weight)
             iters += 1
-            num_samples += len(data)
+            num_samples += len(y_data)
             print(
                 "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                 (pass_id, iters, loss, acc)
             )  # The accuracy is the accumulation of batches, but not the current batch.

-        pass_train_acc = accuracy.eval()
+        # pass_train_acc = accuracy.eval()
         train_losses.append(loss)
         train_accs.append(acc)
+        print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
+              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+              (num_samples, train_elapsed, examples_per_sec))
         # evaluation
         if args.with_test:
             pass_test_acc = test(exe)
-        train_elapsed = time.time() - start_time
-        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
-              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+        exit(0)


 def print_arguments():
-    print('----------- Configuration Arguments -----------')
+    print('----------- vgg Configuration Arguments -----------')
     for arg, value in sorted(vars(args).iteritems()):
         print('%s: %s' % (arg, value))
     print('------------------------------------------------')
--
GitLab
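Every script touched by this patch applies the same measurement discipline: skip the first --skip_batch_num mini-batches so warm-up (JIT, allocator, cache effects) is excluded from the timer, run a fixed --iterations mini-batches so runs are comparable across models, and report throughput in examples per second. The standalone sketch below distills that pattern; benchmark(), fake_run_batch(), and the dummy reader are hypothetical stand-ins for the scripts' exe.run(...) calls and paddle readers, shown only to illustrate the timing logic.

import time


def benchmark(reader, run_batch, skip_batch_num=5, iterations=80):
    # Mirrors the loop added to each benchmark script: the first
    # `skip_batch_num` batches warm the system up and are excluded from
    # timing; the run stops after a fixed `iterations` count.
    iters, num_samples, start_time = 0, 0, time.time()
    for data in reader:
        if iters == skip_batch_num:  # warm-up done: restart clock and counter
            start_time = time.time()
            num_samples = 0
        if iters == iterations:      # fixed-length run for fair comparison
            break
        num_samples += run_batch(data)
        iters += 1
    train_elapsed = time.time() - start_time
    examples_per_sec = num_samples / train_elapsed
    print('Total examples: %d, total time: %.5f, %.5f examples/sec' %
          (num_samples, train_elapsed, examples_per_sec))


def fake_run_batch(data):
    # Hypothetical stand-in for exe.run(...): pretend to train on one batch
    # and report how many examples it consumed.
    time.sleep(0.001)
    return len(data)


# Hypothetical usage: 100 batches of 128 dummy examples each; with the
# defaults above, batches 5..79 are timed (75 batches, 9600 examples).
benchmark(iter([[0] * 128 for _ in range(100)]), fake_run_batch)

Keeping the timed region identical across the six scripts is the design point here: since each model previously reported a differently shaped per-pass metric (words/s, sec/pass, elapse), the shared skip-then-time loop is what makes the resulting examples/sec figures directly comparable.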