diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/machine_translation.py index cc31d098328bc237c018ebf8f158bdab5c37bff1..d7a421c10979c3b9d6865a8c0b99a6410e0f46a8 100644 --- a/benchmark/fluid/machine_translation.py +++ b/benchmark/fluid/machine_translation.py @@ -48,6 +48,13 @@ parser.add_argument( type=int, default=16, help="The sequence number of a mini-batch data. (default: %(default)d)") +parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test') +parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') parser.add_argument( "--dict_size", type=int, @@ -72,16 +79,21 @@ parser.add_argument( default=3, help="The width for beam searching. (default: %(default)d)") parser.add_argument( - "--use_gpu", - type=distutils.util.strtobool, - default=True, - help="Whether to use gpu. (default: %(default)d)") + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help="The device type.") parser.add_argument( "--max_length", type=int, default=250, help="The maximum length of sequence when doing generation. " "(default: %(default)d)") +parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): @@ -281,7 +293,7 @@ def train(): paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), batch_size=args.batch_size) - place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace() + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) exe = Executor(place) exe.run(framework.default_startup_program()) @@ -307,14 +319,20 @@ def train(): return total_loss / count + iters, num_samples, start_time = 0, 0, time.time() for pass_id in xrange(args.pass_num): - pass_start_time = time.time() - words_seen = 0 + train_accs = [] + train_losses = [] for batch_id, data in enumerate(train_batch_generator()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place) - words_seen += word_num + num_samples += word_num trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place) - words_seen += word_num + num_samples += word_num lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place) fetch_outs = exe.run(framework.default_main_program(), @@ -325,24 +343,36 @@ def train(): }, fetch_list=[avg_cost]) - avg_cost_val = np.array(fetch_outs[0]) - print('pass_id=%d, batch_id=%d, train_loss: %f' % - (pass_id, batch_id, avg_cost_val)) + iters += 1 + loss = np.array(fetch_outs[0]) + print( + "Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss) + ) # The accuracy is the accumulation of batches, but not the current batch. - pass_end_time = time.time() - test_loss = do_validation() - time_consumed = pass_end_time - pass_start_time - words_per_sec = words_seen / time_consumed - print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" % - (pass_id, test_loss, words_per_sec, time_consumed)) + train_elapsed = time.time() - start_time + examples_per_sec = num_samples / train_elapsed + print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % + (num_samples, train_elapsed, examples_per_sec)) + # evaluation + if args.with_test: + test_loss = do_validation() + exit(0) def infer(): pass +def print_arguments(args): + print('----------- seq2seq Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + if __name__ == '__main__': args = parser.parse_args() + print_arguments(args) if args.infer_only: infer() else: diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py index 7f7afaeb11447d936b65a1d83701b0176ecbc111..43866da9cb113e9d49fc1c51f67da94cbc6bfd8e 100644 --- a/benchmark/fluid/mnist.py +++ b/benchmark/fluid/mnist.py @@ -35,6 +35,12 @@ def parse_args(): parser = argparse.ArgumentParser("mnist model benchmark.") parser.add_argument( '--batch_size', type=int, default=128, help='The minibatch size.') + parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) parser.add_argument( '--iterations', type=int, default=35, help='The number of minibatches.') parser.add_argument( @@ -53,19 +59,14 @@ def parse_args(): '--use_nvprof', action='store_true', help='If set, use nvprof for CUDA.') + parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') args = parser.parse_args() return args -def print_arguments(args): - vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and - vars(args)['device'] == 'GPU') - print('----------- Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - def cnn_model(data): conv_pool_1 = fluid.nets.simple_img_conv_pool( input=data, @@ -161,16 +162,22 @@ def run_benchmark(model, args): paddle.dataset.mnist.train(), batch_size=args.batch_size) accuracy = fluid.average.WeightedAverage() + iters, num_samples, start_time = 0, 0, time.time() for pass_id in range(args.pass_num): accuracy.reset() - pass_start = time.time() + train_accs = [] + train_losses = [] for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break img_data = np.array( map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE) y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = y_data.reshape([len(y_data), 1]) - start = time.time() outs = exe.run( fluid.default_main_program(), feed={"pixel": img_data, @@ -178,21 +185,36 @@ def run_benchmark(model, args): fetch_list=[avg_cost, batch_acc, batch_size_tensor] ) # The accuracy is the accumulation of batches, but not the current batch. accuracy.add(value=outs[1], weight=outs[2]) - end = time.time() + iters += 1 + num_samples += len(y_data) loss = np.array(outs[0]) acc = np.array(outs[1]) - print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" % - (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000)) + train_losses.append(loss) + train_accs.append(acc) + print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" % + (pass_id, iters, loss, acc)) + + print("Pass: %d, Loss: %f, Train Accuray: %f\n" % + (pass_id, np.mean(train_losses), np.mean(train_accs))) + train_elapsed = time.time() - start_time + examples_per_sec = num_samples / train_elapsed - pass_end = time.time() + print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % + (num_samples, train_elapsed, examples_per_sec)) + # evaluation + if args.with_test: + test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor, + inference_program) + exit(0) - train_avg_acc = accuracy.eval() - test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor, - inference_program) - print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" % - (pass_id, train_avg_acc, test_avg_acc, - (pass_end - pass_start) / 1000)) +def print_arguments(args): + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') + print('----------- mnist Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') if __name__ == '__main__': diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py index f0f1db979fa7fb640679beacafd66dfbe1f62ab8..1af5eaf6b46be47cb6b778cedcf53830c201ef39 100644 --- a/benchmark/fluid/resnet.py +++ b/benchmark/fluid/resnet.py @@ -87,15 +87,6 @@ def parse_args(): return args -def print_arguments(args): - vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and - vars(args)['device'] == 'GPU') - print('----------- Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): conv1 = fluid.layers.conv2d( input=input, @@ -279,32 +270,31 @@ def run_benchmark(model, args): 'label': label}, fetch_list=[avg_cost, batch_acc, batch_size_tensor]) iters += 1 - num_samples += label[0] + num_samples += len(label) accuracy.add(value=acc, weight=weight) train_losses.append(loss) train_accs.append(acc) print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" % (pass_id, iters, loss, acc)) - pass_train_acc = accuracy.eval() - # evaluation - if args.with_test: - pass_test_acc = test(exe) - train_elapsed = time.time() - start_time print("Pass: %d, Loss: %f, Train Accuray: %f\n" % (pass_id, np.mean(train_losses), np.mean(train_accs))) - + train_elapsed = time.time() - start_time examples_per_sec = num_samples / train_elapsed - print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % (num_samples, train_elapsed, examples_per_sec)) + # evaluation + if args.with_test: + pass_test_acc = test(exe) + exit(0) - if args.use_cprof: - pr.disable() - s = StringIO.StringIO() - sortby = 'cumulative' - ps = pstats.Stats(pr, stream=s).sort_stats(sortby) - ps.print_stats() - print(s.getvalue()) + +def print_arguments(args): + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') + print('----------- resnet Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') if __name__ == '__main__': diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh index 663e2efd5392a6cd1a71f51fa0d017070b489341..f6dfd20bf2ee0b668b6d4238d4511253b2233035 100644 --- a/benchmark/fluid/run.sh +++ b/benchmark/fluid/run.sh @@ -1,7 +1,9 @@ #!/bin/bash # This script benchmarking the PaddlePaddle Fluid on # single thread single GPU. -export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib + +#export FLAGS_fraction_of_gpu_memory_to_use=0.0 +export CUDNN_PATH=/paddle/cudnn_v5 # disable openmp and mkl parallel #https://github.com/PaddlePaddle/Paddle/issues/7199 @@ -25,25 +27,79 @@ export CUDA_VISIBLE_DEVICES=0 export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH +# only query the gpu used +nohup stdbuf -oL nvidia-smi \ + --id=${CUDA_VISIBLE_DEVICES} \ + --query-gpu=timestamp \ + --query-compute-apps=pid,process_name,used_memory \ + --format=csv \ + --filename=mem.log \ + -l 1 & +# mnist +# mnist gpu mnist 128 +FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \ + --device=GPU \ + --batch_size=128 \ + --skip_batch_num=5 \ + --iterations=500 \ + 2>&1 | tee -a mnist_gpu_128.log # vgg16 -# cifar10 gpu cifar10 128 -FLAGS_benchmark=true python fluid/vgg.py \ +# gpu cifar10 128 +FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \ --device=GPU \ --batch_size=128 \ --skip_batch_num=5 \ - --iterations=30 \ - 2>&1 > vgg16_gpu_128.log + --iterations=30 \ + 2>&1 | tee -a vgg16_gpu_128.log + +# flowers gpu 128 +FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \ + --device=GPU \ + --batch_size=32 \ + --data_set=flowers \ + --skip_batch_num=5 \ + --iterations=30 \ + 2>&1 | tee -a vgg16_gpu_flowers_32.log # resnet50 # resnet50 gpu cifar10 128 -FLAGS_benchmark=true python fluid/resnet.py \ +FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \ --device=GPU \ --batch_size=128 \ --data_set=cifar10 \ --model=resnet_cifar10 \ --skip_batch_num=5 \ --iterations=30 \ - 2>&1 > resnet50_gpu_128.log + 2>&1 | tee -a resnet50_gpu_128.log + +# resnet50 gpu flowers 64 +FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \ + --device=GPU \ + --batch_size=64 \ + --data_set=flowers \ + --model=resnet_imagenet \ + --skip_batch_num=5 \ + --iterations=30 \ + 2>&1 | tee -a resnet50_gpu_flowers_64.log # lstm +# lstm gpu imdb 32 # tensorflow only support batch=32 +FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \ + --device=GPU \ + --batch_size=32 \ + --skip_batch_num=5 \ + --iterations=30 \ + --hidden_dim=512 \ + --emb_dim=512 \ + --crop_size=1500 \ + 2>&1 | tee -a lstm_gpu_32.log + +# seq2seq +# seq2seq gpu wmb 128 +FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \ + --device=GPU \ + --batch_size=128 \ + --skip_batch_num=5 \ + --iterations=30 \ + 2>&1 | tee -a lstm_gpu_128.log diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/stacked_dynamic_lstm.py index 4e063549e0239abf9d946ed8735f0306203509d0..5fcbdd64af9dc196c9d5b2b82ce4213478ea1418 100644 --- a/benchmark/fluid/stacked_dynamic_lstm.py +++ b/benchmark/fluid/stacked_dynamic_lstm.py @@ -37,6 +37,14 @@ def parse_args(): type=int, default=32, help='The sequence number of a batch data. (default: %(default)d)') + parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) + parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') parser.add_argument( '--emb_dim', type=int, @@ -64,6 +72,10 @@ def parse_args(): default=int(os.environ.get('CROP_SIZE', '1500')), help='The max sentence length of input. Since this model use plain RNN,' ' Gradient could be explored if sentence is too long') + parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') args = parser.parse_args() return args @@ -157,37 +169,43 @@ def main(): exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - def train_loop(pass_num, crop_size): - with profiler.profiler(args.device, 'total') as prof: - for pass_id in range(pass_num): - train_reader = batch( - paddle.reader.shuffle( - crop_sentence(imdb.train(word_dict), crop_size), - buf_size=25000), - batch_size=args.batch_size) - word_nums = 0 - pass_start_time = time.time() - for batch_id, data in enumerate(train_reader()): - tensor_words = to_lodtensor([x[0] for x in data], place) - for x in data: - word_nums += len(x[0]) - label = numpy.array([x[1] for x in data]).astype("int64") - label = label.reshape((-1, 1)) - loss_np, acc, weight = exe.run( - fluid.default_main_program(), - feed={"words": tensor_words, - "label": label}, - fetch_list=[loss, batch_acc, batch_size_tensor]) - print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" % - (pass_id, batch_id, loss_np, acc)) - - pass_end_time = time.time() - time_consumed = pass_end_time - pass_start_time - words_per_sec = word_nums / time_consumed - print("pass_id=%d, sec/pass: %f, words/s: %f" % - (pass_id, time_consumed, words_per_sec)) - - train_loop(args.pass_num, args.crop_size) + train_reader = batch( + paddle.reader.shuffle( + crop_sentence(imdb.train(word_dict), args.crop_size), + buf_size=25000), + batch_size=args.batch_size) + + iters, num_samples, start_time = 0, 0, time.time() + for pass_id in range(args.pass_num): + train_accs = [] + train_losses = [] + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + tensor_words = to_lodtensor([x[0] for x in data], place) + label = numpy.array([x[1] for x in data]).astype("int64") + label = label.reshape((-1, 1)) + loss_np, acc, weight = exe.run( + fluid.default_main_program(), + feed={"words": tensor_words, + "label": label}, + fetch_list=[loss, batch_acc, batch_size_tensor]) + iters += 1 + for x in data: + num_samples += len(x[0]) + print( + "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" % + (pass_id, iters, loss_np, acc) + ) # The accuracy is the accumulation of batches, but not the current batch. + + train_elapsed = time.time() - start_time + examples_per_sec = num_samples / train_elapsed + print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % + (num_samples, train_elapsed, examples_per_sec)) + exit(0) def to_lodtensor(data, place): @@ -205,5 +223,14 @@ def to_lodtensor(data, place): return res +def print_arguments(args): + print('----------- lstm Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + if __name__ == '__main__': + args = parse_args() + print_arguments(args) main() diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py index 3bf78e4cf08d43127a05c740fa30ca6d2bc416b0..9d990eff62ec368dc7033f55cc0862fa974a64e0 100644 --- a/benchmark/fluid/vgg.py +++ b/benchmark/fluid/vgg.py @@ -191,25 +191,29 @@ def main(): fetch_list=[avg_cost, batch_acc, batch_size_tensor]) accuracy.add(value=acc, weight=weight) iters += 1 - num_samples += len(data) + num_samples += len(y_data) print( "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" % (pass_id, iters, loss, acc) ) # The accuracy is the accumulation of batches, but not the current batch. - pass_train_acc = accuracy.eval() + # pass_train_acc = accuracy.eval() train_losses.append(loss) train_accs.append(acc) + print("Pass: %d, Loss: %f, Train Accuray: %f\n" % + (pass_id, np.mean(train_losses), np.mean(train_accs))) + train_elapsed = time.time() - start_time + examples_per_sec = num_samples / train_elapsed + print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % + (num_samples, train_elapsed, examples_per_sec)) # evaluation if args.with_test: pass_test_acc = test(exe) - train_elapsed = time.time() - start_time - print("Pass: %d, Loss: %f, Train Accuray: %f\n" % - (pass_id, np.mean(train_losses), np.mean(train_accs))) + exit(0) def print_arguments(): - print('----------- Configuration Arguments -----------') + print('----------- vgg Configuration Arguments -----------') for arg, value in sorted(vars(args).iteritems()): print('%s: %s' % (arg, value)) print('------------------------------------------------') diff --git a/benchmark/tensorflow/machine_translation.py b/benchmark/tensorflow/machine_translation.py new file mode 100644 index 0000000000000000000000000000000000000000..8f77dce98353af53803246be8dc61063836b7867 --- /dev/null +++ b/benchmark/tensorflow/machine_translation.py @@ -0,0 +1,626 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from tensorflow.python.framework import dtypes +from tensorflow.python.layers.core import Dense +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.framework import ops +from tensorflow.python.ops import rnn_cell_impl +from tensorflow.python.ops.rnn_cell_impl import RNNCell, BasicLSTMCell +from tensorflow.python.ops.rnn_cell_impl import LSTMStateTuple +from tensorflow.contrib.rnn.python.ops import core_rnn_cell +from tensorflow.python.ops import array_ops +from tensorflow.python.util import nest +import tensorflow.contrib.seq2seq as seq2seq +from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder +import numpy as np +import os +import argparse +import time + +import paddle.v2 as paddle + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--embedding_dim", + type=int, + default=512, + help="The dimension of embedding table. (default: %(default)d)") +parser.add_argument( + "--encoder_size", + type=int, + default=512, + help="The size of encoder bi-rnn unit. (default: %(default)d)") +parser.add_argument( + "--decoder_size", + type=int, + default=512, + help="The size of decoder rnn unit. (default: %(default)d)") +parser.add_argument( + "--batch_size", + type=int, + default=128, + help="The sequence number of a mini-batch data. (default: %(default)d)") +parser.add_argument( + "--dict_size", + type=int, + default=30000, + help="The dictionary capacity. Dictionaries of source sequence and " + "target dictionary have same capacity. (default: %(default)d)") +parser.add_argument( + "--max_time_steps", + type=int, + default=81, + help="Max number of time steps for sequence. (default: %(default)d)") +parser.add_argument( + "--pass_num", + type=int, + default=10, + help="The pass number to train. (default: %(default)d)") +parser.add_argument( + "--learning_rate", + type=float, + default=0.0002, + help="Learning rate used to train the model. (default: %(default)f)") +parser.add_argument( + "--infer_only", action='store_true', help="If set, run forward only.") +parser.add_argument( + "--beam_size", + type=int, + default=3, + help="The width for beam searching. (default: %(default)d)") +parser.add_argument( + "--max_generation_length", + type=int, + default=250, + help="The maximum length of sequence when doing generation. " + "(default: %(default)d)") +parser.add_argument( + "--save_freq", + type=int, + default=500, + help="Save model checkpoint every this interation. (default: %(default)d)") +parser.add_argument( + "--model_dir", + type=str, + default='./checkpoint', + help="Path to save model checkpoints. (default: %(default)d)") + +_Linear = core_rnn_cell._Linear # pylint: disable=invalid-name + +START_TOKEN_IDX = 0 +END_TOKEN_IDX = 1 + + +class LSTMCellWithSimpleAttention(RNNCell): + """Add attention mechanism to BasicLSTMCell. + This class is a wrapper based on tensorflow's `BasicLSTMCell`. + """ + + def __init__(self, + num_units, + encoder_vector, + encoder_proj, + source_sequence_length, + forget_bias=1.0, + state_is_tuple=True, + activation=None, + reuse=None): + super(LSTMCellWithSimpleAttention, self).__init__(_reuse=reuse) + if not state_is_tuple: + logging.warn("%s: Using a concatenated state is slower and will " + "soon be deprecated. Use state_is_tuple=True.", self) + self._num_units = num_units + # set padding part to 0 + self._encoder_vector = self._reset_padding(encoder_vector, + source_sequence_length) + self._encoder_proj = self._reset_padding(encoder_proj, + source_sequence_length) + self._forget_bias = forget_bias + self._state_is_tuple = state_is_tuple + self._activation = activation or math_ops.tanh + self._linear = None + + @property + def state_size(self): + return (LSTMStateTuple(self._num_units, self._num_units) \ + if self._state_is_tuple else 2 * self._num_units) + + @property + def output_size(self): + return self._num_units + + def zero_state(self, batch_size, dtype): + state_size = self.state_size + if hasattr(self, "_last_zero_state"): + (last_state_size, last_batch_size, last_dtype, + last_output) = getattr(self, "_last_zero_state") + if (last_batch_size == batch_size and last_dtype == dtype and + last_state_size == state_size): + return last_output + with ops.name_scope( + type(self).__name__ + "ZeroState", values=[batch_size]): + output = _zero_state_tensors(state_size, batch_size, dtype) + self._last_zero_state = (state_size, batch_size, dtype, output) + return output + + def call(self, inputs, state): + sigmoid = math_ops.sigmoid + # Parameters of gates are concatenated into one multiply for efficiency. + if self._state_is_tuple: + c, h = state + else: + c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1) + + # get context from encoder outputs + context = self._simple_attention(self._encoder_vector, + self._encoder_proj, h) + + if self._linear is None: + self._linear = _Linear([inputs, context, h], 4 * self._num_units, + True) + # i = input_gate, j = new_input, f = forget_gate, o = output_gate + i, j, f, o = array_ops.split( + value=self._linear([inputs, context, h]), + num_or_size_splits=4, + axis=1) + + new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) * + self._activation(j)) + new_h = self._activation(new_c) * sigmoid(o) + + if self._state_is_tuple: + new_state = LSTMStateTuple(new_c, new_h) + else: + new_state = array_ops.concat([new_c, new_h], 1) + return new_h, new_state + + def _simple_attention(self, encoder_vec, encoder_proj, decoder_state): + """Implement the attention function. + The implementation has the same logic to the fluid decoder. + """ + decoder_state_proj = tf.contrib.layers.fully_connected( + inputs=decoder_state, + num_outputs=self._num_units, + activation_fn=None, + biases_initializer=None) + decoder_state_expand = tf.tile( + tf.expand_dims( + input=decoder_state_proj, axis=1), + [1, tf.shape(encoder_proj)[1], 1]) + concated = tf.concat([decoder_state_expand, encoder_proj], axis=2) + # need reduce the first dimension + attention_weights = tf.contrib.layers.fully_connected( + inputs=tf.reshape( + concated, shape=[-1, self._num_units * 2]), + num_outputs=1, + activation_fn=tf.nn.tanh, + biases_initializer=None) + attention_weights_reshaped = tf.reshape( + attention_weights, shape=[tf.shape(encoder_vec)[0], -1, 1]) + # normalize the attention weights using softmax + attention_weights_normed = tf.nn.softmax( + attention_weights_reshaped, dim=1) + scaled = tf.multiply(attention_weights_normed, encoder_vec) + context = tf.reduce_sum(scaled, axis=1) + return context + + def _reset_padding(self, + memory, + memory_sequence_length, + check_inner_dims_defined=True): + """Reset the padding part for encoder inputs. + This funtion comes from tensorflow's `_prepare_memory` function. + """ + memory = nest.map_structure( + lambda m: ops.convert_to_tensor(m, name="memory"), memory) + if memory_sequence_length is not None: + memory_sequence_length = ops.convert_to_tensor( + memory_sequence_length, name="memory_sequence_length") + if check_inner_dims_defined: + + def _check_dims(m): + if not m.get_shape()[2:].is_fully_defined(): + raise ValueError( + "Expected memory %s to have fully defined inner dims, " + "but saw shape: %s" % (m.name, m.get_shape())) + + nest.map_structure(_check_dims, memory) + if memory_sequence_length is None: + seq_len_mask = None + else: + seq_len_mask = array_ops.sequence_mask( + memory_sequence_length, + maxlen=array_ops.shape(nest.flatten(memory)[0])[1], + dtype=nest.flatten(memory)[0].dtype) + seq_len_batch_size = (memory_sequence_length.shape[0].value or + array_ops.shape(memory_sequence_length)[0]) + + def _maybe_mask(m, seq_len_mask): + rank = m.get_shape().ndims + rank = rank if rank is not None else array_ops.rank(m) + extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32) + m_batch_size = m.shape[0].value or array_ops.shape(m)[0] + if memory_sequence_length is not None: + message = ("memory_sequence_length and memory tensor " + "batch sizes do not match.") + with ops.control_dependencies([ + check_ops.assert_equal( + seq_len_batch_size, m_batch_size, message=message) + ]): + seq_len_mask = array_ops.reshape( + seq_len_mask, + array_ops.concat( + (array_ops.shape(seq_len_mask), extra_ones), 0)) + return m * seq_len_mask + else: + return m + + return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask), + memory) + + +def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, + target_dict_dim, is_generating, beam_size, + max_generation_length): + src_word_idx = tf.placeholder(tf.int32, shape=[None, None]) + src_sequence_length = tf.placeholder(tf.int32, shape=[None, ]) + + src_embedding_weights = tf.get_variable("source_word_embeddings", + [source_dict_dim, embedding_dim]) + src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx) + + src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size) + src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size) + # no peephole + encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn( + cell_fw=src_forward_cell, + cell_bw=src_reversed_cell, + inputs=src_embedding, + sequence_length=src_sequence_length, + dtype=tf.float32) + + # concat the forward outputs and backward outputs + encoded_vec = tf.concat(encoder_outputs, axis=2) + + # project the encoder outputs to size of decoder lstm + encoded_proj = tf.contrib.layers.fully_connected( + inputs=tf.reshape( + encoded_vec, shape=[-1, embedding_dim * 2]), + num_outputs=decoder_size, + activation_fn=None, + biases_initializer=None) + encoded_proj_reshape = tf.reshape( + encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size]) + + # get init state for decoder lstm's H + backword_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1]) + decoder_boot = tf.contrib.layers.fully_connected( + inputs=tf.reshape( + backword_first, shape=[-1, embedding_dim]), + num_outputs=decoder_size, + activation_fn=tf.nn.tanh, + biases_initializer=None) + + # prepare the initial state for decoder lstm + cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32) + initial_state = LSTMStateTuple(cell_init, decoder_boot) + + # create decoder lstm cell + decoder_cell = LSTMCellWithSimpleAttention( + decoder_size, + encoded_vec + if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size), + encoded_proj_reshape if not is_generating else + seq2seq.tile_batch(encoded_proj_reshape, beam_size), + src_sequence_length if not is_generating else + seq2seq.tile_batch(src_sequence_length, beam_size), + forget_bias=0.0) + + output_layer = Dense(target_dict_dim, name='output_projection') + + if not is_generating: + trg_word_idx = tf.placeholder(tf.int32, shape=[None, None]) + trg_sequence_length = tf.placeholder(tf.int32, shape=[None, ]) + trg_embedding_weights = tf.get_variable( + "target_word_embeddings", [target_dict_dim, embedding_dim]) + trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights, + trg_word_idx) + + training_helper = seq2seq.TrainingHelper( + inputs=trg_embedding, + sequence_length=trg_sequence_length, + time_major=False, + name='training_helper') + + training_decoder = seq2seq.BasicDecoder( + cell=decoder_cell, + helper=training_helper, + initial_state=initial_state, + output_layer=output_layer) + + # get the max length of target sequence + max_decoder_length = tf.reduce_max(trg_sequence_length) + + decoder_outputs_train, _, _ = seq2seq.dynamic_decode( + decoder=training_decoder, + output_time_major=False, + impute_finished=True, + maximum_iterations=max_decoder_length) + + decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output) + decoder_pred_train = tf.argmax( + decoder_logits_train, axis=-1, name='decoder_pred_train') + masks = tf.sequence_mask( + lengths=trg_sequence_length, + maxlen=max_decoder_length, + dtype=tf.float32, + name='masks') + + # place holder of label sequence + lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None]) + + # compute the loss + loss = seq2seq.sequence_loss( + logits=decoder_logits_train, + targets=lbl_word_idx, + weights=masks, + average_across_timesteps=True, + average_across_batch=True) + + # return feeding list and loss operator + return { + 'src_word_idx': src_word_idx, + 'src_sequence_length': src_sequence_length, + 'trg_word_idx': trg_word_idx, + 'trg_sequence_length': trg_sequence_length, + 'lbl_word_idx': lbl_word_idx + }, loss + else: + start_tokens = tf.ones([tf.shape(src_word_idx)[0], ], + tf.int32) * START_TOKEN_IDX + # share the same embedding weights with target word + trg_embedding_weights = tf.get_variable( + "target_word_embeddings", [target_dict_dim, embedding_dim]) + + inference_decoder = beam_search_decoder.BeamSearchDecoder( + cell=decoder_cell, + embedding=lambda tokens: tf.nn.embedding_lookup(trg_embedding_weights, tokens), + start_tokens=start_tokens, + end_token=END_TOKEN_IDX, + initial_state=tf.nn.rnn_cell.LSTMStateTuple( + tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size), + tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)), + beam_width=beam_size, + output_layer=output_layer) + + decoder_outputs_decode, _, _ = seq2seq.dynamic_decode( + decoder=inference_decoder, + output_time_major=False, + #impute_finished=True,# error occurs + maximum_iterations=max_generation_length) + + predicted_ids = decoder_outputs_decode.predicted_ids + + return { + 'src_word_idx': src_word_idx, + 'src_sequence_length': src_sequence_length + }, predicted_ids + + +def print_arguments(args): + print('----------- Configuration Arguments -----------') + for arg, value in vars(args).iteritems(): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def padding_data(data, padding_size, value): + data = data + [value] * padding_size + return data[:padding_size] + + +def save(sess, path, var_list=None, global_step=None): + saver = tf.train.Saver(var_list) + save_path = saver.save(sess, save_path=path, global_step=global_step) + print('Model save at %s' % save_path) + + +def restore(sess, path, var_list=None): + # var_list = None returns the list of all saveable variables + saver = tf.train.Saver(var_list) + saver.restore(sess, save_path=path) + print('model restored from %s' % path) + + +def adapt_batch_data(data): + src_seq = map(lambda x: x[0], data) + trg_seq = map(lambda x: x[1], data) + lbl_seq = map(lambda x: x[2], data) + + src_sequence_length = np.array( + [len(seq) for seq in src_seq]).astype('int32') + src_seq_maxlen = np.max(src_sequence_length) + + trg_sequence_length = np.array( + [len(seq) for seq in trg_seq]).astype('int32') + trg_seq_maxlen = np.max(trg_sequence_length) + + src_seq = np.array( + [padding_data(seq, src_seq_maxlen, END_TOKEN_IDX) + for seq in src_seq]).astype('int32') + + trg_seq = np.array( + [padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX) + for seq in trg_seq]).astype('int32') + + lbl_seq = np.array( + [padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX) + for seq in lbl_seq]).astype('int32') + + return { + 'src_word_idx': src_seq, + 'src_sequence_length': src_sequence_length, + 'trg_word_idx': trg_seq, + 'trg_sequence_length': trg_sequence_length, + 'lbl_word_idx': lbl_seq + } + + +def train(): + feeding_dict, loss = seq_to_seq_net( + embedding_dim=args.embedding_dim, + encoder_size=args.encoder_size, + decoder_size=args.decoder_size, + source_dict_dim=args.dict_size, + target_dict_dim=args.dict_size, + is_generating=False, + beam_size=args.beam_size, + max_generation_length=args.max_generation_length) + + global_step = tf.Variable(0, trainable=False, name='global_step') + trainable_params = tf.trainable_variables() + optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate) + + gradients = tf.gradients(loss, trainable_params) + # may clip the parameters + clip_gradients, _ = tf.clip_by_global_norm(gradients, 1.0) + + updates = optimizer.apply_gradients( + zip(gradients, trainable_params), global_step=global_step) + + src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size) + + train_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + test_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + def do_validataion(): + total_loss = 0.0 + count = 0 + for batch_id, data in enumerate(test_batch_generator()): + adapted_batch_data = adapt_batch_data(data) + outputs = sess.run([loss], + feed_dict={ + item[1]: adapted_batch_data[item[0]] + for item in feeding_dict.items() + }) + total_loss += outputs[0] + count += 1 + return total_loss / count + + config = tf.ConfigProto( + intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) + config.gpu_options.allow_growth = True + + with tf.Session(config=config) as sess: + init_g = tf.global_variables_initializer() + init_l = tf.local_variables_initializer() + sess.run(init_l) + sess.run(init_g) + for pass_id in xrange(args.pass_num): + pass_start_time = time.time() + words_seen = 0 + for batch_id, data in enumerate(train_batch_generator()): + adapted_batch_data = adapt_batch_data(data) + words_seen += np.sum(adapted_batch_data['src_sequence_length']) + words_seen += np.sum(adapted_batch_data['trg_sequence_length']) + outputs = sess.run([updates, loss], + feed_dict={ + item[1]: adapted_batch_data[item[0]] + for item in feeding_dict.items() + }) + print("pass_id=%d, batch_id=%d, train_loss: %f" % + (pass_id, batch_id, outputs[1])) + pass_end_time = time.time() + test_loss = do_validataion() + time_consumed = pass_end_time - pass_start_time + words_per_sec = words_seen / time_consumed + print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" % + (pass_id, test_loss, words_per_sec, time_consumed)) + + +def infer(): + feeding_dict, predicted_ids = seq_to_seq_net( + embedding_dim=args.embedding_dim, + encoder_size=args.encoder_size, + decoder_size=args.decoder_size, + source_dict_dim=args.dict_size, + target_dict_dim=args.dict_size, + is_generating=True, + beam_size=args.beam_size, + max_generation_length=args.max_generation_length) + + src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size) + test_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(args.dict_size), buf_size=1000), + batch_size=args.batch_size) + + config = tf.ConfigProto( + intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) + with tf.Session(config=config) as sess: + restore(sess, './checkpoint/tf_seq2seq-1500') + for batch_id, data in enumerate(test_batch_generator()): + src_seq = map(lambda x: x[0], data) + + source_language_seq = [ + src_dict[item] for seq in src_seq for item in seq + ] + + src_sequence_length = np.array( + [len(seq) for seq in src_seq]).astype('int32') + src_seq_maxlen = np.max(src_sequence_length) + src_seq = np.array([ + padding_data(seq, src_seq_maxlen, END_TOKEN_IDX) + for seq in src_seq + ]).astype('int32') + + outputs = sess.run([predicted_ids], + feed_dict={ + feeding_dict['src_word_idx']: src_seq, + feeding_dict['src_sequence_length']: + src_sequence_length + }) + + print("\nDecoder result comparison: ") + source_language_seq = ' '.join(source_language_seq).lstrip( + '').rstrip('').strip() + inference_seq = '' + print(" --> source: " + source_language_seq) + for item in outputs[0][0]: + if item[0] == END_TOKEN_IDX: break + inference_seq += ' ' + trg_dict.get(item[0], '') + print(" --> inference: " + inference_seq) + + +if __name__ == '__main__': + args = parser.parse_args() + print_arguments(args) + if args.infer_only: + infer() + else: + train() diff --git a/benchmark/tensorflow/mnist.py b/benchmark/tensorflow/mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..7140eed6eaff49b5c65f9ccb2e38f113a4cdbdbf --- /dev/null +++ b/benchmark/tensorflow/mnist.py @@ -0,0 +1,180 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import time +import numpy as np + +import tensorflow as tf +import paddle.v2 as paddle + +DTYPE = tf.float32 + + +def parse_args(): + parser = argparse.ArgumentParser("mnist model benchmark.") + parser.add_argument( + '--batch_size', type=int, default=128, help='The minibatch size.') + parser.add_argument( + '--iterations', type=int, default=35, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=5, help='The number of passes.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + args = parser.parse_args() + return args + + +def run_benchmark(args): + def weight_variable(dtype, shape): + initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype) + return tf.Variable(initial) + + def bias_variable(dtype, shape): + initial = tf.constant(0.1, shape=shape, dtype=dtype) + return tf.Variable(initial) + + device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0' + with tf.device(device): + images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1)) + labels = tf.placeholder(tf.int64, shape=(None, )) + + # conv1, relu, pool1 + conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20]) + conv1_bias = bias_variable(DTYPE, [20]) + conv1 = tf.nn.conv2d( + images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID") + relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias)) + pool1 = tf.nn.max_pool( + relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID") + + # conv2, relu, pool2 + conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50]) + conv2_bias = bias_variable(DTYPE, [50]) + conv2 = tf.nn.conv2d( + pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID") + relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias)) + pool2 = tf.nn.max_pool( + relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID") + + # FC + pool_shape = pool2.get_shape().as_list() + hidden_dim = reduce(lambda a, b: a * b, pool_shape[1:], 1) + reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim)) + fc_weights = weight_variable(DTYPE, [hidden_dim, 10]) + fc_bias = bias_variable(DTYPE, [10]) + logits = tf.matmul(reshape, fc_weights) + fc_bias + + # Get prediction + prediction = tf.nn.softmax(logits) + + # Loss + one_hot_labels = tf.one_hot(labels, depth=10) + cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1]) + avg_cost = tf.reduce_mean(cost) + + # Get accuracy + correct = tf.equal(tf.argmax(prediction, 1), labels) + accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) + + # metrics, g_accuracy + with tf.variable_scope("reset_metrics_accuracy_scope") as scope: + g_accuracy = tf.metrics.accuracy( + labels, tf.argmax( + prediction, axis=1)) + vars = tf.contrib.framework.get_variables( + scope, collection=tf.GraphKeys.LOCAL_VARIABLES) + g_accuracy_reset_op = tf.variables_initializer(vars) + + # Optimizer + opt = tf.train.AdamOptimizer( + learning_rate=0.001, beta1=0.9, beta2=0.999) + train_op = opt.minimize(avg_cost) + # train_op = tf.train.AdamOptimizer(1e-4).minimize(avg_cost) + + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=args.batch_size) + + def eval_test(): + sess.run(g_accuracy_reset_op) + for batch_id, data in enumerate(test_reader()): + images_data = np.array( + map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32") + labels_data = np.array(map(lambda x: x[1], data)).astype("int64") + + loss, acc, g_acc = sess.run( + [avg_cost, accuracy, g_accuracy], + feed_dict={images: images_data, + labels: labels_data}) + return g_acc[1] + + config = tf.ConfigProto( + intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) + config.gpu_options.allow_growth = True + + with tf.Session(config=config) as sess: + init_g = tf.global_variables_initializer() + init_l = tf.local_variables_initializer() + sess.run(init_g) + sess.run(init_l) + for pass_id in range(args.pass_num): + sess.run(g_accuracy_reset_op) + + pass_start = time.time() + for batch_id, data in enumerate(train_reader()): + images_data = np.array( + map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32") + labels_data = np.array(map(lambda x: x[1], data)).astype( + "int64") + + start = time.time() + _, loss, acc, g_acc = sess.run( + [train_op, avg_cost, accuracy, g_accuracy], + feed_dict={images: images_data, + labels: labels_data}) + end = time.time() + + print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" % + (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000)) + + pass_end = time.time() + test_avg_acc = eval_test() + + print( + "pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f" + % (pass_id, g_acc[1], test_avg_acc, + (pass_end - pass_start) / 1000)) + + +def print_arguments(args): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +if __name__ == '__main__': + args = parse_args() + print_arguments(args) + run_benchmark(args) diff --git a/benchmark/tensorflow/resnet.py b/benchmark/tensorflow/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..c432fa8d59571e128b9ff9e3ffa1949b792ef3a4 --- /dev/null +++ b/benchmark/tensorflow/resnet.py @@ -0,0 +1,504 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +based on https://github.com/tensorflow/models/blob/master/official/resnet/resnet_model.py + +Get help: python resnet.py --help +See performance on flowers: python resnet.py +Train on cifar10: python resnet.py --data=cifar10 --with_test +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import time +import numpy as np + +import paddle.v2 as paddle +import tensorflow as tf + +DTYPE = tf.float32 + + +def parse_args(): + parser = argparse.ArgumentParser('Convolution model benchmark.') + parser.add_argument( + '--model', + type=str, + choices=['resnet'], + default='resnet', + help='The model architecture.') + parser.add_argument( + '--batch_size', type=int, default=32, help='The minibatch size.') + parser.add_argument( + '--use_fake_data', + action='store_true', + help='use real data or fake data') + parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) + parser.add_argument( + '--iterations', + type=int, + default=105, + help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=300, help='The number of passes.') + parser.add_argument( + '--order', + type=str, + default='NHWC', + choices=['NCHW', 'NHWC'], + help='The data order, now only support NCHW.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--data', + type=str, + default='flowers102', + choices=['flowers102', 'cifar10'], + help='The kinds of data.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + args = parser.parse_args() + return args + + +def print_arguments(args): + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') + vars(args)['iterations'] = vars(args)['pass_num'] * 1000 if vars(args)[ + 'with_test'] else vars(args)['iterations'] + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def fixed_padding(inputs, kernel_size, data_format): + """Pads the input along the spatial dimensions independently of input size. + Args: + inputs: A tensor of size [batch, channels, height_in, width_in] or + [batch, height_in, width_in, channels] depending on data_format. + kernel_size: The kernel to be used in the conv2d or max_pool2d operation. + Should be a positive integer. + data_format: The input format ('channels_last' or 'channels_first'). + Returns: + A tensor with the same format as the input with the data either intact + (if kernel_size == 1) or padded (if kernel_size > 1). + """ + pad_total = kernel_size - 1 + pad_beg = pad_total // 2 + pad_end = pad_total - pad_beg + + if data_format == 'channels_first': + padded_inputs = tf.pad(inputs, [[0, 0], [0, 0], [pad_beg, pad_end], + [pad_beg, pad_end]]) + else: + padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end], + [pad_beg, pad_end], [0, 0]]) + return padded_inputs + + +def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format): + """Strided 2-D convolution with explicit padding.""" + # The padding is consistent and is based only on `kernel_size`, not on the + # dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone). + # This is consistent with PaddlePaddle. + # In addition, the calculation for output size in TensorFlow can refer: + # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/common_shape_fns.cc + if strides > 1: + inputs = fixed_padding(inputs, kernel_size, data_format) + + return tf.layers.conv2d( + inputs=inputs, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=('SAME' if strides == 1 else 'VALID'), + use_bias=False, + kernel_initializer=tf.variance_scaling_initializer(), + data_format=data_format) + + +def conv_bn(inputs, + filters, + kernel_size, + strides, + is_training, + data_format, + act=True): + # def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format): + # set fused=True for a significant performance boost. See + # https://www.tensorflow.org/performance/performance_guide#common_fused_ops + inputs = conv2d_fixed_padding( + inputs=inputs, + filters=filters, + kernel_size=kernel_size, + strides=strides, + data_format=data_format) + inputs = tf.layers.batch_normalization( + inputs=inputs, + axis=1 if data_format == 'channels_first' else 3, + momentum=0.9, + epsilon=1e-05, + center=True, + scale=True, + training=is_training, + fused=True) + if act: + inputs = tf.nn.relu(inputs) + return inputs + + +def basicblock(inputs, filters, is_training, projection_shortcut, strides, + data_format): + shortcut = inputs + if projection_shortcut is not None: + shortcut = projection_shortcut(inputs) + inputs = conv_bn(inputs, filters, 3, strides, is_training, data_format) + inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False) + inputs = inputs + shortcut + inputs = tf.nn.relu(inputs) + return inputs + + +def bottleneck(inputs, filters, is_training, projection_shortcut, strides, + data_format): + shortcut = inputs + if projection_shortcut is not None: + shortcut = projection_shortcut(inputs) + inputs = conv_bn(inputs, filters, 1, strides, is_training, data_format) + inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False) + inputs = conv_bn( + inputs, filters * 4, 1, 1, is_training, data_format, act=False) + inputs = inputs + shortcut + inputs = tf.nn.relu(inputs) + return inputs + + +def block_layer(inputs, filters, block_fn, blocks, strides, is_training, name, + data_format): + # Bottleneck blocks end with 4x the number of filters as they start with + filters_out = 4 * filters if block_fn is bottleneck else filters + + def projection_shortcut(inputs): + return conv2d_fixed_padding( + inputs=inputs, + filters=filters_out, + kernel_size=1, + strides=strides, + data_format=data_format) + + # Only the first block per block_layer uses projection_shortcut and strides + inputs = block_fn(inputs, filters, is_training, projection_shortcut, + strides, data_format) + + for _ in range(1, blocks): + inputs = block_fn(inputs, filters, is_training, None, 1, data_format) + + return tf.identity(inputs, name) + + +def resnet_imagenet(depth, class_dim, data_format): + """Returns the ResNet model for a given size and number of output classes.""" + + def resnet_generator(block_fn, + layers, + num_classes, + data_format='channels_last'): + if data_format is None: + data_format = ('channels_first' + if tf.test.is_built_with_cuda() else 'channels_last') + + def model(inputs, is_training): + """Constructs the ResNet model given the inputs.""" + if data_format == 'channels_first': + # Convert the inputs from channels_last (NHWC) to channels_first (NCHW). + # This provides a large performance boost on GPU. See + # https://www.tensorflow.org/performance/performance_guide#data_formats + inputs = tf.transpose(inputs, [0, 3, 1, 2]) + + inputs = conv_bn(inputs, 64, 7, 2, is_training, data_format) + inputs = tf.identity(inputs, 'initial_conv') + inputs = tf.layers.max_pooling2d( + inputs=inputs, + pool_size=3, + strides=2, + padding='SAME', + data_format=data_format) + inputs = tf.identity(inputs, 'initial_max_pool') + inputs = block_layer(inputs, 64, block_fn, layers[0], 1, + is_training, 'block_layer1', data_format) + inputs = block_layer(inputs, 128, block_fn, layers[1], 2, + is_training, 'block_layer2', data_format) + inputs = block_layer(inputs, 256, block_fn, layers[2], 2, + is_training, 'block_layer3', data_format) + inputs = block_layer(inputs, 512, block_fn, layers[3], 2, + is_training, 'block_layer4', data_format) + inputs = tf.layers.average_pooling2d( + inputs=inputs, + pool_size=7, + strides=1, + padding='VALID', + data_format=data_format) + inputs = tf.identity(inputs, 'final_avg_pool') + inputs = tf.reshape(inputs, + [-1, 512 if block_fn is basicblock else 2048]) + inputs = tf.layers.dense(inputs=inputs, units=num_classes) + inputs = tf.identity(inputs, 'final_dense') + return inputs + + return model + + model_params = { + 18: { + 'block': basicblock, + 'layers': [2, 2, 2, 2] + }, + 34: { + 'block': basicblock, + 'layers': [3, 4, 6, 3] + }, + 50: { + 'block': bottleneck, + 'layers': [3, 4, 6, 3] + }, + 101: { + 'block': bottleneck, + 'layers': [3, 4, 23, 3] + }, + 152: { + 'block': bottleneck, + 'layers': [3, 8, 36, 3] + }, + 200: { + 'block': bottleneck, + 'layers': [3, 24, 36, 3] + } + } + if depth not in model_params: + raise ValueError('Not a valid depth:', depth) + params = model_params[depth] + return resnet_generator(params['block'], params['layers'], class_dim, + data_format) + + +def resnet_cifar10(depth, num_classes, data_format): + if depth % 6 != 2: + raise ValueError('depth must be 6n + 2:', depth) + + num_blocks = (depth - 2) // 6 + + if data_format is None: + data_format = ('channels_first' + if tf.test.is_built_with_cuda() else 'channels_last') + + def model(inputs, is_training): + inputs = conv_bn(inputs, 16, 3, 1, is_training, data_format) + inputs = tf.identity(inputs, 'initial_conv') + inputs = block_layer(inputs, 16, basicblock, num_blocks, 1, is_training, + 'block_layer1', data_format) + inputs = block_layer(inputs, 32, basicblock, num_blocks, 2, is_training, + 'block_layer2', data_format) + inputs = block_layer(inputs, 64, basicblock, num_blocks, 2, is_training, + 'block_layer3', data_format) + inputs = tf.layers.average_pooling2d( + inputs=inputs, + pool_size=8, + strides=1, + padding='VALID', + data_format=data_format) + inputs = tf.identity(inputs, 'final_avg_pool') + inputs = tf.reshape(inputs, [-1, 64]) + inputs = tf.layers.dense(inputs=inputs, units=num_classes) + inputs = tf.identity(inputs, 'final_dense') + return inputs + + return model + + +def run_benchmark(args, data_format='channels_last', device='/cpu:0'): + """Our model_fn for ResNet to be used with our Estimator.""" + + class_dim = 1000 + dshape = (None, 224, 224, 3) + + pdshape = (3, 224, 224) + if args.data == 'flowers102': + class_dim = 102 + dshape = (None, 224, 224, 3) + pdshape = (3, 224, 224) + elif args.data == 'cifar10': + class_dim = 10 + dshape = (None, 32, 32, 3) + pdshape = (3, 32, 32) + + with tf.device(device): + images = tf.placeholder(DTYPE, shape=dshape) + labels = tf.placeholder(tf.int64, shape=(None, )) + is_training = tf.placeholder('bool') + onehot_labels = tf.one_hot(labels, depth=class_dim) + + network = resnet_cifar10( + 32, class_dim, + data_format) if args.data == 'cifar10' else resnet_imagenet( + 50, class_dim, data_format) + + logits = network(inputs=images, is_training=is_training) + + cross_entropy = tf.losses.softmax_cross_entropy( + logits=logits, onehot_labels=onehot_labels) + avg_cost = tf.reduce_mean(cross_entropy) + + correct = tf.equal(tf.argmax(logits, 1), labels) + accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) + + lr = 0.1 if args.data == 'cifar10' else 0.01 + optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9) + + # Batch norm requires update_ops to be added as a train_op dependency. + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with tf.control_dependencies(update_ops): + train_op = optimizer.minimize(avg_cost) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=100) + + def test(): + test_accs = [] + for batch_id, data in enumerate(test_reader()): + test_images = np.array( + map(lambda x: np.transpose(x[0].reshape(pdshape), + axes=[1, 2, 0]), data)).astype("float32") + test_labels = np.array(map(lambda x: x[1], data)).astype('int64') + test_accs.append( + accuracy.eval(feed_dict={ + images: test_images, + labels: test_labels, + is_training: False + })) + print("Pass = %d, Train performance = %f imgs/s, Test accuracy = %f\n" % + (pass_id, num_samples / train_elapsed, np.mean(test_accs))) + + config = tf.ConfigProto( + intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) + config.gpu_options.allow_growth = True + + with tf.Session(config=config) as sess: + init_g = tf.global_variables_initializer() + init_l = tf.local_variables_initializer() + sess.run(init_g) + sess.run(init_l) + + if args.use_fake_data: + data = train_reader().next() + images_data = np.array( + map(lambda x: np.transpose(x[0].reshape(pdshape), + axes=[1, 2, 0]), data)).astype("float32") + labels_data = np.array(map(lambda x: x[1], data)).astype('int64') + iters, num_samples, start_time = 0, 0, 0.0 + for pass_id in range(args.pass_num): + if iters == args.iterations: + break + train_accs = [] + train_losses = [] + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + if not args.use_fake_data: + images_data = np.array( + map(lambda x: np.transpose(x[0].reshape(pdshape), + axes=[1, 2, 0]), data)).astype("float32") + labels_data = np.array(map(lambda x: x[1], data)).astype( + 'int64') + _, loss, acc = sess.run([train_op, avg_cost, accuracy], + feed_dict={ + images: images_data, + labels: labels_data, + is_training: True + }) + iters += 1 + train_accs.append(acc) + train_losses.append(loss) + num_samples += len(data) + print("Pass=%d, Iter=%d, Loss=%f, Accuray=%f\n" % + (pass_id, iters, loss, acc)) + + train_elapsed = time.time() - start_time + print("Pass=%d, Loss=%f, Accuray=%f\n" % + (pass_id, np.mean(train_losses), np.mean(train_accs))) + + # evaluation + if args.with_test: + test() + + if not args.with_test: + duration = time.time() - start_time + examples_per_sec = num_samples / duration + sec_per_batch = duration / (iters - args.skip_batch_num) + + print('Total examples: %d, total time: %.5f' % + (num_samples, duration)) + print('%.5f examples/sec, %.5f sec/batch' % + (examples_per_sec, sec_per_batch)) + + +if __name__ == '__main__': + args = parse_args() + print_arguments(args) + if tf.test.is_built_with_cuda(): + device = '/device:GPU:0' + if args.order == 'NHWC': + data_format = 'channels_last' + else: + data_format = 'channels_first' + else: + device = '/cpu:0' + if args.order == 'NHWC': + data_format = 'channels_last' + else: + raise ValueError('Only support NHWC order in CPU mode') + + run_benchmark(args, data_format, device) diff --git a/benchmark/tensorflow/stacked_dynamic_lstm.py b/benchmark/tensorflow/stacked_dynamic_lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..5285033005044d907d0b7e91eb66ee7281c4f27a --- /dev/null +++ b/benchmark/tensorflow/stacked_dynamic_lstm.py @@ -0,0 +1,220 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import time +import tensorflow as tf + +import paddle.v2 as paddle + + +def parse_args(): + parser = argparse.ArgumentParser("LSTM model benchmark.") + parser.add_argument( + '--batch_size', + type=int, + default=32, + help='The sequence number of a batch data. (default: %(default)d)') + parser.add_argument( + '--stacked_num', + type=int, + default=5, + help='Number of lstm layers to stack. (default: %(default)d)') + parser.add_argument( + '--embedding_dim', + type=int, + default=512, + help='Dimension of embedding table. (default: %(default)d)') + parser.add_argument( + '--hidden_dim', + type=int, + default=512, + help='Hidden size of lstm unit. (default: %(default)d)') + parser.add_argument( + '--pass_num', + type=int, + default=10, + help='Epoch number to train. (default: %(default)d)') + parser.add_argument( + '--learning_rate', + type=float, + default=0.0002, + help='Learning rate used to train. (default: %(default)f)') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + args = parser.parse_args() + return args + + +def print_arguments(args): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def dynamic_lstm_model(dict_size, + embedding_dim, + hidden_dim, + stacked_num, + class_num=2, + is_train=True): + word_idx = tf.placeholder(tf.int64, shape=[None, None]) + sequence_length = tf.placeholder(tf.int64, shape=[None, ]) + + embedding_weights = tf.get_variable('word_embeddings', + [dict_size, embedding_dim]) + embedding = tf.nn.embedding_lookup(embedding_weights, word_idx) + + lstm_cell = tf.nn.rnn_cell.LSTMCell( + num_units=hidden_dim, use_peepholes=False) + stacked_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * stacked_num) + + # final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] total stacked_num LSTMTuples + _, final_state = tf.nn.dynamic_rnn( + cell=stacked_cell, + inputs=embedding, + dtype=tf.float32, + sequence_length=sequence_length) + + w = tf.Variable( + tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32) + bias = tf.Variable( + tf.constant( + value=0.0, shape=[class_num], dtype=tf.float32)) + prediction = tf.matmul(final_state[-1][1], w) + bias + + if not is_train: + return (word_idx, sequence_length), tf.nn.softmax(prediction) + + label = tf.placeholder(tf.int64, shape=[None, ]) + loss = tf.nn.softmax_cross_entropy_with_logits( + labels=tf.one_hot(label, 2), logits=prediction) + avg_loss = tf.reduce_mean(loss) + + correct_count = tf.equal(tf.argmax(prediction, 1), label) + acc = tf.reduce_mean(tf.cast(correct_count, tf.float32)) + + with tf.variable_scope("reset_metrics_accuracy_scope") as scope: + g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1)) + vars = tf.contrib.framework.get_variables( + scope, collection=tf.GraphKeys.LOCAL_VARIABLES) + reset_op = tf.variables_initializer(vars) + + return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op + + +def padding_data(data, padding_size, value): + data = data + [value] * padding_size + return data[:padding_size] + + +def train(args): + word_dict = paddle.dataset.imdb.word_dict() + dict_size = len(word_dict) + + feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model( + dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num) + + adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate) + train_op = adam_optimizer.minimize(avg_loss) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.train(word_dict), buf_size=25000), + batch_size=args.batch_size) + + test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.test(word_dict), buf_size=25000), + batch_size=args.batch_size) + + def do_validation(sess): + sess.run(reset_op) + for batch_id, data in enumerate(test_reader()): + word_idx = map(lambda x: x[0], data) + sequence_length = np.array( + [len(seq) for seq in word_idx]).astype('int64') + maxlen = np.max(sequence_length) + word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx] + word_idx = np.array(word_idx).astype('int64') + label = np.array(map(lambda x: x[1], data)).astype('int64') + + _, loss, fetch_acc, fetch_g_acc = sess.run( + [train_op, avg_loss, acc, g_acc], + feed_dict={ + feeding_list[0]: word_idx, + feeding_list[1]: sequence_length, + feeding_list[2]: label + }) + + return fetch_g_acc[1] + + config = tf.ConfigProto( + intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) + config.gpu_options.allow_growth = True + with tf.Session(config=config) as sess: + init_g = tf.global_variables_initializer() + init_l = tf.local_variables_initializer() + sess.run(init_l) + sess.run(init_g) + + for pass_id in xrange(args.pass_num): + # clear accuracy local variable + sess.run(reset_op) + pass_start_time = time.time() + words_seen = 0 + + for batch_id, data in enumerate(train_reader()): + word_idx = map(lambda x: x[0], data) + sequence_length = np.array( + [len(seq) for seq in word_idx]).astype('int64') + words_seen += np.sum(sequence_length) + maxlen = np.max(sequence_length) + word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx] + word_idx = np.array(word_idx).astype('int64') + label = np.array(map(lambda x: x[1], data)).astype('int64') + + _, loss, fetch_acc, fetch_g_acc = sess.run( + [train_op, avg_loss, acc, g_acc], + feed_dict={ + feeding_list[0]: word_idx, + feeding_list[1]: sequence_length, + feeding_list[2]: label + }) + + print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f" + % (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1])) + + pass_end_time = time.time() + time_consumed = pass_end_time - pass_start_time + words_per_sec = words_seen / time_consumed + test_acc = do_validation(sess) + print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" % + (pass_id, test_acc, words_per_sec, time_consumed)) + + +if __name__ == '__main__': + args = parse_args() + print_arguments(args) + + if args.infer_only: + pass + else: + train(args) diff --git a/benchmark/tensorflow/vgg.py b/benchmark/tensorflow/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..fba5ec71a46b3ac8b2e1244424c39fd5192e5458 --- /dev/null +++ b/benchmark/tensorflow/vgg.py @@ -0,0 +1,324 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VGG16 benchmark in TensorFlow""" +import tensorflow as tf +import paddle.v2 as paddle +import numpy as np +import argparse +import time + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--batch_size', type=int, default=128, help="Batch size for training.") +parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test') +parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') +parser.add_argument( + '--learning_rate', + type=float, + default=1e-3, + help="Learning rate for training.") +parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.") +parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help="The device type.") +parser.add_argument( + '--data_format', + type=str, + default='NHWC', + choices=['NCHW', 'NHWC'], + help='The data order, NCHW=[batch, channels, height, width].' + 'Only support NHWC right now.') +parser.add_argument( + '--data_set', + type=str, + default='cifar10', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') +args = parser.parse_args() + + +class VGG16Model(object): + def __init__(self): + self.parameters = [] + + def batch_norm_relu(self, inputs, is_training): + """Performs a batch normalization followed by a ReLU.""" + # We set fused=True for a significant speed boost. See + # https://www.tensorflow.org/speed/speed_guide#common_fused_ops + inputs = tf.layers.batch_normalization( + inputs=inputs, + axis=1 if args.data_format == 'NCHW' else -1, + momentum=0.9, + epsilon=1e-05, + center=True, + scale=True, + training=is_training, + fused=True) + inputs = tf.nn.relu(inputs) + return inputs + + def conv_bn_layer(self, + name, + images, + kernel_shape, + is_training, + drop_rate=0.0): + with tf.name_scope(name) as scope: + kernel = tf.Variable( + tf.truncated_normal( + kernel_shape, dtype=tf.float32, stddev=1e-1), + name='weights') + conv = tf.nn.conv2d( + images, + kernel, [1, 1, 1, 1], + data_format=args.data_format, + padding='SAME') + biases = tf.Variable( + tf.constant( + 0.0, shape=[kernel_shape[-1]], dtype=tf.float32), + trainable=True, + name='biases') + out = tf.nn.bias_add(conv, biases) + out = self.batch_norm_relu(out, is_training) + out = tf.layers.dropout(out, rate=drop_rate, training=is_training) + return out + + def fc_layer(self, name, inputs, shape): + with tf.name_scope(name) as scope: + fc_w = tf.Variable( + tf.truncated_normal( + shape, dtype=tf.float32, stddev=1e-1), + name='weights') + fc_b = tf.Variable( + tf.constant( + 0.0, shape=[shape[-1]], dtype=tf.float32), + trainable=True, + name='biases') + out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b) + return out + + def network(self, images, class_dim, is_training): + """ VGG16 model structure. + + TODO(kuke): enable this network to support the 'NCHW' data format + """ + + # conv1 + conv1_1 = self.conv_bn_layer( + 'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3) + conv1_2 = self.conv_bn_layer( + 'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0) + # pool1 + pool1 = tf.nn.max_pool( + conv1_2, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool1') + # conv2 + conv2_1 = self.conv_bn_layer( + 'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4) + conv2_2 = self.conv_bn_layer( + 'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0) + # pool2 + pool2 = tf.nn.max_pool( + conv2_2, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool2') + # conv3 + conv3_1 = self.conv_bn_layer( + 'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4) + conv3_2 = self.conv_bn_layer( + 'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4) + conv3_3 = self.conv_bn_layer( + 'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0) + # pool3 + pool3 = tf.nn.max_pool( + conv3_3, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool3') + # conv4 + conv4_1 = self.conv_bn_layer( + 'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4) + conv4_2 = self.conv_bn_layer( + 'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4) + conv4_3 = self.conv_bn_layer( + 'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0) + # pool4 + pool4 = tf.nn.max_pool( + conv4_3, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool4') + # conv5 + conv5_1 = self.conv_bn_layer( + 'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4) + conv5_2 = self.conv_bn_layer( + 'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4) + conv5_3 = self.conv_bn_layer( + 'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0) + # pool5 + pool5 = tf.nn.max_pool( + conv5_3, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool4') + # flatten + shape = int(np.prod(pool5.get_shape()[1:])) + pool5_flat = tf.reshape(pool5, [-1, shape]) + # fc1 + drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training) + fc1 = self.fc_layer('fc1', drop, [shape, 512]) + # fc2 + bn = self.batch_norm_relu(fc1, is_training) + drop = tf.layers.dropout(bn, rate=0.5, training=is_training) + fc2 = self.fc_layer('fc2', drop, [512, 512]) + + fc3 = self.fc_layer('fc3', fc2, [512, class_dim]) + + return fc3 + + +def run_benchmark(): + """Run benchmark on cifar10 or flowers.""" + + if args.data_set == "cifar10": + class_dim = 10 + raw_shape = (3, 32, 32) + dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else ( + None, 3, 32, 32) + else: + class_dim = 102 + raw_shape = (3, 224, 224) + dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else ( + None, 3, 224, 224) + + device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0' + + with tf.device(device): + images = tf.placeholder(tf.float32, shape=dat_shape) + labels = tf.placeholder(tf.int64, shape=(None, )) + is_training = tf.placeholder('bool') + onehot_labels = tf.one_hot(labels, depth=class_dim) + + vgg16 = VGG16Model() + logits = vgg16.network(images, class_dim, is_training) + loss = tf.losses.softmax_cross_entropy( + onehot_labels=onehot_labels, logits=logits) + avg_loss = tf.reduce_mean(loss) + + correct = tf.equal(tf.argmax(logits, 1), labels) + accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) + + optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate) + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with tf.control_dependencies(update_ops): + train_op = optimizer.minimize(avg_loss) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + buf_size=5120), + batch_size=args.batch_size) + + # test + def test(): + test_accs = [] + for batch_id, data in enumerate(test_reader()): + test_images = np.array( + map(lambda x: np.transpose(x[0].reshape(raw_shape), + axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32") + test_labels = np.array(map(lambda x: x[1], data)).astype('int64') + test_accs.append( + accuracy.eval(feed_dict={ + images: test_images, + labels: test_labels, + is_training: False + })) + return np.mean(test_accs) + + config = tf.ConfigProto( + intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) + config.gpu_options.allow_growth = True + + with tf.Session(config=config) as sess: + init_g = tf.global_variables_initializer() + init_l = tf.local_variables_initializer() + sess.run(init_g) + sess.run(init_l) + iters, num_samples, start_time = 0, 0, time.time() + for pass_id in range(args.num_passes): + # train + num_samples = 0 + start_time = time.time() + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + train_images = np.array( + map(lambda x: np.transpose(x[0].reshape(raw_shape), + axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32") + train_labels = np.array(map(lambda x: x[1], data)).astype( + 'int64') + _, loss, acc = sess.run([train_op, avg_loss, accuracy], + feed_dict={ + images: train_images, + labels: train_labels, + is_training: True + }) + iters += 1 + num_samples += len(data) + print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" % + (pass_id, iters, loss, acc)) + train_elapsed = time.time() - start_time + # test + pass_test_acc = test() + print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" % + (pass_id, num_samples / train_elapsed, pass_test_acc)) + + +def print_arguments(): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +if __name__ == '__main__': + print_arguments() + run_benchmark() diff --git a/paddle/.gitignore b/paddle/.gitignore index f921eef14156a97e4fd250f014960e306b43f35a..1c1c0c2c829f088d7e3f52ca007fcb8f33a16a36 100644 --- a/paddle/.gitignore +++ b/paddle/.gitignore @@ -1,3 +1,4 @@ +.timestamp *.o *.a .svn diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 16a118090ba9cfd50b4b03484983f9fc73cf7973..c2ca1bbc78f3ebc6066df6b666720af0d1fbbf59 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -93,6 +93,43 @@ static void CheckTensorNANOrInf(const std::string& name, "Tensor %s contains NAN", name); } +void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, + int block_id) { + auto& global_block = pdesc.Block(block_id); + + const Scope* ancestor_scope = scope; + while (ancestor_scope->parent()) { + ancestor_scope = ancestor_scope->parent(); + } + + if (ancestor_scope != scope) { + for (auto& var : global_block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var->Persistable()) { + auto* ptr = const_cast(ancestor_scope)->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { + auto* ptr = scope->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } + } + } else { + for (auto& var : global_block.AllVars()) { + auto* ptr = scope->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create variable " << var->Name() << ", which pointer is " + << ptr; + } + } +} + void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars) { platform::RecordBlock b(block_id); @@ -184,8 +221,8 @@ static bool has_fetch_operators( void Executor::Run(const ProgramDesc& program, Scope* scope, std::map& feed_targets, std::map& fetch_targets, - const std::string& feed_holder_name, - const std::string& fetch_holder_name, bool create_vars) { + bool create_vars, const std::string& feed_holder_name, + const std::string& fetch_holder_name) { platform::RecordBlock b(kProgramId); bool has_feed_ops = has_feed_operators(program.Block(0), feed_targets, feed_holder_name); @@ -296,38 +333,13 @@ std::vector> Executor::Prepare( void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, bool create_local_scope, bool create_vars) { - auto& block = ctx->prog_.Block(ctx->block_id_); - Scope* local_scope = scope; if (create_vars) { if (create_local_scope) { local_scope = &scope->NewScope(); - for (auto& var : block.AllVars()) { - if (var->Name() == framework::kEmptyVarName) { - continue; - } - - if (var->Persistable()) { - auto* ptr = scope->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " global, which pointer is " << ptr; - } else { - auto* ptr = local_scope->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " locally, which pointer is " << ptr; - } - } - } else { - for (auto& var : block.AllVars()) { - auto* ptr = local_scope->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create variable " << var->Name() << ", which pointer is " - << ptr; - } - } // if (create_local_scope) - } // if (create_vars) + } + CreateVariables(ctx->prog_, local_scope, ctx->block_id_); + } for (auto& op : ctx->ops_) { VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index d7c99165f0c9d3b1ae11a3b4753a61e8118f7b52..75b29b2f4065ad75b62a134b890b8f9f6730fdc7 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -54,9 +54,9 @@ class Executor { void Run(const ProgramDesc& program, Scope* scope, std::map& feed_targets, std::map& fetch_targets, + bool create_vars = true, const std::string& feed_holder_name = "feed", - const std::string& fetch_holder_name = "fetch", - bool create_vars = true); + const std::string& fetch_holder_name = "fetch"); static std::unique_ptr Prepare( const ProgramDesc& program, int block_id); @@ -64,6 +64,8 @@ class Executor { static std::vector> Prepare( const ProgramDesc& program, const std::vector& block_ids); + void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id); + void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, bool create_local_scope = true, bool create_vars = true); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 7be93fa6002ae93c3e1b75c8f7fe5ca5f40b271f..74945fb4f2f745b6ca9c48adb0c8b9e6ae1e94a4 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" -#include "paddle/fluid/platform/profiler.h" #include #include @@ -24,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { @@ -43,30 +43,40 @@ class ParallelExecutorPrivate { #endif }; +std::vector &ParallelExecutor::GetLocalScopes() { + return member_->local_scopes_; +} + ParallelExecutor::ParallelExecutor( size_t num_threads, bool use_event, const std::vector &places, const std::unordered_set ¶ms, - const ProgramDesc &startup_program, const ProgramDesc &main_program, - const std::string &loss_var_name, Scope *scope, bool allow_op_delay) + const std::unordered_set &bcast_vars, + const ProgramDesc &main_program, const std::string &loss_var_name, + Scope *scope, const std::vector &local_scopes, bool allow_op_delay) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; - // Step 1. RunStartupProgram and Bcast the params to devs. - Executor exe(places[0]); - exe.Run(startup_program, scope, 0); + // Step 1. Bcast the params to devs. // Create local scopes - for (size_t i = 0; i < member_->places_.size(); ++i) { - member_->local_scopes_.push_back(&scope->NewScope()); + if (local_scopes.empty()) { + for (size_t i = 0; i < member_->places_.size(); ++i) { + member_->local_scopes_.push_back(&scope->NewScope()); + } + } else { + PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size()); + for (size_t i = 0; i < member_->places_.size(); ++i) { + member_->local_scopes_.push_back(local_scopes[i]); + } } // Bcast Parameters to all GPUs #ifdef PADDLE_WITH_CUDA member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_)); #endif - if (platform::is_gpu_place(places[0]) && - member_->local_scopes_.size() != 1) { // Is CUDA - BCastParamsToGPUs(startup_program); + if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 && + local_scopes.empty()) { // Is CUDA + BCastParamsToGPUs(bcast_vars); } // Startup Program has been run. All local scopes has correct parameters. @@ -99,48 +109,47 @@ ParallelExecutor::ParallelExecutor( } void ParallelExecutor::BCastParamsToGPUs( - const ProgramDesc &startup_program) const { + const std::unordered_set &vars) const { #ifdef PADDLE_WITH_CUDA auto *main_scope = member_->local_scopes_[0]; - for (auto *var_desc : startup_program.Block(0).AllVars()) { - size_t idx = var_desc->Name().find("@GRAD"); - if (idx != std::string::npos) continue; - if (var_desc->GetType() == proto::VarType::LOD_TENSOR) { - auto &main_tensor = - main_scope->FindVar(var_desc->Name())->Get(); - - auto &dims = main_tensor.dims(); - - if (paddle::platform::is_gpu_place(main_tensor.place())) { - size_t numel = main_tensor.numel(); - ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); - platform::NCCLGroupGuard guard; - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto place = member_->places_[i]; - void *buffer; - if (i == 0) { - buffer = const_cast(main_tensor.data()); - } else { - auto local_scope = member_->local_scopes_[i]; - auto *t = - local_scope->Var(var_desc->Name())->GetMutable(); - t->Resize(dims); - buffer = t->mutable_data(place, main_tensor.type()); - } - auto &nccl_ctx = member_->nccl_ctxs_->at(place); - platform::dynload::ncclBcast(buffer, numel, data_type, 0, - nccl_ctx.comm_, nccl_ctx.stream()); - } - } else { - platform::CPUPlace cpu; - for (size_t i = 1; i < member_->places_.size(); ++i) { + for (auto &var : vars) { + auto *main_var = main_scope->FindVar(var); + if (!main_var->IsType()) { + continue; + } + + auto &main_tensor = main_var->Get(); + + auto &dims = main_tensor.dims(); + + if (paddle::platform::is_gpu_place(main_tensor.place())) { + size_t numel = main_tensor.numel(); + ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); + platform::NCCLGroupGuard guard; + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto place = member_->places_[i]; + void *buffer; + if (i == 0) { + buffer = const_cast(main_tensor.data()); + } else { auto local_scope = member_->local_scopes_[i]; - auto *t = local_scope->Var(var_desc->Name())->GetMutable(); + auto *t = local_scope->Var(var)->GetMutable(); t->Resize(dims); - t->mutable_data(cpu, main_tensor.type()); - paddle::framework::TensorCopy(main_tensor, cpu, t); + buffer = t->mutable_data(place, main_tensor.type()); } + auto &nccl_ctx = member_->nccl_ctxs_->at(place); + platform::dynload::ncclBcast(buffer, numel, data_type, 0, + nccl_ctx.comm_, nccl_ctx.stream()); + } + } else { + platform::CPUPlace cpu; + for (size_t i = 1; i < member_->places_.size(); ++i) { + auto local_scope = member_->local_scopes_[i]; + auto *t = local_scope->Var(var)->GetMutable(); + t->Resize(dims); + t->mutable_data(cpu, main_tensor.type()); + paddle::framework::TensorCopy(main_tensor, cpu, t); } } member_->nccl_ctxs_->WaitAll(); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index c7c58b2b808383621a6d492f9188b0d36bfa6858..c048c3865f14822be4a0015e385ea1b8e05d0ced 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -36,11 +36,14 @@ class ParallelExecutor { explicit ParallelExecutor(size_t num_threads, bool use_event, const std::vector& places, const std::unordered_set& params, - const ProgramDesc& startup_program, + const std::unordered_set& bcast_vars, const ProgramDesc& main_program, const std::string& loss_var_name, Scope* scope, + const std::vector& local_scopes, bool allow_op_delay); + std::vector& GetLocalScopes(); + void Run(const std::vector& fetch_tensors, const std::string& fetched_var_name, const std::unordered_map& feed_tensors); @@ -51,7 +54,7 @@ class ParallelExecutor { ParallelExecutorPrivate* member_; - void BCastParamsToGPUs(const ProgramDesc& startup_program) const; + void BCastParamsToGPUs(const std::unordered_set& vars) const; }; } // namespace framework diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 97a15c71773051dfc01c98f11cf9cb76adbcec7f..c8cb70549f1d131b66fa7c6eeb35f3b7151a9e7f 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -58,7 +58,7 @@ class Scope { /// nullptr if cannot find. Variable* FindVar(const std::string& name) const; - const Scope& parent() const { return *parent_; } + const Scope* parent() const { return parent_; } /// Find the scope or an ancestor scope that contains the given variable. const Scope* FindScope(const Variable* var) const; diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc index a6b6c3f828f4c6f59fca42e4c3d9580d6c136524..ca2077d07411d2cd6095e0dc2a874af0890145c5 100644 --- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -46,8 +46,8 @@ TEST(inference, image_classification) { // Run inference on CPU LOG(INFO) << "--- CPU Runs: ---"; - TestInference(dirname, cpu_feeds, cpu_fetchs1, - FLAGS_repeat); + TestInference(dirname, cpu_feeds, + cpu_fetchs1, FLAGS_repeat); LOG(INFO) << output1.dims(); #ifdef PADDLE_WITH_CUDA @@ -57,8 +57,8 @@ TEST(inference, image_classification) { // Run inference on CUDA GPU LOG(INFO) << "--- GPU Runs: ---"; - TestInference(dirname, cpu_feeds, cpu_fetchs2, - FLAGS_repeat); + TestInference(dirname, cpu_feeds, + cpu_fetchs2, FLAGS_repeat); LOG(INFO) << output2.dims(); CheckError(output1, output2); diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 5118e66f1e7a36333ff4425361e54ec59e6ba05b..aae34ceda07fea6e881cf61b3755ec45d1d6f2dc 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -88,7 +88,7 @@ void CheckError(const paddle::framework::LoDTensor& output1, EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; } -template +template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, const std::vector& cpu_fetchs, @@ -166,8 +166,16 @@ void TestInference(const std::string& dirname, // 6. Run the inference program { + if (!CreateVars) { + // If users don't want to create and destroy variables every time they + // run, they need to set `create_vars` to false and manually call + // `CreateVariables` before running. + executor.CreateVariables(*inference_program, scope, 0); + } + // Ignore the profiling results of the first run - executor.Run(*inference_program, scope, feed_targets, fetch_targets); + executor.Run(*inference_program, scope, feed_targets, fetch_targets, + CreateVars); // Enable the profiler paddle::platform::EnableProfiler(state); @@ -178,7 +186,8 @@ void TestInference(const std::string& dirname, "run_inference", paddle::platform::DeviceContextPool::Instance().Get(place)); - executor.Run(*inference_program, scope, feed_targets, fetch_targets); + executor.Run(*inference_program, scope, feed_targets, fetch_targets, + CreateVars); } // Disable the profiler and print the timing information diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 7fbe4efc045b6539b498389af94769e5bdb1f82e..c4efbcd3f977ee285e13223d7e0ca420aec63b98 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" diff --git a/paddle/fluid/operators/adagrad_op.cc b/paddle/fluid/operators/adagrad_op.cc index c990fe784380bf78a7f3594c0f49ef5e06e6caea..0153e1253b00ded21a7a14e37faf5a76d904d8d1 100644 --- a/paddle/fluid/operators/adagrad_op.cc +++ b/paddle/fluid/operators/adagrad_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/adagrad_op.h" +#include #include diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h index dbcc7abb0996268b5a3571ba113d9cc56f6f65a3..4309f0a5497456065e5c43bc8f7b265fa711f699 100644 --- a/paddle/fluid/operators/array_operator.h +++ b/paddle/fluid/operators/array_operator.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc index e8123cb1a490be642d1061bba8129f63e681d3c3..993610fdedde4bafd99f59a0adeeeef4526eb089 100644 --- a/paddle/fluid/operators/assign_value_op.cc +++ b/paddle/fluid/operators/assign_value_op.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "paddle/fluid/operators/assign_value_op.h" +#include +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h index c7b1a55a5cd52bd2bacbdea3ee22c75c2a2c12d5..e749d6f6d3685f207f0ad4f2ebc7c3c7ae32992c 100644 --- a/paddle/fluid/operators/assign_value_op.h +++ b/paddle/fluid/operators/assign_value_op.h @@ -14,6 +14,7 @@ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc index 71de78b1181daf4bd0b6d73508638857bafcf560..a168eaeab56128b75bbe97d7ccf843a081b5dced 100644 --- a/paddle/fluid/operators/auc_op.cc +++ b/paddle/fluid/operators/auc_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/auc_op.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h index f4e8208c3f2e238a4acecab4579fc955092d5978..8b016c3d31ad83e66baeb298c61840cc529efa1e 100644 --- a/paddle/fluid/operators/auc_op.h +++ b/paddle/fluid/operators/auc_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -40,7 +42,7 @@ class AucKernel : public framework::OpKernel { std::vector thresholds_list; thresholds_list.reserve(num_thresholds); for (int i = 1; i < num_thresholds - 1; i++) { - thresholds_list[i] = (float)i / (num_thresholds - 1); + thresholds_list[i] = static_cast(i) / (num_thresholds - 1); } const float kEpsilon = 1e-7; thresholds_list[0] = 0.0f - kEpsilon; @@ -105,11 +107,12 @@ class AucKernel : public framework::OpKernel { float* fp_rate_data = fp_rate.mutable_data(ctx.GetPlace()); float* rec_rate_data = rec_rate.mutable_data(ctx.GetPlace()); for (int i = 0; i < num_thresholds; i++) { - tp_rate_data[i] = - ((float)tp_data[i] + epsilon) / (tp_data[i] + fn_data[i] + epsilon); - fp_rate_data[i] = (float)fp_data[i] / (fp_data[i] + tn_data[i] + epsilon); - rec_rate_data[i] = - ((float)tp_data[i] + epsilon) / (tp_data[i] + fp_data[i] + epsilon); + tp_rate_data[i] = (static_cast(tp_data[i]) + epsilon) / + (tp_data[i] + fn_data[i] + epsilon); + fp_rate_data[i] = + static_cast(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon); + rec_rate_data[i] = (static_cast(tp_data[i]) + epsilon) / + (tp_data[i] + fp_data[i] + epsilon); } *auc_data = 0.0f; if (curve == "ROC") { diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index c95077fcbdb6b6c0da31f30b795dbe4d7d4fe6fe..b21deaf9258567c05a8816b14ac7d6462964e8ba 100644 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -19,15 +19,15 @@ namespace operators { template <> void GetAccumulators( - const framework::ExecutionContext& ctx, int64_t& num_updates_, - int64_t& num_accumulates_, int64_t& old_num_accumulates_) { + const framework::ExecutionContext& ctx, int64_t* num_updates_, + int64_t* num_accumulates_, int64_t* old_num_accumulates_) { auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates"); auto* in_num_accumulates = ctx.Input("in_num_accumulates"); auto* in_num_updates = ctx.Input("in_num_updates"); - old_num_accumulates_ = in_old_num_accumulates->data()[0]; - num_accumulates_ = in_num_accumulates->data()[0]; - num_updates_ = in_num_updates->data()[0]; + *old_num_accumulates_ = in_old_num_accumulates->data()[0]; + *num_accumulates_ = in_num_accumulates->data()[0]; + *num_updates_ = in_num_updates->data()[0]; } template <> diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu index 270c46984465e5ca62eaa8da3955ce7a3eaa0c57..046f72b471fa7ffcc82d84262a668c90a7f577a8 100644 --- a/paddle/fluid/operators/average_accumulates_op.cu +++ b/paddle/fluid/operators/average_accumulates_op.cu @@ -19,18 +19,18 @@ namespace paddle { namespace operators { template <> void GetAccumulators( - const framework::ExecutionContext& ctx, int64_t& num_updates_, - int64_t& num_accumulates_, int64_t& old_num_accumulates_) { + const framework::ExecutionContext& ctx, int64_t* num_updates_, + int64_t* num_accumulates_, int64_t* old_num_accumulates_) { auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates"); auto* in_num_accumulates = ctx.Input("in_num_accumulates"); auto* in_num_updates = ctx.Input("in_num_updates"); auto stream = ctx.cuda_device_context().stream(); - memory::Copy(platform::CPUPlace(), &old_num_accumulates_, + memory::Copy(platform::CPUPlace(), old_num_accumulates_, platform::CUDAPlace(), in_old_num_accumulates->data(), sizeof(int64_t), stream); - memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(), + memory::Copy(platform::CPUPlace(), num_accumulates_, platform::CUDAPlace(), in_num_accumulates->data(), sizeof(int64_t), stream); - memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(), + memory::Copy(platform::CPUPlace(), num_updates_, platform::CUDAPlace(), in_num_updates->data(), sizeof(int64_t), stream); } diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h index f858109d1428dc67d94c253e5a39818eb2d4560d..07ac5ced11605f6d0d5164d1c0f69acbd7bbed60 100644 --- a/paddle/fluid/operators/average_accumulates_op.h +++ b/paddle/fluid/operators/average_accumulates_op.h @@ -29,8 +29,8 @@ using EigenVector = framework::EigenVector; template void GetAccumulators(const framework::ExecutionContext& ctx, - int64_t& num_updates, int64_t& num_accumulates, - int64_t& old_num_accumulates); + int64_t* num_updates, int64_t* num_accumulates, + int64_t* old_num_accumulates); template void SetAccumulators(const framework::ExecutionContext& ctx, @@ -47,8 +47,8 @@ class AverageAccumulatesKernel : public framework::OpKernel { int64_t num_updates = 0; int64_t num_accumulates = 0; int64_t old_num_accumulates = 0; - GetAccumulators(ctx, num_updates, num_accumulates, - old_num_accumulates); + GetAccumulators(ctx, &num_updates, &num_accumulates, + &old_num_accumulates); // Get attrs float average_window = ctx.Attr("average_window"); diff --git a/paddle/fluid/operators/go_op.cc b/paddle/fluid/operators/go_op.cc index cfa797717d78aa72e1b931b6db6e153270b3424e..58fe32446217e07235b40b9b78190094e57e4951 100644 --- a/paddle/fluid/operators/go_op.cc +++ b/paddle/fluid/operators/go_op.cc @@ -56,11 +56,11 @@ class GoOp : public framework::OperatorBase { // TODO(varunarora): Consider moving this root scope lookup to scope.h. const framework::Scope *root_scope = &scope; - const framework::Scope *parent_scope = &(root_scope->parent()); + const framework::Scope *parent_scope = root_scope->parent(); while (parent_scope != nullptr) { root_scope = parent_scope; - parent_scope = &(parent_scope->parent()); + parent_scope = parent_scope->parent(); } framework::BlockDesc *block = Attr(kBlock); diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc index f1c4415f27d54ad09e5cb3659bd16abd82e38215..8c55b4ebbc88f696e99b1194055bed3b0d0b3f0b 100644 --- a/paddle/fluid/operators/spp_op.cc +++ b/paddle/fluid/operators/spp_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/spp_op.h" +#include +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index 3d2f22632570fe2a28a822370a400390c78b533a..08cb7849d20443862b66ea6096c095b294c7242c 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index d3d5c8a3429e2070c5472355b4440401eaa699cb..9061e137bd1c789d34665729c48c1c2ea9525c8e 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -10,6 +10,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sum_op.h" +#include +#include #include #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/operators/detail/safe_ref.h" diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index e7e5346cdca5efaf81c2b0fddedde7406e3b874d..49a4afb3a8a19c97e844e66477c6288772ece807 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h index 42828b7e6564d7da91d608d63fbc0615ef6c4f97..9f8482adedb4c29e32d4109941a2752d942ae49f 100644 --- a/paddle/fluid/operators/top_k_op.h +++ b/paddle/fluid/operators/top_k_op.h @@ -15,6 +15,8 @@ limitations under the License. */ #pragma once #include #include +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 87b1f530e08df7022d112b26e28511a982052126..4aea9cd65bed615c84c95d891a0a4092678e1444 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/transpose_op.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index 90f16499a6f52514bfed3dbeb4176ccc956b23d7..895d1ce2cca19c0c1e4aa03cc64eb1425e8bab1a 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc index 0ca7ea00fafc5cf7ab240e1e41710d3b791dfbfb..31859fd1d70dc6e6387258cd5f7412e78a302567 100644 --- a/paddle/fluid/operators/unpool_op.cc +++ b/paddle/fluid/operators/unpool_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/unpool_op.h" +#include +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h index a4421045756bd39728fc14c06efd11a56c7e55af..96abad3de9b959ee611355c67f1fa9e56c430b1b 100644 --- a/paddle/fluid/operators/unpool_op.h +++ b/paddle/fluid/operators/unpool_op.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/unpooling.h" diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index 3e3e3089315ab9365925c38b9bce5fb0120d37c3..afbfe69973830bde93ec0af8d1c844580a786663 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_padding.h" diff --git a/paddle/fluid/platform/call_once.h b/paddle/fluid/platform/call_once.h deleted file mode 100644 index fa34972c38d6e7f77a7e178d68592f9886748fa1..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/call_once.h +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -namespace paddle { -namespace platform { - -/* - The current implementation of std::call_once has a bug described in - https://stackoverflow.com/questions/41717579/stdcall-once-hangs-on-second-call-after-callable-threw-on-first-call. - This is likely caused by a deeper bug of pthread_once, which is discussed in - https://patchwork.ozlabs.org/patch/482350/ - - This wrap is a hack to avoid this bug. -*/ -template -inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) { - bool good = true; - std::exception ex; - try { - std::call_once(flag, - [&](Args&&... args) { - try { - f(args...); - } catch (const std::exception& e) { - ex = e; - good = false; - } catch (...) { - ex = std::runtime_error("excption caught in call_once"); - good = false; - } - }, - args...); - } catch (std::system_error& x) { - throw std::runtime_error("call once failed"); - } - if (!good) { - throw std::exception(ex); - } -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h index d21e29df3cf9b2d78920d8bac41209d200b5ba3a..c5a10a78a4f432b431680c089f255fea777277cb 100644 --- a/paddle/fluid/platform/dynload/nccl.h +++ b/paddle/fluid/platform/dynload/nccl.h @@ -18,7 +18,6 @@ limitations under the License. */ #include // NOLINT -#include "paddle/fluid/platform/call_once.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" namespace paddle { diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc index b716ad9df41330bd6e22937381d24e33fa3a7914..d60aecf96c8828a5656f81fd3602cfb2e66990cf 100644 --- a/paddle/fluid/platform/float16_test.cc +++ b/paddle/fluid/platform/float16_test.cc @@ -8,13 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/fluid/platform/float16.h" + +#include + +#include "gtest/gtest.h" #include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/lod_tensor.h" -#include - namespace paddle { namespace platform { @@ -74,24 +75,27 @@ TEST(float16, conversion_cpu) { // Conversion operator EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00); - EXPECT_EQ(float(float16(0.5f)), 0.5f); - EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001); - EXPECT_EQ(int(float16(-1)), -1); - EXPECT_EQ(bool(float16(true)), true); + EXPECT_EQ(static_cast(float16(0.5f)), 0.5f); + EXPECT_NEAR(static_cast(float16(0.33333)), 0.33333, 0.0001); + EXPECT_EQ(static_cast(float16(-1)), -1); + EXPECT_EQ(static_cast(float16(true)), true); } TEST(float16, arithmetic_cpu) { - EXPECT_EQ(float(float16(1) + float16(1)), 2); - EXPECT_EQ(float(float16(5) + float16(-5)), 0); - EXPECT_NEAR(float(float16(0.33333f) + float16(0.66667f)), 1.0f, 0.001); - EXPECT_EQ(float(float16(3) - float16(5)), -2); - EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 0.33334f, 0.001); - EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01); - EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01); - EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001); - EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f); - EXPECT_EQ(float(-float16(512.0f)), -512.0f); - EXPECT_EQ(float(-float16(-512.0f)), 512.0f); + EXPECT_EQ(static_cast(float16(1) + float16(1)), 2); + EXPECT_EQ(static_cast(float16(5) + float16(-5)), 0); + EXPECT_NEAR(static_cast(float16(0.33333f) + float16(0.66667f)), 1.0f, + 0.001); + EXPECT_EQ(static_cast(float16(3) - float16(5)), -2); + EXPECT_NEAR(static_cast(float16(0.66667f) - float16(0.33333f)), + 0.33334f, 0.001); + EXPECT_NEAR(static_cast(float16(3.3f) * float16(2.0f)), 6.6f, 0.01); + EXPECT_NEAR(static_cast(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01); + EXPECT_NEAR(static_cast(float16(2.0f) / float16(3.0f)), 0.66667f, + 0.001); + EXPECT_EQ(static_cast(float16(1.0f) / float16(2.0f)), 0.5f); + EXPECT_EQ(static_cast(-float16(512.0f)), -512.0f); + EXPECT_EQ(static_cast(-float16(-512.0f)), 512.0f); } TEST(float16, comparison_cpu) { diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index 567209df4edc483bcb5c6264c62034ddff50c413..577fc24ceb1d3c83cc0546dc5db9c8c7c1f01f86 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -36,19 +36,19 @@ limitations under the License. */ half *in1, *in2, *out; \ half *d_in1, *d_in2, *d_out; \ int size = sizeof(half); \ - cudaMalloc((void**)&d_in1, size); \ - cudaMalloc((void**)&d_in2, size); \ - cudaMalloc((void**)&d_out, size); \ - in1 = (half*)malloc(size); \ - in2 = (half*)malloc(size); \ - out = (half*)malloc(size); \ + cudaMalloc(reinterpret_cast(&d_in1), size); \ + cudaMalloc(reinterpret_cast(&d_in2), size); \ + cudaMalloc(reinterpret_cast(&d_out), size); \ + in1 = reinterpret_cast(malloc(size)); \ + in2 = reinterpret_cast(malloc(size)); \ + out = reinterpret_cast(malloc(size)); \ in1[0] = half(float16(v_in1)); \ in2[0] = half(float16(v_in2)); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \ - EXPECT_EQ(float(float16(out[0])), v_out); \ + EXPECT_EQ(static_cast(float16(out[0])), v_out); \ free(in1); \ free(in2); \ free(out); \ @@ -63,17 +63,17 @@ limitations under the License. */ half *in1, *in2; \ half *d_in1, *d_in2; \ int size = sizeof(half); \ - cudaMalloc((void**)&d_in1, size); \ - cudaMalloc((void**)&d_in2, size); \ - in1 = (half*)malloc(size); \ - in2 = (half*)malloc(size); \ + cudaMalloc(reinterpret_cast(&d_in1), size); \ + cudaMalloc(reinterpret_cast(&d_in2), size); \ + in1 = reinterpret_cast(malloc(size)); \ + in2 = reinterpret_cast(malloc(size)); \ in1[0] = half(float16(v_in1)); \ in2[0] = half(float16(v_in2)); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ op_type<<<1, 1>>>(d_in1, d_in2); \ cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \ - EXPECT_EQ(float(float16(in1[0])), v_out); \ + EXPECT_EQ(static_cast(float16(in1[0])), v_out); \ free(in1); \ free(in2); \ cudaFree(d_in1); \ @@ -87,12 +87,12 @@ limitations under the License. */ half *d_in1, *d_in2; \ bool *out, *d_out; \ int size = sizeof(half); \ - cudaMalloc((void**)&d_in1, size); \ - cudaMalloc((void**)&d_in2, size); \ - cudaMalloc((void**)&d_out, 1); \ - in1 = (half*)malloc(size); \ - in2 = (half*)malloc(size); \ - out = (bool*)malloc(1); \ + cudaMalloc(reinterpret_cast(&d_in1), size); \ + cudaMalloc(reinterpret_cast(&d_in2), size); \ + cudaMalloc(reinterpret_cast(&d_out), 1); \ + in1 = reinterpret_cast(malloc(size)); \ + in2 = reinterpret_cast(malloc(size)); \ + out = reinterpret_cast(malloc(1)); \ in1[0] = half(float16(v_in1)); \ in2[0] = half(float16(v_in2)); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ @@ -130,13 +130,13 @@ void TestNeg(float v_in, float v_out) { LOG(INFO) << "Test Neg on GPU!"; half *in, *d_in; int size = sizeof(half); - cudaMalloc((void**)&d_in, size); - in = (half*)malloc(size); + cudaMalloc(reinterpret_cast(&d_in), size); + in = reinterpret_cast(malloc(size)); in[0] = half(float16(v_in)); cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice); Neg<<<1, 1>>>(d_in); cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost); - EXPECT_EQ(float(float16(in[0])), v_out); + EXPECT_EQ(static_cast(float16(in[0])), v_out); free(in); cudaFree(d_in); } diff --git a/paddle/fluid/pybind/.gitignore b/paddle/fluid/pybind/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..8f222791edb016df65be5db75831f5f83cf63726 --- /dev/null +++ b/paddle/fluid/pybind/.gitignore @@ -0,0 +1 @@ +pybind.h diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 748ad75a99ea4955730327a10ae8468a107fed0a..bd8446df6650f5fb1c62e5370fd48216dbf31e17 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -544,13 +544,20 @@ All parameter, weight, gradient are variables in Paddle. [](ParallelExecutor &self, size_t num_threads, bool use_event, const std::vector &places, const std::unordered_set ¶ms, - const ProgramDesc &startup_program, + const std::unordered_set &bcast_vars, const ProgramDesc &main_program, const std::string &loss_var_name, - Scope *scope, bool allow_op_delay) { - new (&self) ParallelExecutor(num_threads, use_event, places, - params, startup_program, main_program, - loss_var_name, scope, allow_op_delay); + Scope *scope, std::vector &local_scopes, + bool allow_op_delay) { + new (&self) + ParallelExecutor(num_threads, use_event, places, params, + bcast_vars, main_program, loss_var_name, + scope, local_scopes, allow_op_delay); }) + .def("local_scopes", + [](ParallelExecutor &self) -> std::vector * { + return &self.GetLocalScopes(); + }, + py::return_value_policy::reference) .def("run", &ParallelExecutor::Run); BindRecordIOWriter(&m); diff --git a/python/.gitignore b/python/.gitignore index 1ba1d4c9b0301ed920f5303089e75dd3a8e4e3fa..53a2b7a76b0dd2d9095f9582540e455e2c1174e2 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -1,6 +1,7 @@ *pyc build dist +paddlepaddle.egg-info paddle.egg-info paddlepaddle_gpu.egg-info .idea diff --git a/python/paddle/.gitignore b/python/paddle/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..98527864664d32f798edc06a53131e8d5a068295 --- /dev/null +++ b/python/paddle/.gitignore @@ -0,0 +1 @@ +version.py diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index fbe4531f056a583d2b4d855d32fe8e04da94fa3c..33cf6918178ff746a6b130af0e23a69de0f532fe 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -659,7 +659,7 @@ class Block(object): def __init__(self, program, idx): self.desc = program.desc.block(idx) self.vars = dict() # var_name --> var - self.ops = collections.deque() # operator list + self.ops = list() # operator list self.program = program self.removed_vars = dict() @@ -831,6 +831,13 @@ class Block(object): self.ops.append(op) return op + def insert_op(self, index, *args, **kwargs): + self.sync_with_cpp() + op_desc = self.desc.insert_op(index) + op = Operator(block=self, desc=op_desc, *args, **kwargs) + self.ops.insert(index, op) + return op + def delete_ops(self, ops): # remove from cpp # FIXME(typhoonzero): remove only the first occurrence. @@ -842,12 +849,12 @@ class Block(object): self.desc.remove_op(start, end + 1) def slice_ops(self, start, end): - return list(self.ops)[start:end] + return self.ops[start:end] def prepend_op(self, *args, **kwargs): op_desc = self.desc.prepend_op() op = Operator(self, op_desc, *args, **kwargs) - self.ops.appendleft(op) + self.ops.insert(0, op) return op def sync_with_cpp(self): @@ -892,7 +899,7 @@ class Block(object): for index in range((start_index - 1 - 1), -1, -1): op_desc = ops_in_cpp[index] op = Operator(self, op_desc) - self.ops.appendleft(op) + self.ops.insert(0, op) # sync ops append to the end of cpp_ops for index in range((end_index + 1), len(ops_in_cpp)): diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 1b3ba414ecb50cc4d75dcaecd1f31265334c9aec..b93f2f974ca28cfd8d03c0dbbf1d401620a15e53 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -22,10 +22,49 @@ __all__ = ['ParallelExecutor'] class ParallelExecutor(object): def __init__(self, - loss_name, use_cuda, + loss_name=None, + main_program=None, num_threads=None, - allow_op_delay=False): + allow_op_delay=False, + share_vars_from=None): + """ + ParallelExecutor can run program in parallel. + + Args: + use_cuda(bool): Whether to use CUDA or not. + loss_name(str, default None): The loss name must set in training. + main_program(Program, default None): The program that need to run, + if not provided, then default_main_program will be used. + num_threads(int, default None): How many threads are used for + training. + allow_op_delay(bool, default False): Whether to delay and buffer + some operators together for scheduling or not, which may + improve performance in some cases, defalut False. + share_vars_from(ParallelExecutor, default None): If provied, + it will share variables from the specified ParallelExecutor. + + Returns: + A ParallelExecutor object. + + Raises: + TypeError: If share_vars_from is provided, but not ParallelExecutor + object. + + Examples: + .. code-block:: python + + train_exe = fluid.ParallelExecutor( + use_cuda=True, loss_name=loss.name) + test_exe = fluid.ParallelExecutor( + use_cuda=True, + main_program=test_program, + share_vars_from=train_exe) + + train_loss, = train_exe.run([loss.name], feed_dict=feed_dict) + test_loss, = test_exe.run([loss.name], feed_dict=feed_dict) + """ + self._places = [] self._act_places = [] if use_cuda: @@ -50,10 +89,21 @@ class ParallelExecutor(object): else: min(len(self._places) * 2, multiprocessing.cpu_count()) - startup = framework.default_startup_program() - main = framework.default_main_program() + main = main_program + main = main if main else framework.default_main_program() scope = executor.global_scope() + if share_vars_from and not isinstance(share_vars_from, + ParallelExecutor): + raise TypeError("share_vars_from must be ParallelExecutor.") + local_scopes = share_vars_from.executor.local_scopes( + ) if share_vars_from else [] + + persistable_vars = [ + v.name + for v in filter(lambda var: var.persistable, main.list_vars()) + ] + self.executor = core.ParallelExecutor( num_threads, True if use_cuda else False, # use_event @@ -62,10 +112,11 @@ class ParallelExecutor(object): p.name for p in main.global_block().iter_parameters() if not p.stop_gradient ]), - startup.desc, + set(persistable_vars), main.desc, - loss_name, + loss_name if loss_name else '', scope, + local_scopes, allow_op_delay) self.scope = scope diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 0f90e0e4df5da93f427b892d1be69f14625d2e29..8401716db88ef3dda68644a052d78b4476c9fdc7 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -207,7 +207,11 @@ class TestParallelExecutorBase(unittest.TestCase): if memory_opt: fluid.memory_optimize(main) - exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True) + place = fluid.CUDAPlace(0) + startup_exe = fluid.Executor(place) + startup_exe.run(startup) + + exe = fluid.ParallelExecutor(True, loss_name=loss.name) if batch_size is not None: batch_size *= fluid.core.get_cuda_device_count() begin = time.time() @@ -453,3 +457,41 @@ class TestTransformer(TestParallelExecutorBase): @unittest.skip("transformer is buggy in multi gpu") def test_main(self): self.check_network_convergence(transformer) + + +class ParallelExecutorTestingDuringTraining(unittest.TestCase): + def test_parallel_testing(self): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = simple_fc_net(True) + test_program = main.clone(for_test=True) + + opt = fluid.optimizer.SGD(learning_rate=0.0001) + opt.minimize(loss) + + batch_size = 32 + image = numpy.random.normal(size=(batch_size, + 784)).astype('float32') + label = numpy.random.randint(0, 10, (batch_size, 1), dtype="int64") + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup) + feed_dict = {'image': image, 'label': label} + + train_exe = fluid.ParallelExecutor( + use_cuda=True, loss_name=loss.name, main_program=main) + + test_exe = fluid.ParallelExecutor( + use_cuda=True, + main_program=test_program, + share_vars_from=train_exe) + + for i in xrange(5): + test_loss, = test_exe.run([loss.name], feed_dict=feed_dict) + test_loss = numpy.array(test_loss) + + train_loss, = train_exe.run([loss.name], feed_dict=feed_dict) + train_loss = numpy.array(train_loss) + self.assertTrue(numpy.allclose(train_loss, test_loss))