diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/machine_translation.py index cc31d098328bc237c018ebf8f158bdab5c37bff1..d7a421c10979c3b9d6865a8c0b99a6410e0f46a8 100644 --- a/benchmark/fluid/machine_translation.py +++ b/benchmark/fluid/machine_translation.py @@ -48,6 +48,13 @@ parser.add_argument( type=int, default=16, help="The sequence number of a mini-batch data. (default: %(default)d)") +parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test') +parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') parser.add_argument( "--dict_size", type=int, @@ -72,16 +79,21 @@ parser.add_argument( default=3, help="The width for beam searching. (default: %(default)d)") parser.add_argument( - "--use_gpu", - type=distutils.util.strtobool, - default=True, - help="Whether to use gpu. (default: %(default)d)") + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help="The device type.") parser.add_argument( "--max_length", type=int, default=250, help="The maximum length of sequence when doing generation. 
" "(default: %(default)d)") +parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): @@ -281,7 +293,7 @@ def train(): paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), batch_size=args.batch_size) - place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace() + place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) exe = Executor(place) exe.run(framework.default_startup_program()) @@ -307,14 +319,20 @@ def train(): return total_loss / count + iters, num_samples, start_time = 0, 0, time.time() for pass_id in xrange(args.pass_num): - pass_start_time = time.time() - words_seen = 0 + train_accs = [] + train_losses = [] for batch_id, data in enumerate(train_batch_generator()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place) - words_seen += word_num + num_samples += word_num trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place) - words_seen += word_num + num_samples += word_num lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place) fetch_outs = exe.run(framework.default_main_program(), @@ -325,24 +343,36 @@ def train(): }, fetch_list=[avg_cost]) - avg_cost_val = np.array(fetch_outs[0]) - print('pass_id=%d, batch_id=%d, train_loss: %f' % - (pass_id, batch_id, avg_cost_val)) + iters += 1 + loss = np.array(fetch_outs[0]) + print( + "Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss) + ) # The accuracy is the accumulation of batches, but not the current batch. 
- pass_end_time = time.time() - test_loss = do_validation() - time_consumed = pass_end_time - pass_start_time - words_per_sec = words_seen / time_consumed - print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" % - (pass_id, test_loss, words_per_sec, time_consumed)) + train_elapsed = time.time() - start_time + examples_per_sec = num_samples / train_elapsed + print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % + (num_samples, train_elapsed, examples_per_sec)) + # evaluation + if args.with_test: + test_loss = do_validation() + exit(0) def infer(): pass +def print_arguments(args): + print('----------- seq2seq Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + if __name__ == '__main__': args = parser.parse_args() + print_arguments(args) if args.infer_only: infer() else: diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py index 7f7afaeb11447d936b65a1d83701b0176ecbc111..43866da9cb113e9d49fc1c51f67da94cbc6bfd8e 100644 --- a/benchmark/fluid/mnist.py +++ b/benchmark/fluid/mnist.py @@ -35,6 +35,12 @@ def parse_args(): parser = argparse.ArgumentParser("mnist model benchmark.") parser.add_argument( '--batch_size', type=int, default=128, help='The minibatch size.') + parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) parser.add_argument( '--iterations', type=int, default=35, help='The number of minibatches.') parser.add_argument( @@ -53,19 +59,14 @@ def parse_args(): '--use_nvprof', action='store_true', help='If set, use nvprof for CUDA.') + parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') args = parser.parse_args() return args -def print_arguments(args): - vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and - vars(args)['device'] == 'GPU') - 
print('----------- Configuration Arguments -----------') - for arg, value in sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - def cnn_model(data): conv_pool_1 = fluid.nets.simple_img_conv_pool( input=data, @@ -161,16 +162,22 @@ def run_benchmark(model, args): paddle.dataset.mnist.train(), batch_size=args.batch_size) accuracy = fluid.average.WeightedAverage() + iters, num_samples, start_time = 0, 0, time.time() for pass_id in range(args.pass_num): accuracy.reset() - pass_start = time.time() + train_accs = [] + train_losses = [] for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break img_data = np.array( map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE) y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = y_data.reshape([len(y_data), 1]) - start = time.time() outs = exe.run( fluid.default_main_program(), feed={"pixel": img_data, @@ -178,21 +185,36 @@ def run_benchmark(model, args): fetch_list=[avg_cost, batch_acc, batch_size_tensor] ) # The accuracy is the accumulation of batches, but not the current batch. 
accuracy.add(value=outs[1], weight=outs[2]) - end = time.time() + iters += 1 + num_samples += len(y_data) loss = np.array(outs[0]) acc = np.array(outs[1]) - print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" % - (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000)) + train_losses.append(loss) + train_accs.append(acc) + print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" % + (pass_id, iters, loss, acc)) + + print("Pass: %d, Loss: %f, Train Accuray: %f\n" % + (pass_id, np.mean(train_losses), np.mean(train_accs))) + train_elapsed = time.time() - start_time + examples_per_sec = num_samples / train_elapsed - pass_end = time.time() + print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % + (num_samples, train_elapsed, examples_per_sec)) + # evaluation + if args.with_test: + test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor, + inference_program) + exit(0) - train_avg_acc = accuracy.eval() - test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor, - inference_program) - print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" % - (pass_id, train_avg_acc, test_avg_acc, - (pass_end - pass_start) / 1000)) +def print_arguments(args): + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') + print('----------- mnist Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') if __name__ == '__main__': diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py index f0f1db979fa7fb640679beacafd66dfbe1f62ab8..1af5eaf6b46be47cb6b778cedcf53830c201ef39 100644 --- a/benchmark/fluid/resnet.py +++ b/benchmark/fluid/resnet.py @@ -87,15 +87,6 @@ def parse_args(): return args -def print_arguments(args): - vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and - vars(args)['device'] == 'GPU') - print('----------- Configuration Arguments -----------') - for arg, value in 
sorted(vars(args).iteritems()): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): conv1 = fluid.layers.conv2d( input=input, @@ -279,32 +270,31 @@ def run_benchmark(model, args): 'label': label}, fetch_list=[avg_cost, batch_acc, batch_size_tensor]) iters += 1 - num_samples += label[0] + num_samples += len(label) accuracy.add(value=acc, weight=weight) train_losses.append(loss) train_accs.append(acc) print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" % (pass_id, iters, loss, acc)) - pass_train_acc = accuracy.eval() - # evaluation - if args.with_test: - pass_test_acc = test(exe) - train_elapsed = time.time() - start_time print("Pass: %d, Loss: %f, Train Accuray: %f\n" % (pass_id, np.mean(train_losses), np.mean(train_accs))) - + train_elapsed = time.time() - start_time examples_per_sec = num_samples / train_elapsed - print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % (num_samples, train_elapsed, examples_per_sec)) + # evaluation + if args.with_test: + pass_test_acc = test(exe) + exit(0) - if args.use_cprof: - pr.disable() - s = StringIO.StringIO() - sortby = 'cumulative' - ps = pstats.Stats(pr, stream=s).sort_stats(sortby) - ps.print_stats() - print(s.getvalue()) + +def print_arguments(args): + vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and + vars(args)['device'] == 'GPU') + print('----------- resnet Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') if __name__ == '__main__': diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh index 663e2efd5392a6cd1a71f51fa0d017070b489341..f6dfd20bf2ee0b668b6d4238d4511253b2233035 100644 --- a/benchmark/fluid/run.sh +++ b/benchmark/fluid/run.sh @@ -1,7 +1,9 @@ #!/bin/bash # This script benchmarking the PaddlePaddle Fluid on # single thread single 
GPU. -export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib + +#export FLAGS_fraction_of_gpu_memory_to_use=0.0 +export CUDNN_PATH=/paddle/cudnn_v5 # disable openmp and mkl parallel #https://github.com/PaddlePaddle/Paddle/issues/7199 @@ -25,25 +27,79 @@ export CUDA_VISIBLE_DEVICES=0 export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH +# only query the gpu used +nohup stdbuf -oL nvidia-smi \ + --id=${CUDA_VISIBLE_DEVICES} \ + --query-gpu=timestamp \ + --query-compute-apps=pid,process_name,used_memory \ + --format=csv \ + --filename=mem.log \ + -l 1 & +# mnist +# mnist gpu mnist 128 +FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \ + --device=GPU \ + --batch_size=128 \ + --skip_batch_num=5 \ + --iterations=500 \ + 2>&1 | tee -a mnist_gpu_128.log # vgg16 -# cifar10 gpu cifar10 128 -FLAGS_benchmark=true python fluid/vgg.py \ +# gpu cifar10 128 +FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \ --device=GPU \ --batch_size=128 \ --skip_batch_num=5 \ - --iterations=30 \ - 2>&1 > vgg16_gpu_128.log + --iterations=30 \ + 2>&1 | tee -a vgg16_gpu_128.log + +# flowers gpu 128 +FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \ + --device=GPU \ + --batch_size=32 \ + --data_set=flowers \ + --skip_batch_num=5 \ + --iterations=30 \ + 2>&1 | tee -a vgg16_gpu_flowers_32.log # resnet50 # resnet50 gpu cifar10 128 -FLAGS_benchmark=true python fluid/resnet.py \ +FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \ --device=GPU \ --batch_size=128 \ --data_set=cifar10 \ --model=resnet_cifar10 \ --skip_batch_num=5 \ --iterations=30 \ - 2>&1 > resnet50_gpu_128.log + 2>&1 | tee -a resnet50_gpu_128.log + +# resnet50 gpu flowers 64 +FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \ + --device=GPU \ + --batch_size=64 \ + --data_set=flowers \ + --model=resnet_imagenet \ + --skip_batch_num=5 \ + --iterations=30 \ + 2>&1 | tee -a resnet50_gpu_flowers_64.log # lstm +# lstm gpu imdb 32 # tensorflow only support batch=32 
+FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \ + --device=GPU \ + --batch_size=32 \ + --skip_batch_num=5 \ + --iterations=30 \ + --hidden_dim=512 \ + --emb_dim=512 \ + --crop_size=1500 \ + 2>&1 | tee -a lstm_gpu_32.log + +# seq2seq +# seq2seq gpu wmb 128 +FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \ + --device=GPU \ + --batch_size=128 \ + --skip_batch_num=5 \ + --iterations=30 \ + 2>&1 | tee -a lstm_gpu_128.log diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/stacked_dynamic_lstm.py index 4e063549e0239abf9d946ed8735f0306203509d0..5fcbdd64af9dc196c9d5b2b82ce4213478ea1418 100644 --- a/benchmark/fluid/stacked_dynamic_lstm.py +++ b/benchmark/fluid/stacked_dynamic_lstm.py @@ -37,6 +37,14 @@ def parse_args(): type=int, default=32, help='The sequence number of a batch data. (default: %(default)d)') + parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) + parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') parser.add_argument( '--emb_dim', type=int, @@ -64,6 +72,10 @@ def parse_args(): default=int(os.environ.get('CROP_SIZE', '1500')), help='The max sentence length of input. 
Since this model use plain RNN,' ' Gradient could be explored if sentence is too long') + parser.add_argument( + '--with_test', + action='store_true', + help='If set, test the testset during training.') args = parser.parse_args() return args @@ -157,37 +169,43 @@ def main(): exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - def train_loop(pass_num, crop_size): - with profiler.profiler(args.device, 'total') as prof: - for pass_id in range(pass_num): - train_reader = batch( - paddle.reader.shuffle( - crop_sentence(imdb.train(word_dict), crop_size), - buf_size=25000), - batch_size=args.batch_size) - word_nums = 0 - pass_start_time = time.time() - for batch_id, data in enumerate(train_reader()): - tensor_words = to_lodtensor([x[0] for x in data], place) - for x in data: - word_nums += len(x[0]) - label = numpy.array([x[1] for x in data]).astype("int64") - label = label.reshape((-1, 1)) - loss_np, acc, weight = exe.run( - fluid.default_main_program(), - feed={"words": tensor_words, - "label": label}, - fetch_list=[loss, batch_acc, batch_size_tensor]) - print("pass_id=%d, batch_id=%d, loss=%f, acc=%f" % - (pass_id, batch_id, loss_np, acc)) - - pass_end_time = time.time() - time_consumed = pass_end_time - pass_start_time - words_per_sec = word_nums / time_consumed - print("pass_id=%d, sec/pass: %f, words/s: %f" % - (pass_id, time_consumed, words_per_sec)) - - train_loop(args.pass_num, args.crop_size) + train_reader = batch( + paddle.reader.shuffle( + crop_sentence(imdb.train(word_dict), args.crop_size), + buf_size=25000), + batch_size=args.batch_size) + + iters, num_samples, start_time = 0, 0, time.time() + for pass_id in range(args.pass_num): + train_accs = [] + train_losses = [] + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + tensor_words = to_lodtensor([x[0] for x in data], place) + label = numpy.array([x[1] for x in 
data]).astype("int64") + label = label.reshape((-1, 1)) + loss_np, acc, weight = exe.run( + fluid.default_main_program(), + feed={"words": tensor_words, + "label": label}, + fetch_list=[loss, batch_acc, batch_size_tensor]) + iters += 1 + for x in data: + num_samples += len(x[0]) + print( + "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" % + (pass_id, iters, loss_np, acc) + ) # The accuracy is the accumulation of batches, but not the current batch. + + train_elapsed = time.time() - start_time + examples_per_sec = num_samples / train_elapsed + print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % + (num_samples, train_elapsed, examples_per_sec)) + exit(0) def to_lodtensor(data, place): @@ -205,5 +223,14 @@ def to_lodtensor(data, place): return res +def print_arguments(args): + print('----------- lstm Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + if __name__ == '__main__': + args = parse_args() + print_arguments(args) main() diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py index 3bf78e4cf08d43127a05c740fa30ca6d2bc416b0..9d990eff62ec368dc7033f55cc0862fa974a64e0 100644 --- a/benchmark/fluid/vgg.py +++ b/benchmark/fluid/vgg.py @@ -191,25 +191,29 @@ def main(): fetch_list=[avg_cost, batch_acc, batch_size_tensor]) accuracy.add(value=acc, weight=weight) iters += 1 - num_samples += len(data) + num_samples += len(y_data) print( "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" % (pass_id, iters, loss, acc) ) # The accuracy is the accumulation of batches, but not the current batch. 
- pass_train_acc = accuracy.eval() + # pass_train_acc = accuracy.eval() train_losses.append(loss) train_accs.append(acc) + print("Pass: %d, Loss: %f, Train Accuray: %f\n" % + (pass_id, np.mean(train_losses), np.mean(train_accs))) + train_elapsed = time.time() - start_time + examples_per_sec = num_samples / train_elapsed + print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' % + (num_samples, train_elapsed, examples_per_sec)) # evaluation if args.with_test: pass_test_acc = test(exe) - train_elapsed = time.time() - start_time - print("Pass: %d, Loss: %f, Train Accuray: %f\n" % - (pass_id, np.mean(train_losses), np.mean(train_accs))) + exit(0) def print_arguments(): - print('----------- Configuration Arguments -----------') + print('----------- vgg Configuration Arguments -----------') for arg, value in sorted(vars(args).iteritems()): print('%s: %s' % (arg, value)) print('------------------------------------------------') diff --git a/benchmark/tensorflow/machine_translation.py b/benchmark/tensorflow/machine_translation.py new file mode 100644 index 0000000000000000000000000000000000000000..8f77dce98353af53803246be8dc61063836b7867 --- /dev/null +++ b/benchmark/tensorflow/machine_translation.py @@ -0,0 +1,626 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from tensorflow.python.framework import dtypes +from tensorflow.python.layers.core import Dense +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.framework import ops +from tensorflow.python.ops import rnn_cell_impl +from tensorflow.python.ops.rnn_cell_impl import RNNCell, BasicLSTMCell +from tensorflow.python.ops.rnn_cell_impl import LSTMStateTuple +from tensorflow.contrib.rnn.python.ops import core_rnn_cell +from tensorflow.python.ops import array_ops +from tensorflow.python.util import nest +import tensorflow.contrib.seq2seq as seq2seq +from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder +import numpy as np +import os +import argparse +import time + +import paddle.v2 as paddle + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--embedding_dim", + type=int, + default=512, + help="The dimension of embedding table. (default: %(default)d)") +parser.add_argument( + "--encoder_size", + type=int, + default=512, + help="The size of encoder bi-rnn unit. (default: %(default)d)") +parser.add_argument( + "--decoder_size", + type=int, + default=512, + help="The size of decoder rnn unit. (default: %(default)d)") +parser.add_argument( + "--batch_size", + type=int, + default=128, + help="The sequence number of a mini-batch data. (default: %(default)d)") +parser.add_argument( + "--dict_size", + type=int, + default=30000, + help="The dictionary capacity. Dictionaries of source sequence and " + "target dictionary have same capacity. (default: %(default)d)") +parser.add_argument( + "--max_time_steps", + type=int, + default=81, + help="Max number of time steps for sequence. (default: %(default)d)") +parser.add_argument( + "--pass_num", + type=int, + default=10, + help="The pass number to train. 
(default: %(default)d)") +parser.add_argument( + "--learning_rate", + type=float, + default=0.0002, + help="Learning rate used to train the model. (default: %(default)f)") +parser.add_argument( + "--infer_only", action='store_true', help="If set, run forward only.") +parser.add_argument( + "--beam_size", + type=int, + default=3, + help="The width for beam searching. (default: %(default)d)") +parser.add_argument( + "--max_generation_length", + type=int, + default=250, + help="The maximum length of sequence when doing generation. " + "(default: %(default)d)") +parser.add_argument( + "--save_freq", + type=int, + default=500, + help="Save model checkpoint every this interation. (default: %(default)d)") +parser.add_argument( + "--model_dir", + type=str, + default='./checkpoint', + help="Path to save model checkpoints. (default: %(default)d)") + +_Linear = core_rnn_cell._Linear # pylint: disable=invalid-name + +START_TOKEN_IDX = 0 +END_TOKEN_IDX = 1 + + +class LSTMCellWithSimpleAttention(RNNCell): + """Add attention mechanism to BasicLSTMCell. + This class is a wrapper based on tensorflow's `BasicLSTMCell`. + """ + + def __init__(self, + num_units, + encoder_vector, + encoder_proj, + source_sequence_length, + forget_bias=1.0, + state_is_tuple=True, + activation=None, + reuse=None): + super(LSTMCellWithSimpleAttention, self).__init__(_reuse=reuse) + if not state_is_tuple: + logging.warn("%s: Using a concatenated state is slower and will " + "soon be deprecated. 
Use state_is_tuple=True.", self) + self._num_units = num_units + # set padding part to 0 + self._encoder_vector = self._reset_padding(encoder_vector, + source_sequence_length) + self._encoder_proj = self._reset_padding(encoder_proj, + source_sequence_length) + self._forget_bias = forget_bias + self._state_is_tuple = state_is_tuple + self._activation = activation or math_ops.tanh + self._linear = None + + @property + def state_size(self): + return (LSTMStateTuple(self._num_units, self._num_units) \ + if self._state_is_tuple else 2 * self._num_units) + + @property + def output_size(self): + return self._num_units + + def zero_state(self, batch_size, dtype): + state_size = self.state_size + if hasattr(self, "_last_zero_state"): + (last_state_size, last_batch_size, last_dtype, + last_output) = getattr(self, "_last_zero_state") + if (last_batch_size == batch_size and last_dtype == dtype and + last_state_size == state_size): + return last_output + with ops.name_scope( + type(self).__name__ + "ZeroState", values=[batch_size]): + output = _zero_state_tensors(state_size, batch_size, dtype) + self._last_zero_state = (state_size, batch_size, dtype, output) + return output + + def call(self, inputs, state): + sigmoid = math_ops.sigmoid + # Parameters of gates are concatenated into one multiply for efficiency. 
+ if self._state_is_tuple: + c, h = state + else: + c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1) + + # get context from encoder outputs + context = self._simple_attention(self._encoder_vector, + self._encoder_proj, h) + + if self._linear is None: + self._linear = _Linear([inputs, context, h], 4 * self._num_units, + True) + # i = input_gate, j = new_input, f = forget_gate, o = output_gate + i, j, f, o = array_ops.split( + value=self._linear([inputs, context, h]), + num_or_size_splits=4, + axis=1) + + new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) * + self._activation(j)) + new_h = self._activation(new_c) * sigmoid(o) + + if self._state_is_tuple: + new_state = LSTMStateTuple(new_c, new_h) + else: + new_state = array_ops.concat([new_c, new_h], 1) + return new_h, new_state + + def _simple_attention(self, encoder_vec, encoder_proj, decoder_state): + """Implement the attention function. + The implementation has the same logic to the fluid decoder. + """ + decoder_state_proj = tf.contrib.layers.fully_connected( + inputs=decoder_state, + num_outputs=self._num_units, + activation_fn=None, + biases_initializer=None) + decoder_state_expand = tf.tile( + tf.expand_dims( + input=decoder_state_proj, axis=1), + [1, tf.shape(encoder_proj)[1], 1]) + concated = tf.concat([decoder_state_expand, encoder_proj], axis=2) + # need reduce the first dimension + attention_weights = tf.contrib.layers.fully_connected( + inputs=tf.reshape( + concated, shape=[-1, self._num_units * 2]), + num_outputs=1, + activation_fn=tf.nn.tanh, + biases_initializer=None) + attention_weights_reshaped = tf.reshape( + attention_weights, shape=[tf.shape(encoder_vec)[0], -1, 1]) + # normalize the attention weights using softmax + attention_weights_normed = tf.nn.softmax( + attention_weights_reshaped, dim=1) + scaled = tf.multiply(attention_weights_normed, encoder_vec) + context = tf.reduce_sum(scaled, axis=1) + return context + + def _reset_padding(self, + memory, + 
memory_sequence_length, + check_inner_dims_defined=True): + """Reset the padding part for encoder inputs. + This funtion comes from tensorflow's `_prepare_memory` function. + """ + memory = nest.map_structure( + lambda m: ops.convert_to_tensor(m, name="memory"), memory) + if memory_sequence_length is not None: + memory_sequence_length = ops.convert_to_tensor( + memory_sequence_length, name="memory_sequence_length") + if check_inner_dims_defined: + + def _check_dims(m): + if not m.get_shape()[2:].is_fully_defined(): + raise ValueError( + "Expected memory %s to have fully defined inner dims, " + "but saw shape: %s" % (m.name, m.get_shape())) + + nest.map_structure(_check_dims, memory) + if memory_sequence_length is None: + seq_len_mask = None + else: + seq_len_mask = array_ops.sequence_mask( + memory_sequence_length, + maxlen=array_ops.shape(nest.flatten(memory)[0])[1], + dtype=nest.flatten(memory)[0].dtype) + seq_len_batch_size = (memory_sequence_length.shape[0].value or + array_ops.shape(memory_sequence_length)[0]) + + def _maybe_mask(m, seq_len_mask): + rank = m.get_shape().ndims + rank = rank if rank is not None else array_ops.rank(m) + extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32) + m_batch_size = m.shape[0].value or array_ops.shape(m)[0] + if memory_sequence_length is not None: + message = ("memory_sequence_length and memory tensor " + "batch sizes do not match.") + with ops.control_dependencies([ + check_ops.assert_equal( + seq_len_batch_size, m_batch_size, message=message) + ]): + seq_len_mask = array_ops.reshape( + seq_len_mask, + array_ops.concat( + (array_ops.shape(seq_len_mask), extra_ones), 0)) + return m * seq_len_mask + else: + return m + + return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask), + memory) + + +def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, + target_dict_dim, is_generating, beam_size, + max_generation_length): + src_word_idx = tf.placeholder(tf.int32, shape=[None, None]) + 
src_sequence_length = tf.placeholder(tf.int32, shape=[None, ]) + + src_embedding_weights = tf.get_variable("source_word_embeddings", + [source_dict_dim, embedding_dim]) + src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx) + + src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size) + src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size) + # no peephole + encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn( + cell_fw=src_forward_cell, + cell_bw=src_reversed_cell, + inputs=src_embedding, + sequence_length=src_sequence_length, + dtype=tf.float32) + + # concat the forward outputs and backward outputs + encoded_vec = tf.concat(encoder_outputs, axis=2) + + # project the encoder outputs to size of decoder lstm + encoded_proj = tf.contrib.layers.fully_connected( + inputs=tf.reshape( + encoded_vec, shape=[-1, embedding_dim * 2]), + num_outputs=decoder_size, + activation_fn=None, + biases_initializer=None) + encoded_proj_reshape = tf.reshape( + encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size]) + + # get init state for decoder lstm's H + backword_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1]) + decoder_boot = tf.contrib.layers.fully_connected( + inputs=tf.reshape( + backword_first, shape=[-1, embedding_dim]), + num_outputs=decoder_size, + activation_fn=tf.nn.tanh, + biases_initializer=None) + + # prepare the initial state for decoder lstm + cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32) + initial_state = LSTMStateTuple(cell_init, decoder_boot) + + # create decoder lstm cell + decoder_cell = LSTMCellWithSimpleAttention( + decoder_size, + encoded_vec + if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size), + encoded_proj_reshape if not is_generating else + seq2seq.tile_batch(encoded_proj_reshape, beam_size), + src_sequence_length if not is_generating else + seq2seq.tile_batch(src_sequence_length, beam_size), + forget_bias=0.0) + + output_layer = Dense(target_dict_dim, 
name='output_projection') + + if not is_generating: + trg_word_idx = tf.placeholder(tf.int32, shape=[None, None]) + trg_sequence_length = tf.placeholder(tf.int32, shape=[None, ]) + trg_embedding_weights = tf.get_variable( + "target_word_embeddings", [target_dict_dim, embedding_dim]) + trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights, + trg_word_idx) + + training_helper = seq2seq.TrainingHelper( + inputs=trg_embedding, + sequence_length=trg_sequence_length, + time_major=False, + name='training_helper') + + training_decoder = seq2seq.BasicDecoder( + cell=decoder_cell, + helper=training_helper, + initial_state=initial_state, + output_layer=output_layer) + + # get the max length of target sequence + max_decoder_length = tf.reduce_max(trg_sequence_length) + + decoder_outputs_train, _, _ = seq2seq.dynamic_decode( + decoder=training_decoder, + output_time_major=False, + impute_finished=True, + maximum_iterations=max_decoder_length) + + decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output) + decoder_pred_train = tf.argmax( + decoder_logits_train, axis=-1, name='decoder_pred_train') + masks = tf.sequence_mask( + lengths=trg_sequence_length, + maxlen=max_decoder_length, + dtype=tf.float32, + name='masks') + + # place holder of label sequence + lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None]) + + # compute the loss + loss = seq2seq.sequence_loss( + logits=decoder_logits_train, + targets=lbl_word_idx, + weights=masks, + average_across_timesteps=True, + average_across_batch=True) + + # return feeding list and loss operator + return { + 'src_word_idx': src_word_idx, + 'src_sequence_length': src_sequence_length, + 'trg_word_idx': trg_word_idx, + 'trg_sequence_length': trg_sequence_length, + 'lbl_word_idx': lbl_word_idx + }, loss + else: + start_tokens = tf.ones([tf.shape(src_word_idx)[0], ], + tf.int32) * START_TOKEN_IDX + # share the same embedding weights with target word + trg_embedding_weights = tf.get_variable( + 
"target_word_embeddings", [target_dict_dim, embedding_dim]) + + inference_decoder = beam_search_decoder.BeamSearchDecoder( + cell=decoder_cell, + embedding=lambda tokens: tf.nn.embedding_lookup(trg_embedding_weights, tokens), + start_tokens=start_tokens, + end_token=END_TOKEN_IDX, + initial_state=tf.nn.rnn_cell.LSTMStateTuple( + tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size), + tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)), + beam_width=beam_size, + output_layer=output_layer) + + decoder_outputs_decode, _, _ = seq2seq.dynamic_decode( + decoder=inference_decoder, + output_time_major=False, + #impute_finished=True,# error occurs + maximum_iterations=max_generation_length) + + predicted_ids = decoder_outputs_decode.predicted_ids + + return { + 'src_word_idx': src_word_idx, + 'src_sequence_length': src_sequence_length + }, predicted_ids + + +def print_arguments(args): + print('----------- Configuration Arguments -----------') + for arg, value in vars(args).iteritems(): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +def padding_data(data, padding_size, value): + data = data + [value] * padding_size + return data[:padding_size] + + +def save(sess, path, var_list=None, global_step=None): + saver = tf.train.Saver(var_list) + save_path = saver.save(sess, save_path=path, global_step=global_step) + print('Model save at %s' % save_path) + + +def restore(sess, path, var_list=None): + # var_list = None returns the list of all saveable variables + saver = tf.train.Saver(var_list) + saver.restore(sess, save_path=path) + print('model restored from %s' % path) + + +def adapt_batch_data(data): + src_seq = map(lambda x: x[0], data) + trg_seq = map(lambda x: x[1], data) + lbl_seq = map(lambda x: x[2], data) + + src_sequence_length = np.array( + [len(seq) for seq in src_seq]).astype('int32') + src_seq_maxlen = np.max(src_sequence_length) + + trg_sequence_length = np.array( + [len(seq) for seq in 
def train():
    """Train the TensorFlow seq2seq benchmark model on WMT14.

    Builds the training graph with ``seq_to_seq_net``, optimises it with Adam
    on gradients clipped by global norm, and after every pass prints the test
    loss together with the training throughput (words/s) and the pass
    duration.  All settings are read from the module-level ``args``.
    """
    feeding_dict, loss = seq_to_seq_net(
        embedding_dim=args.embedding_dim,
        encoder_size=args.encoder_size,
        decoder_size=args.decoder_size,
        source_dict_dim=args.dict_size,
        target_dict_dim=args.dict_size,
        is_generating=False,
        beam_size=args.beam_size,
        max_generation_length=args.max_generation_length)

    global_step = tf.Variable(0, trainable=False, name='global_step')
    trainable_params = tf.trainable_variables()
    optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)

    gradients = tf.gradients(loss, trainable_params)
    # Feed the *clipped* gradients to the optimizer so that the global-norm
    # clipping actually takes effect.
    clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
    updates = optimizer.apply_gradients(
        zip(clipped_gradients, trainable_params), global_step=global_step)

    train_batch_generator = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
        batch_size=args.batch_size)

    test_batch_generator = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
        batch_size=args.batch_size)

    def make_feed(batch):
        """Map each placeholder of the graph to its array in ``batch``."""
        return {
            placeholder: batch[key]
            for key, placeholder in feeding_dict.items()
        }

    def do_validation(sess):
        """Return the mean loss over the test set (NaN if it is empty)."""
        total_loss = 0.0
        count = 0
        for data in test_batch_generator():
            outputs = sess.run(
                [loss], feed_dict=make_feed(adapt_batch_data(data)))
            total_loss += outputs[0]
            count += 1
        # Avoid a ZeroDivisionError when the reader yields no batch.
        return total_loss / count if count else float('nan')

    config = tf.ConfigProto(
        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        init_g = tf.global_variables_initializer()
        init_l = tf.local_variables_initializer()
        sess.run(init_l)
        sess.run(init_g)
        for pass_id in range(args.pass_num):
            pass_start_time = time.time()
            words_seen = 0
            for batch_id, data in enumerate(train_batch_generator()):
                adapted_batch_data = adapt_batch_data(data)
                # Throughput counts both source and target tokens.
                words_seen += np.sum(adapted_batch_data['src_sequence_length'])
                words_seen += np.sum(adapted_batch_data['trg_sequence_length'])
                outputs = sess.run(
                    [updates, loss], feed_dict=make_feed(adapted_batch_data))
                print("pass_id=%d, batch_id=%d, train_loss: %f" %
                      (pass_id, batch_id, outputs[1]))
            # Stop the clock before validation so it is not counted as
            # training time.
            pass_end_time = time.time()
            test_loss = do_validation(sess)
            time_consumed = pass_end_time - pass_start_time
            words_per_sec = words_seen / time_consumed
            print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
                  (pass_id, test_loss, words_per_sec, time_consumed))
if __name__ == '__main__':
    # ``args`` is deliberately a module-level global: train() and infer()
    # read their configuration from it.
    args = parser.parse_args()
    print_arguments(args)
    # Decoding only when --infer_only is given, training otherwise.
    entry_point = infer if args.infer_only else train
    entry_point()
def run_benchmark(args):
    """Train and evaluate the two-conv-layer MNIST network with TensorFlow.

    For every pass the function prints the per-batch loss, error and step
    time, then the running training accuracy, the test accuracy and the
    duration of the pass.  All reported times are in seconds.

    Args:
        args: parsed command-line arguments; ``device``, ``batch_size`` and
            ``pass_num`` are read.
    """

    def weight_variable(dtype, shape):
        initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
        return tf.Variable(initial)

    def bias_variable(dtype, shape):
        initial = tf.constant(0.1, shape=shape, dtype=dtype)
        return tf.Variable(initial)

    device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
    with tf.device(device):
        images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1))
        labels = tf.placeholder(tf.int64, shape=(None, ))

        # conv1, relu, pool1
        conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20])
        conv1_bias = bias_variable(DTYPE, [20])
        conv1 = tf.nn.conv2d(
            images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID")
        relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
        pool1 = tf.nn.max_pool(
            relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")

        # conv2, relu, pool2
        conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50])
        conv2_bias = bias_variable(DTYPE, [50])
        conv2 = tf.nn.conv2d(
            pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID")
        relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
        pool2 = tf.nn.max_pool(
            relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")

        # FC: flatten every non-batch dimension of pool2.
        pool_shape = pool2.get_shape().as_list()
        hidden_dim = 1
        for dim in pool_shape[1:]:
            hidden_dim *= dim
        reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim))
        fc_weights = weight_variable(DTYPE, [hidden_dim, 10])
        fc_bias = bias_variable(DTYPE, [10])
        logits = tf.matmul(reshape, fc_weights) + fc_bias

        # Get prediction
        prediction = tf.nn.softmax(logits)

        # Loss
        one_hot_labels = tf.one_hot(labels, depth=10)
        cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1])
        avg_cost = tf.reduce_mean(cost)

        # Get accuracy
        correct = tf.equal(tf.argmax(prediction, 1), labels)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

        # Running accuracy metric; its local variables live in a dedicated
        # scope so that they can be re-initialised (reset) on demand.
        with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
            g_accuracy = tf.metrics.accuracy(
                labels, tf.argmax(
                    prediction, axis=1))
            metric_vars = tf.contrib.framework.get_variables(
                scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
            g_accuracy_reset_op = tf.variables_initializer(metric_vars)

        # Optimizer
        opt = tf.train.AdamOptimizer(
            learning_rate=0.001, beta1=0.9, beta2=0.999)
        train_op = opt.minimize(avg_cost)

    train_reader = paddle.batch(
        paddle.dataset.mnist.train(), batch_size=args.batch_size)
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=args.batch_size)

    def make_feed(batch):
        """Convert a reader batch into the feed dict of the graph.

        Each image is reshaped to (1, 28, 28) and transposed to (28, 28, 1)
        to match the ``images`` placeholder.
        """
        images_data = np.array([
            np.transpose(sample[0].reshape([1, 28, 28]), axes=[1, 2, 0])
            for sample in batch
        ]).astype("float32")
        labels_data = np.array([sample[1] for sample in batch]).astype("int64")
        return {images: images_data, labels: labels_data}

    def eval_test():
        """Return the accumulated accuracy over the whole test set."""
        sess.run(g_accuracy_reset_op)
        g_acc = (0.0, 0.0)  # defined even if the reader yields nothing
        for data in test_reader():
            _, _, g_acc = sess.run(
                [avg_cost, accuracy, g_accuracy], feed_dict=make_feed(data))
        # Index 1 is the value produced by the metric's update op.
        return g_acc[1]

    config = tf.ConfigProto(
        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        init_g = tf.global_variables_initializer()
        init_l = tf.local_variables_initializer()
        sess.run(init_g)
        sess.run(init_l)
        for pass_id in range(args.pass_num):
            sess.run(g_accuracy_reset_op)
            g_acc = (0.0, 0.0)

            pass_start = time.time()
            for batch_id, data in enumerate(train_reader()):
                feed = make_feed(data)

                # Time only the session call, not the data conversion.
                start = time.time()
                _, loss, acc, g_acc = sess.run(
                    [train_op, avg_cost, accuracy, g_accuracy],
                    feed_dict=feed)
                end = time.time()

                # time.time() already returns seconds; report them as-is.
                print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
                      (pass_id, batch_id, loss, 1 - acc, end - start))

            pass_end = time.time()
            test_avg_acc = eval_test()

            print(
                "pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f"
                % (pass_id, g_acc[1], test_avg_acc, pass_end - pass_start))
def parse_args(argv=None):
    """Parse the command-line options of the ResNet benchmark.

    Args:
        argv: optional list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser('Convolution model benchmark.')
    parser.add_argument(
        '--model',
        type=str,
        choices=['resnet'],
        default='resnet',
        help='The model architecture.')
    parser.add_argument(
        '--batch_size', type=int, default=32, help='The minibatch size.')
    parser.add_argument(
        '--use_fake_data',
        action='store_true',
        help='use real data or fake data')
    parser.add_argument(
        '--skip_batch_num',
        type=int,
        default=5,
        help='The first num of minibatch num to skip, for better performance test'
    )
    parser.add_argument(
        '--iterations',
        type=int,
        default=105,
        help='The number of minibatches.')
    parser.add_argument(
        '--pass_num', type=int, default=300, help='The number of passes.')
    parser.add_argument(
        '--order',
        type=str,
        default='NHWC',
        choices=['NCHW', 'NHWC'],
        # Both orders are accepted; the script rejects NCHW on the CPU path.
        help='The data order; CPU mode only supports NHWC.')
    parser.add_argument(
        '--device',
        type=str,
        default='GPU',
        choices=['CPU', 'GPU'],
        help='The device type.')
    parser.add_argument(
        '--data',
        type=str,
        default='flowers102',
        choices=['flowers102', 'cifar10'],
        help='The kinds of data.')
    parser.add_argument(
        '--infer_only', action='store_true', help='If set, run forward only.')
    parser.add_argument(
        '--use_cprof', action='store_true', help='If set, use cProfile.')
    parser.add_argument(
        '--with_test',
        action='store_true',
        help='If set, test the testset during training.')
    parser.add_argument(
        '--use_nvprof',
        action='store_true',
        help='If set, use nvprof for CUDA.')
    return parser.parse_args(argv)
+ """ + pad_total = kernel_size - 1 + pad_beg = pad_total // 2 + pad_end = pad_total - pad_beg + + if data_format == 'channels_first': + padded_inputs = tf.pad(inputs, [[0, 0], [0, 0], [pad_beg, pad_end], + [pad_beg, pad_end]]) + else: + padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end], + [pad_beg, pad_end], [0, 0]]) + return padded_inputs + + +def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format): + """Strided 2-D convolution with explicit padding.""" + # The padding is consistent and is based only on `kernel_size`, not on the + # dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone). + # This is consistent with PaddlePaddle. + # In addition, the calculation for output size in TensorFlow can refer: + # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/common_shape_fns.cc + if strides > 1: + inputs = fixed_padding(inputs, kernel_size, data_format) + + return tf.layers.conv2d( + inputs=inputs, + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=('SAME' if strides == 1 else 'VALID'), + use_bias=False, + kernel_initializer=tf.variance_scaling_initializer(), + data_format=data_format) + + +def conv_bn(inputs, + filters, + kernel_size, + strides, + is_training, + data_format, + act=True): + # def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format): + # set fused=True for a significant performance boost. 
def bottleneck(inputs, filters, is_training, projection_shortcut, strides,
               data_format):
    """Bottleneck residual block: 1x1 -> 3x3 -> 1x1 (4x filters) + shortcut.

    Args:
        inputs: input tensor of the block.
        filters: number of filters of the first two convolutions; the last
            convolution outputs ``4 * filters`` channels.
        is_training: training flag forwarded to batch normalisation.
        projection_shortcut: callable applied to ``inputs`` to build the
            shortcut branch, or ``None`` to use ``inputs`` unchanged.
        strides: stride of the first convolution.
        data_format: 'channels_first' or 'channels_last'.

    Returns:
        The output tensor of the block after the final ReLU.
    """
    shortcut = inputs
    if projection_shortcut is not None:
        shortcut = projection_shortcut(inputs)
    inputs = conv_bn(inputs, filters, 1, strides, is_training, data_format)
    # The 3x3 stage keeps its ReLU: without it two conv+BN stages would be
    # stacked with no non-linearity in between.
    inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format)
    # No activation here; ReLU is applied after the residual addition.
    inputs = conv_bn(
        inputs, filters * 4, 1, 1, is_training, data_format, act=False)
    inputs = inputs + shortcut
    inputs = tf.nn.relu(inputs)
    return inputs
data_format) + + for _ in range(1, blocks): + inputs = block_fn(inputs, filters, is_training, None, 1, data_format) + + return tf.identity(inputs, name) + + +def resnet_imagenet(depth, class_dim, data_format): + """Returns the ResNet model for a given size and number of output classes.""" + + def resnet_generator(block_fn, + layers, + num_classes, + data_format='channels_last'): + if data_format is None: + data_format = ('channels_first' + if tf.test.is_built_with_cuda() else 'channels_last') + + def model(inputs, is_training): + """Constructs the ResNet model given the inputs.""" + if data_format == 'channels_first': + # Convert the inputs from channels_last (NHWC) to channels_first (NCHW). + # This provides a large performance boost on GPU. See + # https://www.tensorflow.org/performance/performance_guide#data_formats + inputs = tf.transpose(inputs, [0, 3, 1, 2]) + + inputs = conv_bn(inputs, 64, 7, 2, is_training, data_format) + inputs = tf.identity(inputs, 'initial_conv') + inputs = tf.layers.max_pooling2d( + inputs=inputs, + pool_size=3, + strides=2, + padding='SAME', + data_format=data_format) + inputs = tf.identity(inputs, 'initial_max_pool') + inputs = block_layer(inputs, 64, block_fn, layers[0], 1, + is_training, 'block_layer1', data_format) + inputs = block_layer(inputs, 128, block_fn, layers[1], 2, + is_training, 'block_layer2', data_format) + inputs = block_layer(inputs, 256, block_fn, layers[2], 2, + is_training, 'block_layer3', data_format) + inputs = block_layer(inputs, 512, block_fn, layers[3], 2, + is_training, 'block_layer4', data_format) + inputs = tf.layers.average_pooling2d( + inputs=inputs, + pool_size=7, + strides=1, + padding='VALID', + data_format=data_format) + inputs = tf.identity(inputs, 'final_avg_pool') + inputs = tf.reshape(inputs, + [-1, 512 if block_fn is basicblock else 2048]) + inputs = tf.layers.dense(inputs=inputs, units=num_classes) + inputs = tf.identity(inputs, 'final_dense') + return inputs + + return model + + model_params = 
def resnet_cifar10(depth, num_classes, data_format):
    """Return a model function building a CIFAR-10 ResNet of ``depth`` layers.

    Args:
        depth: network depth; must be of the form ``6n + 2``.
        num_classes: number of units of the final dense layer.
        data_format: 'channels_first', 'channels_last' or ``None`` to choose
            'channels_first' when TensorFlow is built with CUDA.

    Returns:
        A callable ``model(inputs, is_training)`` returning the logits.
        ``inputs`` is expected in NHWC layout, like the model built by
        ``resnet_imagenet``.

    Raises:
        ValueError: if ``depth`` is not of the form ``6n + 2``.
    """
    if depth % 6 != 2:
        raise ValueError('depth must be 6n + 2, got %r' % (depth, ))

    num_blocks = (depth - 2) // 6

    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')

    def model(inputs, is_training):
        """Constructs the CIFAR-10 ResNet given the inputs."""
        if data_format == 'channels_first':
            # The input is fed as NHWC; convert it to NCHW so the channel
            # axis is where the layers below expect it.
            inputs = tf.transpose(inputs, [0, 3, 1, 2])

        inputs = conv_bn(inputs, 16, 3, 1, is_training, data_format)
        inputs = tf.identity(inputs, 'initial_conv')
        inputs = block_layer(inputs, 16, basicblock, num_blocks, 1, is_training,
                             'block_layer1', data_format)
        inputs = block_layer(inputs, 32, basicblock, num_blocks, 2, is_training,
                             'block_layer2', data_format)
        inputs = block_layer(inputs, 64, basicblock, num_blocks, 2, is_training,
                             'block_layer3', data_format)
        inputs = tf.layers.average_pooling2d(
            inputs=inputs,
            pool_size=8,
            strides=1,
            padding='VALID',
            data_format=data_format)
        inputs = tf.identity(inputs, 'final_avg_pool')
        inputs = tf.reshape(inputs, [-1, 64])
        inputs = tf.layers.dense(inputs=inputs, units=num_classes)
        inputs = tf.identity(inputs, 'final_dense')
        return inputs

    return model
224, 224) + elif args.data == 'cifar10': + class_dim = 10 + dshape = (None, 32, 32, 3) + pdshape = (3, 32, 32) + + with tf.device(device): + images = tf.placeholder(DTYPE, shape=dshape) + labels = tf.placeholder(tf.int64, shape=(None, )) + is_training = tf.placeholder('bool') + onehot_labels = tf.one_hot(labels, depth=class_dim) + + network = resnet_cifar10( + 32, class_dim, + data_format) if args.data == 'cifar10' else resnet_imagenet( + 50, class_dim, data_format) + + logits = network(inputs=images, is_training=is_training) + + cross_entropy = tf.losses.softmax_cross_entropy( + logits=logits, onehot_labels=onehot_labels) + avg_cost = tf.reduce_mean(cross_entropy) + + correct = tf.equal(tf.argmax(logits, 1), labels) + accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) + + lr = 0.1 if args.data == 'cifar10' else 0.01 + optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9) + + # Batch norm requires update_ops to be added as a train_op dependency. + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with tf.control_dependencies(update_ops): + train_op = optimizer.minimize(avg_cost) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + if args.data == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data == 'cifar10' else paddle.dataset.flowers.test(), + batch_size=100) + + def test(): + test_accs = [] + for batch_id, data in enumerate(test_reader()): + test_images = np.array( + map(lambda x: np.transpose(x[0].reshape(pdshape), + axes=[1, 2, 0]), data)).astype("float32") + test_labels = np.array(map(lambda x: x[1], data)).astype('int64') + test_accs.append( + accuracy.eval(feed_dict={ + images: test_images, + labels: test_labels, + is_training: False + })) + print("Pass = %d, Train performance = %f imgs/s, Test accuracy = %f\n" % + (pass_id, num_samples / train_elapsed, 
if __name__ == '__main__':
    args = parse_args()
    print_arguments(args)
    # Honour --device: use the GPU only when it was requested AND this
    # TensorFlow build supports CUDA; otherwise run on the CPU.
    if args.device == 'GPU' and tf.test.is_built_with_cuda():
        device = '/device:GPU:0'
        if args.order == 'NHWC':
            data_format = 'channels_last'
        else:
            data_format = 'channels_first'
    else:
        device = '/cpu:0'
        if args.order == 'NHWC':
            data_format = 'channels_last'
        else:
            raise ValueError('Only support NHWC order in CPU mode')

    run_benchmark(args, data_format, device)
def dynamic_lstm_model(dict_size,
                       embedding_dim,
                       hidden_dim,
                       stacked_num,
                       class_num=2,
                       is_train=True):
    """Build a stacked dynamic-LSTM text classifier graph.

    Args:
        dict_size: vocabulary size (rows of the embedding table).
        embedding_dim: width of the word embeddings.
        hidden_dim: number of units of every LSTM layer.
        stacked_num: number of LSTM layers to stack.
        class_num: number of output classes.
        is_train: when False only the inference part of the graph is built.

    Returns:
        If ``is_train`` is False: ``((word_idx, sequence_length), softmax)``.
        Otherwise: ``((word_idx, sequence_length, label), avg_loss, acc,
        g_acc, reset_op)`` where ``g_acc`` is the ``tf.metrics.accuracy``
        pair and ``reset_op`` re-initialises its local variables.
    """
    word_idx = tf.placeholder(tf.int64, shape=[None, None])
    sequence_length = tf.placeholder(tf.int64, shape=[None, ])

    embedding_weights = tf.get_variable('word_embeddings',
                                        [dict_size, embedding_dim])
    embedding = tf.nn.embedding_lookup(embedding_weights, word_idx)

    # One cell object per layer: repeating a single instance would make all
    # layers refer to the same cell instead of stacking independent ones.
    lstm_cells = [
        tf.nn.rnn_cell.LSTMCell(num_units=hidden_dim, use_peepholes=False)
        for _ in range(stacked_num)
    ]
    stacked_cell = tf.nn.rnn_cell.MultiRNNCell(lstm_cells)

    # final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] total stacked_num LSTMTuples
    _, final_state = tf.nn.dynamic_rnn(
        cell=stacked_cell,
        inputs=embedding,
        dtype=tf.float32,
        sequence_length=sequence_length)

    w = tf.Variable(
        tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32)
    bias = tf.Variable(
        tf.constant(
            value=0.0, shape=[class_num], dtype=tf.float32))
    # Classify from the hidden state (h) of the top layer.
    prediction = tf.matmul(final_state[-1][1], w) + bias

    if not is_train:
        return (word_idx, sequence_length), tf.nn.softmax(prediction)

    label = tf.placeholder(tf.int64, shape=[None, ])
    # The one-hot depth must follow class_num so that it matches the width
    # of ``prediction``.
    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.one_hot(label, class_num), logits=prediction)
    avg_loss = tf.reduce_mean(loss)

    correct_count = tf.equal(tf.argmax(prediction, 1), label)
    acc = tf.reduce_mean(tf.cast(correct_count, tf.float32))

    with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
        g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1))
        metric_vars = tf.contrib.framework.get_variables(
            scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
        reset_op = tf.variables_initializer(metric_vars)

    return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op
enumerate(test_reader()): + word_idx = map(lambda x: x[0], data) + sequence_length = np.array( + [len(seq) for seq in word_idx]).astype('int64') + maxlen = np.max(sequence_length) + word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx] + word_idx = np.array(word_idx).astype('int64') + label = np.array(map(lambda x: x[1], data)).astype('int64') + + _, loss, fetch_acc, fetch_g_acc = sess.run( + [train_op, avg_loss, acc, g_acc], + feed_dict={ + feeding_list[0]: word_idx, + feeding_list[1]: sequence_length, + feeding_list[2]: label + }) + + return fetch_g_acc[1] + + config = tf.ConfigProto( + intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) + config.gpu_options.allow_growth = True + with tf.Session(config=config) as sess: + init_g = tf.global_variables_initializer() + init_l = tf.local_variables_initializer() + sess.run(init_l) + sess.run(init_g) + + for pass_id in xrange(args.pass_num): + # clear accuracy local variable + sess.run(reset_op) + pass_start_time = time.time() + words_seen = 0 + + for batch_id, data in enumerate(train_reader()): + word_idx = map(lambda x: x[0], data) + sequence_length = np.array( + [len(seq) for seq in word_idx]).astype('int64') + words_seen += np.sum(sequence_length) + maxlen = np.max(sequence_length) + word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx] + word_idx = np.array(word_idx).astype('int64') + label = np.array(map(lambda x: x[1], data)).astype('int64') + + _, loss, fetch_acc, fetch_g_acc = sess.run( + [train_op, avg_loss, acc, g_acc], + feed_dict={ + feeding_list[0]: word_idx, + feeding_list[1]: sequence_length, + feeding_list[2]: label + }) + + print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f" + % (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1])) + + pass_end_time = time.time() + time_consumed = pass_end_time - pass_start_time + words_per_sec = words_seen / time_consumed + test_acc = do_validation(sess) + print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" % + (pass_id, 
test_acc, words_per_sec, time_consumed)) + + +if __name__ == '__main__': + args = parse_args() + print_arguments(args) + + if args.infer_only: + pass + else: + train(args) diff --git a/benchmark/tensorflow/vgg.py b/benchmark/tensorflow/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..fba5ec71a46b3ac8b2e1244424c39fd5192e5458 --- /dev/null +++ b/benchmark/tensorflow/vgg.py @@ -0,0 +1,324 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VGG16 benchmark in TensorFlow""" +import tensorflow as tf +import paddle.v2 as paddle +import numpy as np +import argparse +import time + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + '--batch_size', type=int, default=128, help="Batch size for training.") +parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test') +parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') +parser.add_argument( + '--learning_rate', + type=float, + default=1e-3, + help="Learning rate for training.") +parser.add_argument('--num_passes', type=int, default=50, help="No. 
of passes.") +parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help="The device type.") +parser.add_argument( + '--data_format', + type=str, + default='NHWC', + choices=['NCHW', 'NHWC'], + help='The data order, NCHW=[batch, channels, height, width].' + 'Only support NHWC right now.') +parser.add_argument( + '--data_set', + type=str, + default='cifar10', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') +args = parser.parse_args() + + +class VGG16Model(object): + def __init__(self): + self.parameters = [] + + def batch_norm_relu(self, inputs, is_training): + """Performs a batch normalization followed by a ReLU.""" + # We set fused=True for a significant speed boost. See + # https://www.tensorflow.org/speed/speed_guide#common_fused_ops + inputs = tf.layers.batch_normalization( + inputs=inputs, + axis=1 if args.data_format == 'NCHW' else -1, + momentum=0.9, + epsilon=1e-05, + center=True, + scale=True, + training=is_training, + fused=True) + inputs = tf.nn.relu(inputs) + return inputs + + def conv_bn_layer(self, + name, + images, + kernel_shape, + is_training, + drop_rate=0.0): + with tf.name_scope(name) as scope: + kernel = tf.Variable( + tf.truncated_normal( + kernel_shape, dtype=tf.float32, stddev=1e-1), + name='weights') + conv = tf.nn.conv2d( + images, + kernel, [1, 1, 1, 1], + data_format=args.data_format, + padding='SAME') + biases = tf.Variable( + tf.constant( + 0.0, shape=[kernel_shape[-1]], dtype=tf.float32), + trainable=True, + name='biases') + out = tf.nn.bias_add(conv, biases) + out = self.batch_norm_relu(out, is_training) + out = tf.layers.dropout(out, rate=drop_rate, training=is_training) + return out + + def fc_layer(self, name, inputs, shape): + with tf.name_scope(name) as scope: + fc_w = tf.Variable( + tf.truncated_normal( + shape, dtype=tf.float32, stddev=1e-1), + name='weights') + fc_b = tf.Variable( + tf.constant( + 0.0, shape=[shape[-1]], dtype=tf.float32), + trainable=True, + 
name='biases') + out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b) + return out + + def network(self, images, class_dim, is_training): + """ VGG16 model structure. + + TODO(kuke): enable this network to support the 'NCHW' data format + """ + + # conv1 + conv1_1 = self.conv_bn_layer( + 'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3) + conv1_2 = self.conv_bn_layer( + 'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0) + # pool1 + pool1 = tf.nn.max_pool( + conv1_2, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool1') + # conv2 + conv2_1 = self.conv_bn_layer( + 'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4) + conv2_2 = self.conv_bn_layer( + 'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0) + # pool2 + pool2 = tf.nn.max_pool( + conv2_2, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool2') + # conv3 + conv3_1 = self.conv_bn_layer( + 'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4) + conv3_2 = self.conv_bn_layer( + 'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4) + conv3_3 = self.conv_bn_layer( + 'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0) + # pool3 + pool3 = tf.nn.max_pool( + conv3_3, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool3') + # conv4 + conv4_1 = self.conv_bn_layer( + 'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4) + conv4_2 = self.conv_bn_layer( + 'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4) + conv4_3 = self.conv_bn_layer( + 'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0) + # pool4 + pool4 = tf.nn.max_pool( + conv4_3, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool4') + # conv5 + conv5_1 = self.conv_bn_layer( + 'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4) + conv5_2 = self.conv_bn_layer( + 'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4) + conv5_3 = 
self.conv_bn_layer( + 'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0) + # pool5 + pool5 = tf.nn.max_pool( + conv5_3, + ksize=[1, 2, 2, 1], + strides=[1, 2, 2, 1], + padding='SAME', + name='pool4') + # flatten + shape = int(np.prod(pool5.get_shape()[1:])) + pool5_flat = tf.reshape(pool5, [-1, shape]) + # fc1 + drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training) + fc1 = self.fc_layer('fc1', drop, [shape, 512]) + # fc2 + bn = self.batch_norm_relu(fc1, is_training) + drop = tf.layers.dropout(bn, rate=0.5, training=is_training) + fc2 = self.fc_layer('fc2', drop, [512, 512]) + + fc3 = self.fc_layer('fc3', fc2, [512, class_dim]) + + return fc3 + + +def run_benchmark(): + """Run benchmark on cifar10 or flowers.""" + + if args.data_set == "cifar10": + class_dim = 10 + raw_shape = (3, 32, 32) + dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else ( + None, 3, 32, 32) + else: + class_dim = 102 + raw_shape = (3, 224, 224) + dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else ( + None, 3, 224, 224) + + device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0' + + with tf.device(device): + images = tf.placeholder(tf.float32, shape=dat_shape) + labels = tf.placeholder(tf.int64, shape=(None, )) + is_training = tf.placeholder('bool') + onehot_labels = tf.one_hot(labels, depth=class_dim) + + vgg16 = VGG16Model() + logits = vgg16.network(images, class_dim, is_training) + loss = tf.losses.softmax_cross_entropy( + onehot_labels=onehot_labels, logits=logits) + avg_loss = tf.reduce_mean(loss) + + correct = tf.equal(tf.argmax(logits, 1), labels) + accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) + + optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate) + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with tf.control_dependencies(update_ops): + train_op = optimizer.minimize(avg_loss) + + # data reader + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10() + 
if args.data_set == 'cifar10' else paddle.dataset.flowers.train(), + buf_size=5120), + batch_size=args.batch_size) + test_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.test10() + if args.data_set == 'cifar10' else paddle.dataset.flowers.test(), + buf_size=5120), + batch_size=args.batch_size) + + # test + def test(): + test_accs = [] + for batch_id, data in enumerate(test_reader()): + test_images = np.array( + map(lambda x: np.transpose(x[0].reshape(raw_shape), + axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32") + test_labels = np.array(map(lambda x: x[1], data)).astype('int64') + test_accs.append( + accuracy.eval(feed_dict={ + images: test_images, + labels: test_labels, + is_training: False + })) + return np.mean(test_accs) + + config = tf.ConfigProto( + intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) + config.gpu_options.allow_growth = True + + with tf.Session(config=config) as sess: + init_g = tf.global_variables_initializer() + init_l = tf.local_variables_initializer() + sess.run(init_g) + sess.run(init_l) + iters, num_samples, start_time = 0, 0, time.time() + for pass_id in range(args.num_passes): + # train + num_samples = 0 + start_time = time.time() + for batch_id, data in enumerate(train_reader()): + if iters == args.skip_batch_num: + start_time = time.time() + num_samples = 0 + if iters == args.iterations: + break + train_images = np.array( + map(lambda x: np.transpose(x[0].reshape(raw_shape), + axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32") + train_labels = np.array(map(lambda x: x[1], data)).astype( + 'int64') + _, loss, acc = sess.run([train_op, avg_loss, accuracy], + feed_dict={ + images: train_images, + labels: train_labels, + is_training: True + }) + iters += 1 + num_samples += len(data) + print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" % + (pass_id, iters, loss, acc)) + train_elapsed = time.time() - start_time + # test + pass_test_acc = 
test() + print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" % + (pass_id, num_samples / train_elapsed, pass_test_acc)) + + +def print_arguments(): + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).iteritems()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +if __name__ == '__main__': + print_arguments() + run_benchmark() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index e8bc285bdc95e213b9da2ee388078349a46d2798..c4c9f77df8d57fe162616d2250bd4dfe5b7754e7 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -244,11 +244,11 @@ function(cc_test TARGET_NAME) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) # Support linking flags: --whole-archive (Linux) / -force_load (MacOS) - target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) + target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog) if("${cc_test_DEPS}" MATCHES "ARCHIVE_START") list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END) endif() - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) + add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} ${cc_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) @@ -311,8 +311,8 @@ function(nv_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) - add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog) + 
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog) + add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog) add_test(${TARGET_NAME} ${TARGET_NAME}) endif() endfunction(nv_test) @@ -387,8 +387,8 @@ function(hip_test TARGET_NAME) endif() add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources}) set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) - target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) - add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags) + add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags) add_test(${TARGET_NAME} ${TARGET_NAME}) endif() endfunction(hip_test) diff --git a/doc/design/file_manager/README.md b/doc/design/file_manager/README.md deleted file mode 100644 index 3df10d801e568834729f902aace483d033340e2d..0000000000000000000000000000000000000000 --- a/doc/design/file_manager/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# FileManager设计文档 -## 目标 -在本文档中,我们设计说明了名为FileManager系统,方便用户上传自己的训练数据以进行分布式训练 - -主要功能包括: - -- 提供常用的命令行管理命令管理文件和目录 -- 支持大文件的断点上传、下载 - -## 名词解释 -- PFS:是`Paddlepaddle cloud File System`的缩写,是对用户文件存储空间的抽象,与之相对的是local filesystem。目前我们用CephFS来搭建。 -- [CephFS](http://docs.ceph.com/docs/master/cephfs/):一个POSIX兼容的文件系统。 -- Chunk:逻辑划上文件分块的单位。 - -## 模块 -### 架构图 - - -### PFSClient -- 功能: 详细设计[link](./pfs/pfsclient.md) - - 提供用户管理文件的命令 - - 需要可以跨平台执行 - -- 双向验证 - PFSClient需要和Ingress之间做双向验证[tls](#tls),所以用户需要首先在`cloud.paddlepaddle.org`上注册一下,申请用户空间,并且把系统生成的CA(certificate authority)、Key、CRT(CA signed certificate)下载到本地,然后才能使用PFSClient。 - -### [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) -- 功能: - 提供七层协议的反向代理、基于粘性会话的负载均衡功能。 - -- 透传用户身份的办法 - 
Ingress需要把PFSClient的身份信息传给PFSServer,配置的方法参考[link](http://www.integralist.co.uk/posts/clientcertauth.html#3) - -### PFSServer -PFSServer提供RESTful API接口,接收处理PFSClient端的文件管理请求,并且把结果返回PFSClient端。 - -RESTful API - -- /api/v1/files - - `GET /api/v1/files`: Get metadata of files or directories. - - `POST /api/v1/files`: Create files or directories. - - `PATCH /api/v1/files`: Update files or directories. - - `DELETE /api/v1/files`: Delete files or directories. - -- /api/v1/file/chunks - - `GET /api/v1/storage/file/chunks`: Get chunks's metadata of a file. - -- /api/v1/storage/files - - `GET /api/v1/storage/files`: Download files or directories. - - `POST /api/v1/storage/files`: Upload files or directories. - -- /api/v1/storage/file/chunks - - `GET /api/v1/storage/file/chunks`: Download chunks's data. - - `POST /api/v1/storage/file/chunks`: Upload chunks's data. - -## 文件传输优化 - -### 分块文件传输 -用户文件可能是比较大的,上传到Cloud或者下载到本地的时间可能比较长,而且在传输的过程中也可能出现网络不稳定的情况。为了应对以上的问题,我们提出了Chunk的概念,一个Chunk由所在的文件偏移、数据、数据长度及校验值组成。文件的上传和下载都是通过对Chunk的操作来实现的。由于Chunk比较小(默认256K),完成一个传输动作完成的时间也比较短,不容易出错。PFSClient需要在传输完毕最后一个Chunk的时候检查destination文件的MD5值是否和source文件一致。 - -一个典型的Chunk如下所示: - -``` -type Chunk struct { - fileOffset int64 - checksum uint32 - len uint32 - data []byte -} -``` - -### 生成sparse文件 -当destination文件不存在或者大小和source文件不一致时,可以用[Fallocate](https://Go.org/pkg/syscall/#Fallocate)生成sparse文件,然后就可以并发写入多个Chunk。 - -### 覆盖不一致的部分 -文件传输的的关键在于需要PFSClient端对比source和destination的文件Chunks的checksum是否保持一致,不一致的由PFSClient下载或者传输Chunk完成。这样已经传输成功的部分就不用重新传输了。 - -## 用户使用流程 -参考[link](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md) - -## 框架生成 -用[swagger](https://github.com/swagger-api/swagger-codegen)生成PFSClient和PFSServer的框架部分,以便我们可以把更多的精力放到逻辑本身上。 - -## 参考文档 -- [TLS complete guide](https://github.com/k8sp/tls/blob/master/tls.md) -- [aws.s3](http://docs.aws.amazon.com/cli/latest/reference/s3/) -- [linux man document](https://linux.die.net/man/) diff --git 
a/doc/design/file_manager/pfs/pfsclient.md b/doc/design/file_manager/pfs/pfsclient.md deleted file mode 100644 index 56bc70c54bbc92b78d66e04fb495b1300cf8ebe0..0000000000000000000000000000000000000000 --- a/doc/design/file_manager/pfs/pfsclient.md +++ /dev/null @@ -1,129 +0,0 @@ -# PFSClient - -## Description -The `pfs` command is a Command Line Interface to manage your files on PaddlePaddle Cloud - -## Synopsis -``` -paddle [options] pfs [parameters] -``` - -## Options -``` ---profile (string) - Use a specific profile from your credential file. - ---help (string) - Display more information about command - ---version - Output version information and exit - ---debug - Show detailed debugging log - ---only-show-errors (boolean) - Only errors and warnings are displayed. All other output is suppressed. -``` - -## Path Arguments -When using a command, we need to specify path arguments. There are two path argument type: `localpath` and `pfspath`. - -A `pfspath` begin with `/pfs`, eg: `/pfs/$DATACENTER/home/$USER/folder`. - -[Here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md#上传训练文件) is how to config datacenters. - -## order of Path Arguments -Commonly, if there are two path arguments, the first is the source, and the second is the destination. - -## Subcommonds -- rm - remove files or directories - -``` -Synopsis: - rm [-r] [-v] ... - -Options: - -r - Remove directories and their contents recursively - -v - Cause rm to be verbose, showing files after they are removed. - -Examples: - paddle pfs rm /pfs/$DATACENTER/home/$USER/file - paddle pfs rm -r /pfs/$DATACENTER/home/$USER/folder -``` -- mv - move (rename) files - -``` -Synopsis: - mv [-f | -n] [-v] - mv [-f | -n] [-v] ... - mv [-f | -n] [-v] - mv [-f | -n] [-v] ... - mv [-f | -n] [-v] - mv [-f | -n] [-v] ... - -Options: - -f - Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.) 
- -n - Do not overwrite an existing file. (The -n option overrides previous -f options.) - -v - Cause mv to be verbose, showing files after they are moved. - -Examples: - paddle pfs mv ./text1.txt /pfs/$DATACENTER/home/$USER/text1.txt -``` -- cp - copy files or directories - -``` -Synopsis: - cp [-r] [-f | -n] [-v] [--preserve--links] - cp [-r] [-f | -n] [-v] [--preserve--links] ... - cp [-r] [-f | -n] [-v] [--preserve--links] - cp [-r] [-f | -n] [-v] [--preserve--links] ... - cp [-r] [-f | -n] [-v] [--preserve--links] - cp [-r] [-f | -n] [-v] [--preserve--links] ... - -Options: - -r - Copy directories recursively - -f - Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.) - -n - Do not overwrite an existing file. (The -n option overrides previous -f options.) - -v - Cause cp to be verbose, showing files after they are copied. - --preserve--links - Reserve links when copy links - -Examples: - paddle pfs cp ./file /pfs/$DATACENTER/home/$USER/file - paddle pfs cp /pfs/$DATACENTER/home/$USER/file ./file -``` -- ls- list files - -``` -Synopsis: - ls [-r] ... - -Options: - -R - List directory(ies) recursively - -Examples: - paddle pfs ls /pfs/$DATACENTER/home/$USER/file - paddle pfs ls /pfs/$DATACENTER/home/$USER/folder -``` - -- mkdir - mkdir directory(ies) -Create intermediate directory(ies) as required. - -``` -Synopsis: - mkdir ... 
- -Examples: - paddle pfs mkdir /pfs/$DATACENTER/home/$USER/folder -``` diff --git a/doc/design/file_manager/src/filemanager.graffle b/doc/design/file_manager/src/filemanager.graffle deleted file mode 100644 index 7861a33072bc1908f69d12b37c20491dd8663103..0000000000000000000000000000000000000000 Binary files a/doc/design/file_manager/src/filemanager.graffle and /dev/null differ diff --git a/doc/design/file_manager/src/filemanager.png b/doc/design/file_manager/src/filemanager.png deleted file mode 100644 index 8139a19f5722f56d3c211f3ab0d3982f751134b9..0000000000000000000000000000000000000000 Binary files a/doc/design/file_manager/src/filemanager.png and /dev/null differ diff --git a/doc/design/images/parallel_executor_overview.dot b/doc/fluid/design/concepts/images/parallel_executor_overview.dot similarity index 100% rename from doc/design/images/parallel_executor_overview.dot rename to doc/fluid/design/concepts/images/parallel_executor_overview.dot diff --git a/doc/design/images/parallel_executor_overview.png b/doc/fluid/design/concepts/images/parallel_executor_overview.png similarity index 100% rename from doc/design/images/parallel_executor_overview.png rename to doc/fluid/design/concepts/images/parallel_executor_overview.png diff --git a/doc/fluid/design/concepts/index_cn.rst b/doc/fluid/design/concepts/index_cn.rst index eec8a2f14ca9e8b3bf0d0acbbb6004972790d795..dcdc894937ff328e6002623275ca3c65e87b2bb0 100644 --- a/doc/fluid/design/concepts/index_cn.rst +++ b/doc/fluid/design/concepts/index_cn.rst @@ -16,3 +16,4 @@ block.md scope.md executor.md + parallel_executor.md diff --git a/doc/fluid/design/concepts/index_en.rst b/doc/fluid/design/concepts/index_en.rst index 036e1da2550cf520f5c40ecd9657f71603755adc..b85a3055746facaa642e8fc899976b58435f1ef2 100644 --- a/doc/fluid/design/concepts/index_en.rst +++ b/doc/fluid/design/concepts/index_en.rst @@ -16,3 +16,4 @@ Core Concepts block.md scope.md executor.md + parallel_executor.md diff --git 
a/doc/design/parallel_executor.md b/doc/fluid/design/concepts/parallel_executor.md similarity index 100% rename from doc/design/parallel_executor.md rename to doc/fluid/design/concepts/parallel_executor.md diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst index f627437f354a12c79cad25c959409db29ecbd874..b123b756e2251c38f319e1aefa2cb04fd7a36b03 100644 --- a/doc/fluid/dev/index_cn.rst +++ b/doc/fluid/dev/index_cn.rst @@ -9,5 +9,5 @@ use_eigen_cn.md name_convention.md support_new_device.md - releasing_process.md + releasing_process_cn.md op_markdown_format.md diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst index 0b65fed67ad45eb399b624184485a99a082d79e9..98988fc22dcedecdbcd67fb3bf761377bf046337 100644 --- a/doc/fluid/dev/index_en.rst +++ b/doc/fluid/dev/index_en.rst @@ -9,5 +9,5 @@ Development use_eigen_en.md name_convention.md support_new_device.md - releasing_process.md + releasing_process_en.md op_markdown_format.md diff --git a/doc/fluid/dev/releasing_process.md b/doc/fluid/dev/releasing_process_cn.md similarity index 74% rename from doc/fluid/dev/releasing_process.md rename to doc/fluid/dev/releasing_process_cn.md index c5943ccd81c2ae2aaacd2676da12509db889f54a..4c6728fba7150b0f1e180e57590f18a5b677c70d 100644 --- a/doc/fluid/dev/releasing_process.md +++ b/doc/fluid/dev/releasing_process_cn.md @@ -10,19 +10,10 @@ PaddlePaddle每次发新的版本,遵循以下流程: * 使用Regression Test List作为检查列表,测试本次release的正确性。 * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,到第二步 * 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。 - * 编译这个版本的python wheel包,并发布到pypi。 - * 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`。 - * pypi上的package名称为paddlepaddle和paddlepaddle_gpu,如果要上传GPU版本的包,需要修改build/python/setup.py中,name: "paddlepaddle_gpu"并重新打包wheel包:`python setup.py bdist_wheel`。 - * 上传方法: - ``` - cd build/python - pip install twine - twine upload 
dist/[package to upload] - ``` - * 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步 -1. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。 -1. 协同完成Release Note的书写 - + * 将这个版本的python wheel包发布到pypi。 + * 更新Docker镜像(参考后面的操作细节)。 +1. 第三步完成后,将`release/版本号`分支合入master分支,将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。 +1. 协同完成Release Note的书写。 需要注意的是: @@ -31,13 +22,18 @@ PaddlePaddle每次发新的版本,遵循以下流程: ## 发布wheel包到pypi -使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) +1. 使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) 完成自动化二进制编译,参考下图,选择需要发布的版本(通常包含一个CPU版本和一个GPU版本),点击"run"右侧的"..."按钮,可以 -弹出下面的选择框,在第二个tab (Changes)里选择需要发布的分支,这里选择0.11.0,然后点击"Run Build"按钮。等待编译完成后 -可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m`和`cp27mu`的版本。然后按照上述的方法 -使用`twine`工具上传即可。 - - +弹出下面的选择框,在第二个tab (Changes)里选择需要发布的分支,这里选择0.11.0,然后点击"Run Build"按钮。 + +1. 等待编译完成后可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m`和`cp27mu`的版本。 +1. 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`。 +1. 上传: +``` +cd build/python +pip install twine +twine upload dist/[package to upload] +``` * 注:CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux 发型版,如果需要手动编译,也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。 @@ -48,10 +44,20 @@ PaddlePaddle每次发新的版本,遵循以下流程: 上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub,所以,发布Docker镜像只需要对自动push的镜像打上 版本号对应的tag即可: -1. 进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看latest tag的更新时间是否在上述编译wheel包完成后是否最新。 -1. 执行 `docker pull paddlepaddle/paddle:[latest tag]`,latest tag可以是latest或latest-gpu等。 -1. 执行 `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]` -1. 
执行 `docker push paddlepaddle/paddle:[version]` +``` +docker pull [镜像]:latest +docker tag [镜像]:latest [镜像]:[version] +docker push [镜像]:[version] +``` + +需要更新的镜像tag包括: + +* `[version]`: CPU版本 +* `[version]-openblas`: openblas版本 +* `[version]-gpu`: GPU版本(CUDA 8.0 cudnn 5) +* `[version]-gpu-[cudaver]-[cudnnver]`: 不同cuda, cudnn版本的镜像 + +之后可进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看是否发布成功。 ## PaddlePaddle 分支规范 @@ -76,7 +82,7 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git- ### PaddlePaddle Book中所有章节 -PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。 +PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练(V2和Fluid)模型正确性。 diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md new file mode 100644 index 0000000000000000000000000000000000000000..f989b964d6d1a329bbe31adc7ec10db017acaefa --- /dev/null +++ b/doc/fluid/dev/releasing_process_en.md @@ -0,0 +1,210 @@ +# PaddlePaddle Releasing Process + +PaddlePaddle manages its branches using "git-flow branching model", and [Semantic Versioning](http://semver.org/) as its version number semantics. + +Each time we release a new PaddlePaddle version, we should follow the below steps: + +1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`. +1. Push a new tag on the release branch, the tag name should be like `[version]rc.patch`. The + first tag should be `0.10.0rc1`, and the second should be `0.10.0rc2` and so on. +1. After that, we should do: + * Run all regression tests on the Regression Test List (see PaddlePaddle TeamCity CI), to confirm + that this release has no major bugs. + * If a regression test fails, we must fix those bugs and create a new `release/[version]` + branch from the previous release branch. + * Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`. 
+ * Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail). + * Update the Docker images (see below instructions for detail). +1. After above step, merge `release/[version]` branch to master and push a tag on the master commit, + then merge `master` to `develop`. +1. Update the Release Note. + +***NOTE:*** + +* Do ***NOT*** merge commits from develop branch to release branches, so that the release branch contains + only the features of the current release and we can test on that version. +* If we want to fix bugs on release branches, we must merge the fix to master, develop and release branch. + +## Publish Wheel Packages to pypi + +1. Use our [CI tool](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) + to build all wheel packages needed to publish. As shown in the following picture, choose a build + version, click "..." button on the right side of "Run" button, and switch to the second tab in the +pop-up box, choose the current release branch and click "Run Build" button. You may repeat this + step to start different versions of builds. + +1. After the build succeeds, download the outputs under "Artifacts" including capi, `cp27m` and `cp27mu`. +1. Since pypi.python.org follows [PEP 513](https://www.python.org/dev/peps/pep-0513), before we + upload the package using `twine`, we need to rename the package from `linux_x86_64` to + `manylinux1_x86_64`. +1. Start the upload: + ``` + cd build/python + pip install twine + twine upload dist/[package to upload] + ``` + +* NOTE: We use a special Docker image to build our releases to support more Linux distributions. You can + download it from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/, or build it using + scripts under `tools/manylinux1`. +* pypi does not allow overwriting an already uploaded version of a wheel package, even if you delete the + old version. You must change the version number before uploading a new one. 
+ +## Publish Docker Images + +Our CI tool will push latest images to DockerHub, so we only need to push a version tag like: + +``` +docker pull [image]:latest +docker tag [image]:latest [image]:[version] +docker push [image]:[version] +``` + +Tags that need to be updated are: +* `[version]`: CPU only version image +* `[version]-openblas`: openblas version image +* `[version]-gpu`: GPU version (using CUDA 8.0 cudnn 5) +* `[version]-gpu-[cudaver]-[cudnnver]`: tag for different cuda, cudnn versions + +You can then check out the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/. + +## Branching Model + +We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model, +with some modifications: + +* `master` branch is the stable branch. Each version on the master branch is tested and guaranteed. +* `develop` branch is for development. Each commit on develop branch has passed CI unit test, but no + regression tests are run. +* `release/[version]` branch is used to publish each release. Latest release version branches have + bugfix only for that version, but no feature updates. +* Developer forks are not required to follow + [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) + branching model; every fork is like a feature branch. + * Advice: a developer fork's develop branch is used to sync up with the main repo's develop branch. + * Advice: developers fork a new branch from their fork's develop branch to start developing. + * Use that branch on developer's fork to create pull requests and start reviews. + * Developers can push new commits to that branch when the pull request is open. +* Bug fixes are also started from the developer's forked repo, and a bug-fix branch can be merged to + `master`, `develop` and the release branches. + +## PaddlePaddle Regression Test List + +### All Chapters of PaddlePaddle Book + +We need to guarantee that all the chapters of PaddlePaddle Book can run correctly. 
Including +V1 (`paddle_trainer` training) and V2 training and Fluid training. + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Linear RegressionRecognize DigitsImage ClassificationWord2VecPersonalized RecommendationSentiment AnalysisSemantic Role LabelingMachine Translation
API.V2 + Docker + GPU
API.V2 + Docker + CPU
`paddle_trainer` + Docker + GPU
`paddle_trainer` + Docker + CPU
API.V2 + Ubuntu + GPU
API.V2 + Ubuntu + CPU
`paddle_trainer` + Ubuntu + GPU
`paddle_trainer` + Ubuntu + CPU
diff --git a/paddle/.gitignore b/paddle/.gitignore index f921eef14156a97e4fd250f014960e306b43f35a..1c1c0c2c829f088d7e3f52ca007fcb8f33a16a36 100644 --- a/paddle/.gitignore +++ b/paddle/.gitignore @@ -1,3 +1,4 @@ +.timestamp *.o *.a .svn diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index a473ed7400012b7d0cbc5ab9bed263b3cca8c6ec..3840bbe83b68dc2a49aa73feb57a80e9992cad5f 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -7,9 +7,9 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) if(WITH_GPU) - nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto) + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place memory device_context framework_proto) else() - cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto) + cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place memory device_context framework_proto) endif() cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) @@ -21,9 +21,9 @@ endif() cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) -nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init) +nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context init) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio) -cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory) +cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init) cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 16a118090ba9cfd50b4b03484983f9fc73cf7973..c2ca1bbc78f3ebc6066df6b666720af0d1fbbf59 100644 --- a/paddle/fluid/framework/executor.cc +++ 
b/paddle/fluid/framework/executor.cc @@ -93,6 +93,43 @@ static void CheckTensorNANOrInf(const std::string& name, "Tensor %s contains NAN", name); } +void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, + int block_id) { + auto& global_block = pdesc.Block(block_id); + + const Scope* ancestor_scope = scope; + while (ancestor_scope->parent()) { + ancestor_scope = ancestor_scope->parent(); + } + + if (ancestor_scope != scope) { + for (auto& var : global_block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var->Persistable()) { + auto* ptr = const_cast(ancestor_scope)->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { + auto* ptr = scope->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } + } + } else { + for (auto& var : global_block.AllVars()) { + auto* ptr = scope->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create variable " << var->Name() << ", which pointer is " + << ptr; + } + } +} + void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars) { platform::RecordBlock b(block_id); @@ -184,8 +221,8 @@ static bool has_fetch_operators( void Executor::Run(const ProgramDesc& program, Scope* scope, std::map& feed_targets, std::map& fetch_targets, - const std::string& feed_holder_name, - const std::string& fetch_holder_name, bool create_vars) { + bool create_vars, const std::string& feed_holder_name, + const std::string& fetch_holder_name) { platform::RecordBlock b(kProgramId); bool has_feed_ops = has_feed_operators(program.Block(0), feed_targets, feed_holder_name); @@ -296,38 +333,13 @@ std::vector> Executor::Prepare( void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, bool create_local_scope, 
bool create_vars) { - auto& block = ctx->prog_.Block(ctx->block_id_); - Scope* local_scope = scope; if (create_vars) { if (create_local_scope) { local_scope = &scope->NewScope(); - for (auto& var : block.AllVars()) { - if (var->Name() == framework::kEmptyVarName) { - continue; - } - - if (var->Persistable()) { - auto* ptr = scope->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " global, which pointer is " << ptr; - } else { - auto* ptr = local_scope->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " locally, which pointer is " << ptr; - } - } - } else { - for (auto& var : block.AllVars()) { - auto* ptr = local_scope->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create variable " << var->Name() << ", which pointer is " - << ptr; - } - } // if (create_local_scope) - } // if (create_vars) + } + CreateVariables(ctx->prog_, local_scope, ctx->block_id_); + } for (auto& op : ctx->ops_) { VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index d7c99165f0c9d3b1ae11a3b4753a61e8118f7b52..75b29b2f4065ad75b62a134b890b8f9f6730fdc7 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -54,9 +54,9 @@ class Executor { void Run(const ProgramDesc& program, Scope* scope, std::map& feed_targets, std::map& fetch_targets, + bool create_vars = true, const std::string& feed_holder_name = "feed", - const std::string& fetch_holder_name = "fetch", - bool create_vars = true); + const std::string& fetch_holder_name = "fetch"); static std::unique_ptr Prepare( const ProgramDesc& program, int block_id); @@ -64,6 +64,8 @@ class Executor { static std::vector> Prepare( const ProgramDesc& program, const std::vector& block_ids); + void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id); + void 
RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, bool create_local_scope = true, bool create_vars = true); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 7be93fa6002ae93c3e1b75c8f7fe5ca5f40b271f..74945fb4f2f745b6ca9c48adb0c8b9e6ae1e94a4 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" -#include "paddle/fluid/platform/profiler.h" #include #include @@ -24,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { @@ -43,30 +43,40 @@ class ParallelExecutorPrivate { #endif }; +std::vector &ParallelExecutor::GetLocalScopes() { + return member_->local_scopes_; +} + ParallelExecutor::ParallelExecutor( size_t num_threads, bool use_event, const std::vector &places, const std::unordered_set ¶ms, - const ProgramDesc &startup_program, const ProgramDesc &main_program, - const std::string &loss_var_name, Scope *scope, bool allow_op_delay) + const std::unordered_set &bcast_vars, + const ProgramDesc &main_program, const std::string &loss_var_name, + Scope *scope, const std::vector &local_scopes, bool allow_op_delay) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; - // Step 1. RunStartupProgram and Bcast the params to devs. - Executor exe(places[0]); - exe.Run(startup_program, scope, 0); + // Step 1. Bcast the params to devs. 
// Create local scopes - for (size_t i = 0; i < member_->places_.size(); ++i) { - member_->local_scopes_.push_back(&scope->NewScope()); + if (local_scopes.empty()) { + for (size_t i = 0; i < member_->places_.size(); ++i) { + member_->local_scopes_.push_back(&scope->NewScope()); + } + } else { + PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size()); + for (size_t i = 0; i < member_->places_.size(); ++i) { + member_->local_scopes_.push_back(local_scopes[i]); + } } // Bcast Parameters to all GPUs #ifdef PADDLE_WITH_CUDA member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_)); #endif - if (platform::is_gpu_place(places[0]) && - member_->local_scopes_.size() != 1) { // Is CUDA - BCastParamsToGPUs(startup_program); + if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 && + local_scopes.empty()) { // Is CUDA + BCastParamsToGPUs(bcast_vars); } // Startup Program has been run. All local scopes has correct parameters. @@ -99,48 +109,47 @@ ParallelExecutor::ParallelExecutor( } void ParallelExecutor::BCastParamsToGPUs( - const ProgramDesc &startup_program) const { + const std::unordered_set &vars) const { #ifdef PADDLE_WITH_CUDA auto *main_scope = member_->local_scopes_[0]; - for (auto *var_desc : startup_program.Block(0).AllVars()) { - size_t idx = var_desc->Name().find("@GRAD"); - if (idx != std::string::npos) continue; - if (var_desc->GetType() == proto::VarType::LOD_TENSOR) { - auto &main_tensor = - main_scope->FindVar(var_desc->Name())->Get(); - - auto &dims = main_tensor.dims(); - - if (paddle::platform::is_gpu_place(main_tensor.place())) { - size_t numel = main_tensor.numel(); - ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); - platform::NCCLGroupGuard guard; - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto place = member_->places_[i]; - void *buffer; - if (i == 0) { - buffer = const_cast(main_tensor.data()); - } else { - auto local_scope = member_->local_scopes_[i]; - auto *t = 
- local_scope->Var(var_desc->Name())->GetMutable(); - t->Resize(dims); - buffer = t->mutable_data(place, main_tensor.type()); - } - auto &nccl_ctx = member_->nccl_ctxs_->at(place); - platform::dynload::ncclBcast(buffer, numel, data_type, 0, - nccl_ctx.comm_, nccl_ctx.stream()); - } - } else { - platform::CPUPlace cpu; - for (size_t i = 1; i < member_->places_.size(); ++i) { + for (auto &var : vars) { + auto *main_var = main_scope->FindVar(var); + if (!main_var->IsType()) { + continue; + } + + auto &main_tensor = main_var->Get(); + + auto &dims = main_tensor.dims(); + + if (paddle::platform::is_gpu_place(main_tensor.place())) { + size_t numel = main_tensor.numel(); + ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); + platform::NCCLGroupGuard guard; + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto place = member_->places_[i]; + void *buffer; + if (i == 0) { + buffer = const_cast(main_tensor.data()); + } else { auto local_scope = member_->local_scopes_[i]; - auto *t = local_scope->Var(var_desc->Name())->GetMutable(); + auto *t = local_scope->Var(var)->GetMutable(); t->Resize(dims); - t->mutable_data(cpu, main_tensor.type()); - paddle::framework::TensorCopy(main_tensor, cpu, t); + buffer = t->mutable_data(place, main_tensor.type()); } + auto &nccl_ctx = member_->nccl_ctxs_->at(place); + platform::dynload::ncclBcast(buffer, numel, data_type, 0, + nccl_ctx.comm_, nccl_ctx.stream()); + } + } else { + platform::CPUPlace cpu; + for (size_t i = 1; i < member_->places_.size(); ++i) { + auto local_scope = member_->local_scopes_[i]; + auto *t = local_scope->Var(var)->GetMutable(); + t->Resize(dims); + t->mutable_data(cpu, main_tensor.type()); + paddle::framework::TensorCopy(main_tensor, cpu, t); } } member_->nccl_ctxs_->WaitAll(); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index c7c58b2b808383621a6d492f9188b0d36bfa6858..c048c3865f14822be4a0015e385ea1b8e05d0ced 100644 --- 
a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -36,11 +36,14 @@ class ParallelExecutor { explicit ParallelExecutor(size_t num_threads, bool use_event, const std::vector& places, const std::unordered_set& params, - const ProgramDesc& startup_program, + const std::unordered_set& bcast_vars, const ProgramDesc& main_program, const std::string& loss_var_name, Scope* scope, + const std::vector& local_scopes, bool allow_op_delay); + std::vector& GetLocalScopes(); + void Run(const std::vector& fetch_tensors, const std::string& fetched_var_name, const std::unordered_map& feed_tensors); @@ -51,7 +54,7 @@ class ParallelExecutor { ParallelExecutorPrivate* member_; - void BCastParamsToGPUs(const ProgramDesc& startup_program) const; + void BCastParamsToGPUs(const std::unordered_set& vars) const; }; } // namespace framework diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 17e38b1cf042657834b4d0d1c12cbbb92f19fa45..194df3e4a8b50700e2be01ce5ebca83b92501fb8 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -15,7 +15,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/scope.h" #include // for unique_ptr -#include // for call_once #include #include "glog/logging.h" #include "paddle/fluid/framework/threadpool.h" @@ -39,6 +38,7 @@ Scope::~Scope() { } Scope& Scope::NewScope() const { + std::unique_lock lock(mutex_); kids_.push_back(new Scope(this)); return *kids_.back(); } @@ -92,6 +92,7 @@ std::vector Scope::LocalVarNames() const { } void Scope::DeleteScope(Scope* scope) { + std::unique_lock lock(mutex_); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); this->kids_.erase(it); @@ -103,7 +104,7 @@ void Scope::DeleteScope(Scope* scope) { } } -void Scope::EraseVars(std::vector& var_names) { +void Scope::EraseVars(const std::vector& var_names) { std::set var_set(var_names.begin(), var_names.end()); for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index c1e1f49caaa5a60df0e97289aada465b45213971..c8cb70549f1d131b66fa7c6eeb35f3b7151a9e7f 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include // NOLINT #include #include #include @@ -51,13 +52,13 @@ class Scope { /// Create a variable with a scope-unique name. Variable* Var(std::string* name = nullptr); - void EraseVars(std::vector& var_names); + void EraseVars(const std::vector& var_names); /// Find a variable in the scope or any of its ancestors. Returns /// nullptr if cannot find. Variable* FindVar(const std::string& name) const; - const Scope& parent() const { return *parent_; } + const Scope* parent() const { return parent_; } /// Find the scope or an ancestor scope that contains the given variable. 
const Scope* FindScope(const Variable* var) const; @@ -88,6 +89,9 @@ class Scope { Scope const* parent_{nullptr}; DISABLE_COPY_AND_ASSIGN(Scope); + + private: + mutable std::mutex mutex_; }; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index aff427310f15be72f5c8d0fa1537ffa6bbe2881d..f417f62f3f75360f4ae1b7795608ae95200cfeb8 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -1,4 +1,4 @@ -set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init) +set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor prune init) cc_library(paddle_fluid_api SRCS io.cc diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc index a6b6c3f828f4c6f59fca42e4c3d9580d6c136524..ca2077d07411d2cd6095e0dc2a874af0890145c5 100644 --- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -46,8 +46,8 @@ TEST(inference, image_classification) { // Run inference on CPU LOG(INFO) << "--- CPU Runs: ---"; - TestInference(dirname, cpu_feeds, cpu_fetchs1, - FLAGS_repeat); + TestInference(dirname, cpu_feeds, + cpu_fetchs1, FLAGS_repeat); LOG(INFO) << output1.dims(); #ifdef PADDLE_WITH_CUDA @@ -57,8 +57,8 @@ TEST(inference, image_classification) { // Run inference on CUDA GPU LOG(INFO) << "--- GPU Runs: ---"; - TestInference(dirname, cpu_feeds, cpu_fetchs2, - FLAGS_repeat); + TestInference(dirname, cpu_feeds, + cpu_fetchs2, FLAGS_repeat); LOG(INFO) << output2.dims(); CheckError(output1, output2); diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 5118e66f1e7a36333ff4425361e54ec59e6ba05b..aae34ceda07fea6e881cf61b3755ec45d1d6f2dc 100644 --- a/paddle/fluid/inference/tests/test_helper.h 
+++ b/paddle/fluid/inference/tests/test_helper.h @@ -88,7 +88,7 @@ void CheckError(const paddle::framework::LoDTensor& output1, EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; } -template +template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, const std::vector& cpu_fetchs, @@ -166,8 +166,16 @@ void TestInference(const std::string& dirname, // 6. Run the inference program { + if (!CreateVars) { + // If users don't want to create and destroy variables every time they + // run, they need to set `create_vars` to false and manually call + // `CreateVariables` before running. + executor.CreateVariables(*inference_program, scope, 0); + } + // Ignore the profiling results of the first run - executor.Run(*inference_program, scope, feed_targets, fetch_targets); + executor.Run(*inference_program, scope, feed_targets, fetch_targets, + CreateVars); // Enable the profiler paddle::platform::EnableProfiler(state); @@ -178,7 +186,8 @@ void TestInference(const std::string& dirname, "run_inference", paddle::platform::DeviceContextPool::Instance().Get(place)); - executor.Run(*inference_program, scope, feed_targets, fetch_targets); + executor.Run(*inference_program, scope, feed_targets, fetch_targets, + CreateVars); } // Disable the profiler and print the timing information diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 8b3043af7a18787a08583d47b76da679ccb63740..709fc7e12e1db537ceece30c405c0e8a2582e8ca 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -1,20 +1,15 @@ add_subdirectory(detail) -cc_library(memory SRCS memory.cc DEPS place enforce) +cc_library(malloc SRCS malloc.cc DEPS buddy_allocator place enforce) cc_library(memcpy SRCS memcpy.cc DEPS place) -cc_library(paddle_memory +cc_library(memory DEPS - memory - memcpy - meta_data - meta_cache - memory_block - buddy_allocator - system_allocator) + malloc + memcpy) -cc_test(memory_test SRCS 
memory_test.cc DEPS place paddle_memory) +cc_test(malloc_test SRCS malloc_test.cc DEPS malloc) #if (WITH_GPU) -# nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place paddle_memory) +# nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory) #endif() diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt index b9c3fc31c1523abf3acbd116745bbf1596454aac..c725dba5e98c200c2542d97cb8f53a938f6b614a 100644 --- a/paddle/fluid/memory/detail/CMakeLists.txt +++ b/paddle/fluid/memory/detail/CMakeLists.txt @@ -1,3 +1,5 @@ +cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc) + if(${WITH_GPU}) nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info) else(${WITH_GPU}) @@ -6,10 +8,4 @@ endif(${WITH_GPU}) cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) -cc_library(meta_data SRCS meta_data.cc) - -cc_library(meta_cache SRCS meta_cache.cc) - -cc_library(memory_block SRCS memory_block.cc) - -cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog) +cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog) diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 876837838648d6733b104a5496454f5dc58bbb71..4194ba197948b47003863196efdac1c08a7ae4f6 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -46,7 +46,8 @@ inline size_t align(size_t size, size_t alignment) { void* BuddyAllocator::Alloc(size_t unaligned_size) { // adjust allocation alignment - size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_); + size_t size = + align(unaligned_size + sizeof(MemoryBlock::Desc), min_chunk_size_); // acquire the allocator lock std::lock_guard lock(mutex_); @@ -103,7 +104,7 @@ void BuddyAllocator::Free(void* p) { return; } - block->mark_as_free(cache_); + 
block->mark_as_free(&cache_); total_used_ -= block->total_size(cache_); total_free_ += block->total_size(cache_); @@ -122,7 +123,7 @@ void BuddyAllocator::Free(void* p) { right_buddy)); // merge its right buddy to the block - block->merge(cache_, right_buddy); + block->merge(&cache_, right_buddy); } } @@ -139,7 +140,7 @@ void BuddyAllocator::Free(void* p) { left_buddy->total_size(cache_), left_buddy)); // merge the block to its left buddy - left_buddy->merge(cache_, block); + left_buddy->merge(&cache_, block); block = left_buddy; } } @@ -163,13 +164,13 @@ size_t BuddyAllocator::Used() { return total_used_; } void* BuddyAllocator::SystemAlloc(size_t size) { size_t index = 0; - void* p = system_allocator_->Alloc(index, size); + void* p = system_allocator_->Alloc(&index, size); VLOG(10) << "Allocated " << p << " from system allocator."; if (p == nullptr) return nullptr; - static_cast(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index, + static_cast(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index, size, nullptr, nullptr); return static_cast(p)->data(); @@ -187,14 +188,14 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { // Allocate a new maximum sized block size_t index = 0; - void* p = system_allocator_->Alloc(index, max_chunk_size_); + void* p = system_allocator_->Alloc(&index, max_chunk_size_); if (p == nullptr) return pool_.end(); VLOG(10) << "Creating and inserting new block " << p << " from system allocator"; - static_cast(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, + static_cast(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index, max_chunk_size_, nullptr, nullptr); // gpu fallback allocation @@ -238,11 +239,11 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_) << ") into"; - block->split(cache_, size); + block->split(&cache_, size); VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_) << ")"; - block->set_type(cache_, 
MemoryBlock::ARENA_CHUNK); + block->set_type(&cache_, MemoryBlock::ARENA_CHUNK); // the rest of memory if exist if (block->has_right_buddy(cache_)) { diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index a4ee70c2586f37e3b2328dedfe28135e14d8b18d..2f39d774d6fb6a2bc37877eb2f8b90bebd3cda28 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -14,18 +14,18 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/memory/detail/meta_cache.h" -#include "paddle/fluid/memory/detail/meta_data.h" +#include // NOLINT +#include +#include +#include +#include + +#include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/gpu_info.h" -#include -#include -#include -#include - namespace paddle { namespace memory { namespace detail { diff --git a/paddle/fluid/memory/detail/memory_block.cc b/paddle/fluid/memory/detail/memory_block.cc index 07123f2669c3a829ff28e9fab5a404047c5a09c7..f34b922b25a0110690671d487f190e1b977a67bb 100644 --- a/paddle/fluid/memory/detail/memory_block.cc +++ b/paddle/fluid/memory/detail/memory_block.cc @@ -13,143 +13,142 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/memory/detail/memory_block.h" -#include "paddle/fluid/memory/detail/meta_cache.h" -#include "paddle/fluid/memory/detail/meta_data.h" #include "paddle/fluid/platform/assert.h" namespace paddle { namespace memory { namespace detail { -void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size, +void MemoryBlock::init(MetadataCache* cache, Type t, size_t index, size_t size, void* left_buddy, void* right_buddy) { - cache.store(this, Metadata(t, index, size - sizeof(Metadata), size, - static_cast(left_buddy), - static_cast(right_buddy))); + cache->save( + this, MemoryBlock::Desc(t, index, size - sizeof(MemoryBlock::Desc), size, + static_cast(left_buddy), + static_cast(right_buddy))); } -MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const { +MemoryBlock::Type MemoryBlock::type(const MetadataCache& cache) const { return cache.load(this).type; } -size_t MemoryBlock::size(MetadataCache& cache) const { +size_t MemoryBlock::size(const MetadataCache& cache) const { return cache.load(this).size; } -size_t MemoryBlock::total_size(MetadataCache& cache) const { +size_t MemoryBlock::index(const MetadataCache& cache) const { + return cache.load(this).index; +} + +size_t MemoryBlock::total_size(const MetadataCache& cache) const { return cache.load(this).total_size; } -MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const { +bool MemoryBlock::has_left_buddy(const MetadataCache& cache) const { + return left_buddy(cache) != nullptr; +} + +bool MemoryBlock::has_right_buddy(const MetadataCache& cache) const { + return right_buddy(cache) != nullptr; +} + +MemoryBlock* MemoryBlock::left_buddy(const MetadataCache& cache) const { return cache.load(this).left_buddy; } -MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const { +MemoryBlock* MemoryBlock::right_buddy(const MetadataCache& cache) const { return cache.load(this).right_buddy; } -void MemoryBlock::split(MetadataCache& cache, size_t size) { +void 
MemoryBlock::split(MetadataCache* cache, size_t size) { // make sure the split fits - PADDLE_ASSERT(total_size(cache) >= size); + PADDLE_ASSERT(total_size(*cache) >= size); // bail out if there is no room for another partition - if (total_size(cache) - size <= sizeof(Metadata)) { + if (total_size(*cache) - size <= sizeof(MemoryBlock::Desc)) { return; } // find the position of the split void* right_partition = reinterpret_cast(this) + size; - size_t remaining_size = total_size(cache) - size; + size_t remaining_size = total_size(*cache) - size; // Add the new block as a buddy - auto metadata = cache.load(this); + auto metadata = cache->load(this); // Write the metadata for the new block auto new_block_right_buddy = metadata.right_buddy; - cache.store( - static_cast(right_partition), - Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata), - remaining_size, this, new_block_right_buddy)); + cache->save(static_cast(right_partition), + MemoryBlock::Desc(FREE_CHUNK, index(*cache), + remaining_size - sizeof(MemoryBlock::Desc), + remaining_size, this, new_block_right_buddy)); metadata.right_buddy = static_cast(right_partition); - metadata.size = size - sizeof(Metadata); + metadata.size = size - sizeof(MemoryBlock::Desc); metadata.total_size = size; - cache.store(this, metadata); + cache->save(this, metadata); // Write metadata for the new block's right buddy if (new_block_right_buddy != nullptr) { - auto buddy_metadata = cache.load(new_block_right_buddy); + auto buddy_metadata = cache->load(new_block_right_buddy); buddy_metadata.left_buddy = static_cast(right_partition); - cache.store(new_block_right_buddy, buddy_metadata); + cache->save(new_block_right_buddy, buddy_metadata); } } -void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { +void MemoryBlock::merge(MetadataCache* cache, MemoryBlock* right_buddy) { // only free blocks can be merged - PADDLE_ASSERT(type(cache) == FREE_CHUNK); - PADDLE_ASSERT(right_buddy->type(cache) == 
FREE_CHUNK); + PADDLE_ASSERT(type(*cache) == FREE_CHUNK); + PADDLE_ASSERT(right_buddy->type(*cache) == FREE_CHUNK); - auto metadata = cache.load(this); + auto metadata = cache->load(this); // link this->buddy's buddy - metadata.right_buddy = right_buddy->right_buddy(cache); + metadata.right_buddy = right_buddy->right_buddy(*cache); // link buddy's buddy -> this if (metadata.right_buddy != nullptr) { - auto buddy_metadata = cache.load(metadata.right_buddy); + auto buddy_metadata = cache->load(metadata.right_buddy); buddy_metadata.left_buddy = this; - cache.store(metadata.right_buddy, buddy_metadata); + cache->save(metadata.right_buddy, buddy_metadata); } - metadata.size += right_buddy->total_size(cache); - metadata.total_size += right_buddy->total_size(cache); + metadata.size += right_buddy->total_size(*cache); + metadata.total_size += right_buddy->total_size(*cache); - cache.store(this, metadata); - cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr)); + cache->save(this, metadata); + cache->save(right_buddy, + MemoryBlock::Desc(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr)); } -void MemoryBlock::mark_as_free(MetadataCache& cache) { +void MemoryBlock::mark_as_free(MetadataCache* cache) { // check for double free or corruption - PADDLE_ASSERT(type(cache) != FREE_CHUNK); - PADDLE_ASSERT(type(cache) != INVALID_CHUNK); - + PADDLE_ASSERT(type(*cache) != FREE_CHUNK); + PADDLE_ASSERT(type(*cache) != INVALID_CHUNK); set_type(cache, FREE_CHUNK); } -void MemoryBlock::set_type(MetadataCache& cache, Type t) { - auto metadata = cache.load(this); - +void MemoryBlock::set_type(MetadataCache* cache, Type t) { + auto metadata = cache->load(this); metadata.type = t; - - cache.store(this, metadata); -} - -bool MemoryBlock::has_left_buddy(MetadataCache& cache) const { - return left_buddy(cache) != nullptr; -} - -bool MemoryBlock::has_right_buddy(MetadataCache& cache) const { - return right_buddy(cache) != nullptr; -} - -size_t MemoryBlock::index(MetadataCache& 
cache) const { - return cache.load(this).index; + cache->save(this, metadata); } void* MemoryBlock::data() const { - return const_cast(reinterpret_cast(this)) + 1; + return const_cast( + reinterpret_cast(this)) + + 1; } MemoryBlock* MemoryBlock::metadata() const { return const_cast(reinterpret_cast( - reinterpret_cast(this) - 1)); + reinterpret_cast(this) - 1)); } } // namespace detail diff --git a/paddle/fluid/memory/detail/memory_block.h b/paddle/fluid/memory/detail/memory_block.h index 72b40b73177d086aa912416e7f9cb3cd4ad5b45e..5cceba659beeec1b3c986dc43229f6725e3e11de 100644 --- a/paddle/fluid/memory/detail/memory_block.h +++ b/paddle/fluid/memory/detail/memory_block.h @@ -11,21 +11,21 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once -#include +#include +#include namespace paddle { namespace memory { namespace detail { -// Forward Declarations +// Forward declaration. class MetadataCache; -/*! \brief A class used to interpret the contents of a memory block */ -class MemoryBlock { - public: +// MemoryBlock represents Each allocated memory block, which contains +// MemoryBlock::Desc and the payload. +struct MemoryBlock { enum Type { FREE_CHUNK, // memory is free and idle ARENA_CHUNK, // memory is being occupied @@ -33,57 +33,96 @@ class MemoryBlock { INVALID_CHUNK // memory is invalid }; - public: - void init(MetadataCache& cache, Type t, size_t index, size_t size, + // init saves the MemoryBlock::Desc of the memory block in a MetadataCache. + // If it is a CPU memory block, the MetadataCache writes the + // MemoryBlock::Desc to the beginning of the block; or, if it is a GPU memory + // block, the MetadataCache writes the Meatadata to a std::map in + // the CPU. 
+ void init(MetadataCache* cache, Type t, size_t index, size_t size, void* left_buddy, void* right_buddy); - public: - /*! \brief The type of the allocation */ - Type type(MetadataCache& cache) const; - - /*! \brief The size of the data region */ - size_t size(MetadataCache& cache) const; + // All these accessors returns fields in the MemoryBlock::Desc of the memory + // block. They all need a MetadataCache instance as their first + // parameter because they read the MemoryBlock::Desc from the cache. + Type type(const MetadataCache& cache) const; + size_t size(const MetadataCache& cache) const; + size_t index(const MetadataCache& cache) const; + size_t total_size(const MetadataCache& cache) const; + bool has_left_buddy(const MetadataCache& cache) const; + bool has_right_buddy(const MetadataCache& cache) const; + MemoryBlock* left_buddy(const MetadataCache& cache) const; + MemoryBlock* right_buddy(const MetadataCache& cache) const; - /*! \brief An index to track the allocator */ - size_t index(MetadataCache& cache) const; + // Split the allocation into left/right blocks. + void split(MetadataCache* cache, size_t size); - /*! \brief The total size of the block */ - size_t total_size(MetadataCache& cache) const; + // Merge left and right blocks together. + void merge(MetadataCache* cache, MemoryBlock* right_buddy); - /*! \brief Check the left buddy of the block */ - bool has_left_buddy(MetadataCache& cache) const; + // Mark the allocation as free. + void mark_as_free(MetadataCache* cache); - /*! \brief Check the right buddy of the block */ - bool has_right_buddy(MetadataCache& cache) const; - - /*! \brief Get the left buddy */ - MemoryBlock* left_buddy(MetadataCache& cache) const; - - /*! \brief Get the right buddy */ - MemoryBlock* right_buddy(MetadataCache& cache) const; - - public: - /*! \brief Split the allocation into left/right blocks */ - void split(MetadataCache& cache, size_t size); + // Change the type of the allocation. 
+ void set_type(MetadataCache* cache, Type t); - /*! \brief Merge left and right blocks together */ - void merge(MetadataCache& cache, MemoryBlock* right_buddy); - - /*! \brief Mark the allocation as free */ - void mark_as_free(MetadataCache& cache); - - /*! \brief Change the type of the allocation */ - void set_type(MetadataCache& cache, Type t); - - public: - /*! \brief Get a pointer to the memory block's data */ void* data() const; - - /*! \brief Get a pointer to the memory block's metadata */ MemoryBlock* metadata() const; + // MemoryBlock::Desc describes a MemoryBlock. + struct Desc { + Desc(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l, + MemoryBlock* r); + Desc(); + + // Updates guard_begin and guard_end by hashes of the Metadata object. + void update_guards(); + + // Checks that guard_begin and guard_end are hashes of the Metadata object. + bool check_guards() const; + + // TODO(gangliao): compress this + size_t guard_begin = 0; + MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK; + size_t index = 0; + size_t size = 0; + size_t total_size = 0; + MemoryBlock* left_buddy = nullptr; + MemoryBlock* right_buddy = nullptr; + size_t guard_end = 0; + }; +}; + +// A cache for accessing memory block meta-data that may be expensive +// to access directly. This class exists to unify the +// MemoryBlock::Desc format between GPU and CPU allocations. It should +// be removed when the CPU can access all GPU allocations directly via +// UVM. +class MetadataCache { public: - static size_t overhead(); + explicit MetadataCache(bool uses_gpu); + + // Disable copying and assignment. + MetadataCache(const MetadataCache&) = delete; + MetadataCache& operator=(const MetadataCache&) = delete; + + // Returns the MemoryBlock::Desc for a memory block. 
When MetadataCache is + // used to manage CPU memory, the MemoryBlock::Desc resides at the beginning + // of the memory block; when used to manage GPU memory, the + // Meatadata resides in CPU memory indexed by cache_. + MemoryBlock::Desc load(const MemoryBlock* memory_block) const; + + // Saves the MemoryBlock::Desc of a memory block into the cache. For CPU + // memory block, writes the MemoryBlock::Desc to the beginning of the memory + // block; whereas for GPU memory, writes it to cache_. + void save(MemoryBlock* memory_block, const MemoryBlock::Desc& meta_data); + + // For GPU memory block, erases its MemoryBlock::Desc from cache_. + void invalidate(MemoryBlock* memory_block); + + private: + typedef std::unordered_map MetadataMap; + MetadataMap cache_; + bool uses_gpu_; }; } // namespace detail diff --git a/paddle/fluid/memory/detail/meta_data.cc b/paddle/fluid/memory/detail/memory_block_desc.cc similarity index 54% rename from paddle/fluid/memory/detail/meta_data.cc rename to paddle/fluid/memory/detail/memory_block_desc.cc index ad862af1705835c495a30232aa2bba2d2a56ad89..393dd9209c0aa443cd17c29b2f9de6eafb48bac9 100644 --- a/paddle/fluid/memory/detail/meta_data.cc +++ b/paddle/fluid/memory/detail/memory_block_desc.cc @@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/memory/detail/meta_data.h" - #include +#include "paddle/fluid/memory/detail/memory_block.h" + namespace paddle { namespace memory { namespace detail { -Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, - MemoryBlock* l, MemoryBlock* r) +MemoryBlock::Desc::Desc(MemoryBlock::Type t, size_t i, size_t s, size_t ts, + MemoryBlock* l, MemoryBlock* r) : type(t), index(i), size(s), @@ -29,7 +29,7 @@ Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, left_buddy(l), right_buddy(r) {} -Metadata::Metadata() +MemoryBlock::Desc::Desc() : type(MemoryBlock::INVALID_CHUNK), index(0), size(0), @@ -37,32 +37,36 @@ Metadata::Metadata() left_buddy(nullptr), right_buddy(nullptr) {} +namespace { + template -inline void hash_combine(std::size_t& seed, const T& v) { +inline void hash_combine(std::size_t* seed, const T& v) { std::hash hasher; - seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + (*seed) ^= hasher(v) + 0x9e3779b9 + ((*seed) << 6) + ((*seed) >> 2); } -inline size_t hash(const Metadata* metadata, size_t initial_seed) { +inline size_t hash(const MemoryBlock::Desc& metadata, size_t initial_seed) { size_t seed = initial_seed; - hash_combine(seed, (size_t)metadata->type); - hash_combine(seed, metadata->index); - hash_combine(seed, metadata->size); - hash_combine(seed, metadata->total_size); - hash_combine(seed, metadata->left_buddy); - hash_combine(seed, metadata->right_buddy); + hash_combine(&seed, static_cast(metadata.type)); + hash_combine(&seed, metadata.index); + hash_combine(&seed, metadata.size); + hash_combine(&seed, metadata.total_size); + hash_combine(&seed, metadata.left_buddy); + hash_combine(&seed, metadata.right_buddy); return seed; } -void Metadata::update_guards() { - guard_begin = hash(this, 1); - guard_end = hash(this, 2); +} // namespace + +void MemoryBlock::Desc::update_guards() { + guard_begin = hash(*this, 1); + guard_end = hash(*this, 2); } -bool Metadata::check_guards() const { - 
return guard_begin == hash(this, 1) && guard_end == hash(this, 2); +bool MemoryBlock::Desc::check_guards() const { + return guard_begin == hash(*this, 1) && guard_end == hash(*this, 2); } } // namespace detail diff --git a/paddle/fluid/memory/detail/meta_cache.cc b/paddle/fluid/memory/detail/meta_cache.cc index 43249e842ad4d2419fed041e6c9056021e9663cd..b86e4f38c42a26e155f276f9b73cbed1d0d83f7d 100644 --- a/paddle/fluid/memory/detail/meta_cache.cc +++ b/paddle/fluid/memory/detail/meta_cache.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/memory/detail/meta_cache.h" #include "glog/logging.h" #include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/platform/assert.h" @@ -23,29 +22,28 @@ namespace detail { MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {} -Metadata MetadataCache::load(const MemoryBlock* block) { +MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const { if (uses_gpu_) { - auto existing_metadata = cache_.find(block); - PADDLE_ASSERT(existing_metadata->second.check_guards()); - return existing_metadata->second; + auto existing_desc = cache_.find(block); + PADDLE_ASSERT(existing_desc->second.check_guards()); + return existing_desc->second; } else { - auto* meta = reinterpret_cast(block); - VLOG(10) << "Load MetaData type=" << meta->type; - PADDLE_ASSERT(meta->check_guards()); - return *reinterpret_cast(block); + auto* desc = reinterpret_cast(block); + VLOG(10) << "Load MemoryBlock::Desc type=" << desc->type; + PADDLE_ASSERT(desc->check_guards()); + return *reinterpret_cast(block); } } -void MetadataCache::store(MemoryBlock* block, - const Metadata& original_metadata) { - auto metadata = original_metadata; - - metadata.update_guards(); +void MetadataCache::save(MemoryBlock* block, + const MemoryBlock::Desc& original_desc) { + auto desc = 
original_desc; + desc.update_guards(); if (uses_gpu_) { - cache_[block] = metadata; + cache_[block] = desc; } else { - *reinterpret_cast(block) = metadata; + *reinterpret_cast(block) = desc; } } diff --git a/paddle/fluid/memory/detail/meta_cache.h b/paddle/fluid/memory/detail/meta_cache.h deleted file mode 100644 index 3283d756a6e7f7f1750442039797846bdad51125..0000000000000000000000000000000000000000 --- a/paddle/fluid/memory/detail/meta_cache.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/memory/detail/memory_block.h" -#include "paddle/fluid/memory/detail/meta_data.h" - -#include - -namespace paddle { -namespace memory { -namespace detail { - -/** - * \brief A cache for accessing memory block meta-data that may be expensive - * to access directly. - * - * \note This class exists to unify the metadata format between GPU and CPU - * allocations. It should be removed when the CPU can access all GPU - * allocations directly via UVM. - */ -class MetadataCache { - public: - explicit MetadataCache(bool uses_gpu); - - public: - /*! \brief Load the associated metadata for the specified memory block. */ - Metadata load(const MemoryBlock* memory_block); - - /*! \brief Store the associated metadata for the specified memory block. */ - void store(MemoryBlock* memory_block, const Metadata& meta_data); - - /*! 
\brief Indicate that the specified metadata will no longer be used. */ - void invalidate(MemoryBlock* memory_block); - - public: - MetadataCache(const MetadataCache&) = delete; - MetadataCache& operator=(const MetadataCache&) = delete; - - private: - bool uses_gpu_; - - private: - typedef std::unordered_map MetadataMap; - - private: - MetadataMap cache_; -}; - -} // namespace detail -} // namespace memory -} // namespace paddle diff --git a/paddle/fluid/memory/detail/meta_data.h b/paddle/fluid/memory/detail/meta_data.h deleted file mode 100644 index 14895ee8727e98186b1f1295321951e12753fef6..0000000000000000000000000000000000000000 --- a/paddle/fluid/memory/detail/meta_data.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/memory/detail/memory_block.h" - -#include - -namespace paddle { -namespace memory { -namespace detail { - -class Metadata { - public: - Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l, - MemoryBlock* r); - Metadata(); - - public: - /*! \brief Update the guards when metadata is changed */ - void update_guards(); - - /*! 
\brief Check consistency to previous modification */ - bool check_guards() const; - - public: - // TODO(gangliao): compress this - // clang-format off - size_t guard_begin = 0; - MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK; - size_t index = 0; - size_t size = 0; - size_t total_size = 0; - MemoryBlock* left_buddy = nullptr; - MemoryBlock* right_buddy = nullptr; - size_t guard_end = 0; - // clang-format on -}; - -} // namespace detail -} // namespace memory -} // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index a45f8c33ee5956f3409ee1b7c43628aa0acafb98..d5390529163491c2711e50ffad236534e88b73ee 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -13,16 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/memory/detail/system_allocator.h" -#include "paddle/fluid/platform/assert.h" -#include "paddle/fluid/platform/cpu_info.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" #include // for malloc and free #include // for mlock and munlock #include // for std::max #include "gflags/gflags.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" // If use_pinned_memory is true, CPUAllocator calls mlock, which // returns pinned and locked memory as staging areas for data exchange @@ -35,13 +35,13 @@ namespace paddle { namespace memory { namespace detail { -void* CPUAllocator::Alloc(size_t& index, size_t size) { +void* CPUAllocator::Alloc(size_t* index, size_t size) { // According to http://www.cplusplus.com/reference/cstdlib/malloc/, // malloc might not return nullptr if size is zero, but the returned // pointer shall not be dereferenced -- so we make it nullptr. 
if (size <= 0) return nullptr; - index = 0; // unlock memory + *index = 0; // unlock memory void* p; @@ -56,7 +56,7 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) { if (p != nullptr) { if (FLAGS_use_pinned_memory) { - index = 1; + *index = 1; mlock(p, size); // lock memory } } @@ -75,7 +75,7 @@ bool CPUAllocator::UseGpu() const { return false; } #ifdef PADDLE_WITH_CUDA -void* GPUAllocator::Alloc(size_t& index, size_t size) { +void* GPUAllocator::Alloc(size_t* index, size_t size) { // CUDA documentation doesn't explain if cudaMalloc returns nullptr // if size is 0. We just make sure it does. if (size <= 0) return nullptr; @@ -93,7 +93,7 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) { } if (result == cudaSuccess) { - index = 0; + *index = 0; gpu_alloc_size_ += size; return p; } else { @@ -133,7 +133,7 @@ bool GPUAllocator::UseGpu() const { return true; } // PINNED memory allows direct DMA transfers by the GPU to and from system // memory. It’s locked to a physical address. 
-void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) { +void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { if (size <= 0) return nullptr; // NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size @@ -154,7 +154,7 @@ void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) { cudaError_t result = cudaMallocHost(&p, size); if (result == cudaSuccess) { - index = 1; // PINNED memory + *index = 1; // PINNED memory cuda_pinnd_alloc_size_ += size; return p; } else { diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index e3c50ef6483c61e2016bbd967a4100057c87dca3..a0386a2dad1bb7faf54197a47ca7a5b6d9488817 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -29,14 +29,14 @@ namespace detail { class SystemAllocator { public: virtual ~SystemAllocator() {} - virtual void* Alloc(size_t& index, size_t size) = 0; + virtual void* Alloc(size_t* index, size_t size) = 0; virtual void Free(void* p, size_t size, size_t index) = 0; virtual bool UseGpu() const = 0; }; class CPUAllocator : public SystemAllocator { public: - virtual void* Alloc(size_t& index, size_t size); + virtual void* Alloc(size_t* index, size_t size); virtual void Free(void* p, size_t size, size_t index); virtual bool UseGpu() const; }; @@ -46,7 +46,7 @@ class GPUAllocator : public SystemAllocator { public: explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {} - virtual void* Alloc(size_t& index, size_t size); + virtual void* Alloc(size_t* index, size_t size); virtual void Free(void* p, size_t size, size_t index); virtual bool UseGpu() const; @@ -58,7 +58,7 @@ class GPUAllocator : public SystemAllocator { class CUDAPinnedAllocator : public SystemAllocator { public: - virtual void* Alloc(size_t& index, size_t size); + virtual void* Alloc(size_t* index, size_t size); virtual void Free(void* p, size_t size, size_t index); virtual bool UseGpu() const; diff --git 
a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc index 3e1926f632c57b7906e4a76f43ff7a753d71d97f..268260142c579ea9301d89fcec1613ce5b0e15a5 100644 --- a/paddle/fluid/memory/detail/system_allocator_test.cc +++ b/paddle/fluid/memory/detail/system_allocator_test.cc @@ -22,11 +22,11 @@ limitations under the License. */ DECLARE_bool(use_pinned_memory); -void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { +void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) { bool freed = false; { size_t index; - void* p = a.Alloc(index, size); + void* p = a->Alloc(&index, size); if (size > 0) { EXPECT_NE(p, nullptr); } else { @@ -36,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { int* i = static_cast(p); std::shared_ptr ptr(i, [&](void* p) { freed = true; - a.Free(p, size, index); + a->Free(p, size, index); }); } EXPECT_TRUE(freed); @@ -45,21 +45,21 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { TEST(CPUAllocator, NoLockMem) { FLAGS_use_pinned_memory = false; paddle::memory::detail::CPUAllocator a; - TestAllocator(a, 2048); - TestAllocator(a, 0); + TestAllocator(&a, 2048); + TestAllocator(&a, 0); } TEST(CPUAllocator, LockMem) { FLAGS_use_pinned_memory = true; paddle::memory::detail::CPUAllocator a; - TestAllocator(a, 2048); - TestAllocator(a, 0); + TestAllocator(&a, 2048); + TestAllocator(&a, 0); } #ifdef PADDLE_WITH_CUDA TEST(GPUAllocator, Alloc) { paddle::memory::detail::GPUAllocator a(0); - TestAllocator(a, 2048); - TestAllocator(a, 0); + TestAllocator(&a, 2048); + TestAllocator(&a, 0); } #endif diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/malloc.cc similarity index 99% rename from paddle/fluid/memory/memory.cc rename to paddle/fluid/memory/malloc.cc index 2c13dbc6d51bfa3853cec5270e8115c899f522ea..0c74f62de5c6f5d432ee928945db6dcf385ca209 100644 --- a/paddle/fluid/memory/memory.cc +++ 
b/paddle/fluid/memory/malloc.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/memory/malloc.h" #include "glog/logging.h" diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h new file mode 100644 index 0000000000000000000000000000000000000000..3e6bfddd69cb16edf323d040ea5369cd551f299e --- /dev/null +++ b/paddle/fluid/memory/malloc.h @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { + +/** + * \brief Allocate memory block in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * \param[in] size Allocation size. + * + * \return Allocated memory block address. + * + * \note If return nullptr, it indicates memory allocation failed + * because insufficient memory in current system. When Alloc + * function is invoked, you must check the returned memory + * address is valid or not. + */ +template +void* Alloc(Place place, size_t size); + +/** + * \brief Free memory block in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * \param[in] ptr Memory block address to free. 
+ * + */ +template +void Free(Place place, void* ptr); + +/** + * \brief Total size of used memory in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * + */ +template +size_t Used(Place place); + +struct Usage : public boost::static_visitor { + size_t operator()(const platform::CPUPlace& cpu) const; + size_t operator()(const platform::CUDAPlace& gpu) const; + size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const; +}; + +size_t memory_usage(const platform::Place& p); + +/** + * \brief Free memory block in one place. + * + * \note In some cases, custom deleter is used to + * deallocate the memory automatically for + * std::unique_ptr in tensor.h. + * + */ +template +class PODDeleter { + static_assert(std::is_pod::value, "T must be POD"); + + public: + explicit PODDeleter(Place place) : place_(place) {} + void operator()(T* ptr) { Free(place_, static_cast(ptr)); } + + private: + Place place_; +}; + +/** + * \brief Free memory block in one place does not meet POD + * + * \note In some cases, custom deleter is used to + * deallocate the memory automatically for + * std::unique_ptr in tensor.h. + * + */ +template +class PlainDeleter { + public: + explicit PlainDeleter(Place place) : place_(place) {} + void operator()(T* ptr) { Free(place_, reinterpret_cast(ptr)); } + + private: + Place place_; +}; + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memory_test.cc b/paddle/fluid/memory/malloc_test.cc similarity index 95% rename from paddle/fluid/memory/memory_test.cc rename to paddle/fluid/memory/malloc_test.cc index 9fbbe62559b1e29d6942a1ada62558b20830489b..d39466ef60c3750600dea726a6570397423d42f6 100644 --- a/paddle/fluid/memory/memory_test.cc +++ b/paddle/fluid/memory/malloc_test.cc @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/memory/malloc.h" #include #include "gtest/gtest.h" #include "paddle/fluid/memory/detail/memory_block.h" -#include "paddle/fluid/memory/detail/meta_data.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/place.h" @@ -28,7 +27,7 @@ inline bool is_aligned(void const *p) { } size_t align(size_t size, paddle::platform::CPUPlace place) { - size += sizeof(paddle::memory::detail::Metadata); + size += sizeof(paddle::memory::detail::MemoryBlock::Desc); size_t alignment = paddle::platform::CpuMinChunkSize(); size_t remaining = size % alignment; return remaining == 0 ? size : size + (alignment - remaining); @@ -86,7 +85,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { #ifdef PADDLE_WITH_CUDA size_t align(size_t size, paddle::platform::CUDAPlace place) { - size += sizeof(paddle::memory::detail::Metadata); + size += sizeof(paddle::memory::detail::MemoryBlock::Desc); size_t alignment = paddle::platform::GpuMinChunkSize(); size_t remaining = size % alignment; return remaining == 0 ? size : size + (alignment - remaining); @@ -142,7 +141,7 @@ TEST(BuddyAllocator, GPUMultAlloc) { } size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) { - size += sizeof(paddle::memory::detail::Metadata); + size += sizeof(paddle::memory::detail::MemoryBlock::Desc); size_t alignment = paddle::platform::CUDAPinnedMinChunkSize(); size_t remaining = size % alignment; return remaining == 0 ? size : size + (alignment - remaining); diff --git a/paddle/fluid/memory/memory.h b/paddle/fluid/memory/memory.h index 3e6bfddd69cb16edf323d040ea5369cd551f299e..8d904e3be56abf0974ba7379f7ca1b676fcb0409 100644 --- a/paddle/fluid/memory/memory.h +++ b/paddle/fluid/memory/memory.h @@ -14,91 +14,5 @@ limitations under the License. 
*/ #pragma once -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace memory { - -/** - * \brief Allocate memory block in one place. - * - * \param[in] place Allocation place (CPU or GPU). - * \param[in] size Allocation size. - * - * \return Allocated memory block address. - * - * \note If return nullptr, it indicates memory allocation failed - * because insufficient memory in current system. When Alloc - * function is invoked, you must check the returned memory - * address is valid or not. - */ -template -void* Alloc(Place place, size_t size); - -/** - * \brief Free memory block in one place. - * - * \param[in] place Allocation place (CPU or GPU). - * \param[in] ptr Memory block address to free. - * - */ -template -void Free(Place place, void* ptr); - -/** - * \brief Total size of used memory in one place. - * - * \param[in] place Allocation place (CPU or GPU). - * - */ -template -size_t Used(Place place); - -struct Usage : public boost::static_visitor { - size_t operator()(const platform::CPUPlace& cpu) const; - size_t operator()(const platform::CUDAPlace& gpu) const; - size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const; -}; - -size_t memory_usage(const platform::Place& p); - -/** - * \brief Free memory block in one place. - * - * \note In some cases, custom deleter is used to - * deallocate the memory automatically for - * std::unique_ptr in tensor.h. - * - */ -template -class PODDeleter { - static_assert(std::is_pod::value, "T must be POD"); - - public: - explicit PODDeleter(Place place) : place_(place) {} - void operator()(T* ptr) { Free(place_, static_cast(ptr)); } - - private: - Place place_; -}; - -/** - * \brief Free memory block in one place does not meet POD - * - * \note In some cases, custom deleter is used to - * deallocate the memory automatically for - * std::unique_ptr in tensor.h. 
- * - */ -template -class PlainDeleter { - public: - explicit PlainDeleter(Place place) : place_(place) {} - void operator()(T* ptr) { Free(place_, reinterpret_cast(ptr)); } - - private: - Place place_; -}; - -} // namespace memory -} // namespace paddle +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/memory/memcpy.h" diff --git a/paddle/fluid/memory/pinned_memory_test.cu b/paddle/fluid/memory/pinned_memory_test.cu index a000001f41788fb16ac075426f06357cbe42d642..0d898f59ee1b8c783c5357aa7e27581a993a6d30 100644 --- a/paddle/fluid/memory/pinned_memory_test.cu +++ b/paddle/fluid/memory/pinned_memory_test.cu @@ -15,7 +15,6 @@ limitations under the License. */ #include #include "paddle/fluid/memory/detail/memory_block.h" -#include "paddle/fluid/memory/detail/meta_data.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 84eabab563e3404ad2a28bf76116c592db04742e..5ff987ad8b3ba3c9195e87e6c11e70ac98fa0a11 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -263,7 +263,7 @@ cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) -cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) +cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc index 
6ff363d766db7dd97e1bc193ef7b4a095a7b7c24..ab7c61227114fe7a0ce2ff2515dd560706058b64 100644 --- a/paddle/fluid/operators/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/activation_mkldnn_op.cc @@ -13,8 +13,8 @@ limitations under the License. */ #include "mkldnn.hpp" -#include "mkldnn_activation_op.h" #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/mkldnn_activation_op.h" namespace paddle { namespace operators { @@ -40,18 +40,24 @@ void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm, const T *dst_data = dst->template mutable_data(ctx.GetPlace()); // get memory dim - PADDLE_ENFORCE(src->dims().size() == 4, - "Input dim must be with 4, i.e. NCHW"); + PADDLE_ENFORCE(src->dims().size() == 2 || src->dims().size() == 4, + "Input dim must be with 2 or 4"); std::vector src_tz = framework::vectorize2int(src->dims()); // create memory description - // TODO(kbinias-intel): support more formats - auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, - mkldnn::memory::format::nchw); + auto data_md = src_tz.size() == 2 + ? 
platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, + mkldnn::memory::format::nc) + : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); // create memory primitives - auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src_data); - auto dst_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)dst_data); + auto src_memory = + mkldnn::memory({data_md, mkldnn_engine}, + static_cast(const_cast(src_data))); + auto dst_memory = + mkldnn::memory({data_md, mkldnn_engine}, + static_cast(const_cast(dst_data))); auto forward_desc = mkldnn::eltwise_forward::desc( mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta); @@ -91,15 +97,21 @@ void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm, std::vector src_tz = framework::vectorize2int(x->dims()); // create memory description - auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, - mkldnn::memory::format::nchw); + auto data_md = src_tz.size() == 2 + ? 
platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, + mkldnn::memory::format::nc) + : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); // create memory primitives - auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src); + auto src_memory = mkldnn::memory( + {data_md, mkldnn_engine}, static_cast(const_cast(src))); auto diff_src_memory = - mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_src); + mkldnn::memory({data_md, mkldnn_engine}, + static_cast(const_cast(diff_src))); auto diff_dst_memory = - mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_dst); + mkldnn::memory({data_md, mkldnn_engine}, + static_cast(const_cast(diff_dst))); auto backward_desc = mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 7fbe4efc045b6539b498389af94769e5bdb1f82e..c4efbcd3f977ee285e13223d7e0ca420aec63b98 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" diff --git a/paddle/fluid/operators/adagrad_op.cc b/paddle/fluid/operators/adagrad_op.cc index c990fe784380bf78a7f3594c0f49ef5e06e6caea..0153e1253b00ded21a7a14e37faf5a76d904d8d1 100644 --- a/paddle/fluid/operators/adagrad_op.cc +++ b/paddle/fluid/operators/adagrad_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/adagrad_op.h" +#include #include diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h index dbcc7abb0996268b5a3571ba113d9cc56f6f65a3..4309f0a5497456065e5c43bc8f7b265fa711f699 100644 --- a/paddle/fluid/operators/array_operator.h +++ b/paddle/fluid/operators/array_operator.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc index e8123cb1a490be642d1061bba8129f63e681d3c3..993610fdedde4bafd99f59a0adeeeef4526eb089 100644 --- a/paddle/fluid/operators/assign_value_op.cc +++ b/paddle/fluid/operators/assign_value_op.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "paddle/fluid/operators/assign_value_op.h" +#include +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h index c7b1a55a5cd52bd2bacbdea3ee22c75c2a2c12d5..e749d6f6d3685f207f0ad4f2ebc7c3c7ae32992c 100644 --- a/paddle/fluid/operators/assign_value_op.h +++ b/paddle/fluid/operators/assign_value_op.h @@ -14,6 +14,7 @@ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc index 71de78b1181daf4bd0b6d73508638857bafcf560..a168eaeab56128b75bbe97d7ccf843a081b5dced 100644 --- a/paddle/fluid/operators/auc_op.cc +++ b/paddle/fluid/operators/auc_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/auc_op.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h index f4e8208c3f2e238a4acecab4579fc955092d5978..8b016c3d31ad83e66baeb298c61840cc529efa1e 100644 --- a/paddle/fluid/operators/auc_op.h +++ b/paddle/fluid/operators/auc_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -40,7 +42,7 @@ class AucKernel : public framework::OpKernel { std::vector thresholds_list; thresholds_list.reserve(num_thresholds); for (int i = 1; i < num_thresholds - 1; i++) { - thresholds_list[i] = (float)i / (num_thresholds - 1); + thresholds_list[i] = static_cast(i) / (num_thresholds - 1); } const float kEpsilon = 1e-7; thresholds_list[0] = 0.0f - kEpsilon; @@ -105,11 +107,12 @@ class AucKernel : public framework::OpKernel { float* fp_rate_data = fp_rate.mutable_data(ctx.GetPlace()); float* rec_rate_data = rec_rate.mutable_data(ctx.GetPlace()); for (int i = 0; i < num_thresholds; i++) { - tp_rate_data[i] = - ((float)tp_data[i] + epsilon) / (tp_data[i] + fn_data[i] + epsilon); - fp_rate_data[i] = (float)fp_data[i] / (fp_data[i] + tn_data[i] + epsilon); - rec_rate_data[i] = - ((float)tp_data[i] + epsilon) / (tp_data[i] + fp_data[i] + epsilon); + tp_rate_data[i] = (static_cast(tp_data[i]) + epsilon) / + (tp_data[i] + fn_data[i] + epsilon); + fp_rate_data[i] = + static_cast(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon); + rec_rate_data[i] = (static_cast(tp_data[i]) + epsilon) / + (tp_data[i] + fp_data[i] + epsilon); } *auc_data = 0.0f; if (curve == "ROC") { diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index c95077fcbdb6b6c0da31f30b795dbe4d7d4fe6fe..b21deaf9258567c05a8816b14ac7d6462964e8ba 100644 --- 
a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -19,15 +19,15 @@ namespace operators { template <> void GetAccumulators( - const framework::ExecutionContext& ctx, int64_t& num_updates_, - int64_t& num_accumulates_, int64_t& old_num_accumulates_) { + const framework::ExecutionContext& ctx, int64_t* num_updates_, + int64_t* num_accumulates_, int64_t* old_num_accumulates_) { auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates"); auto* in_num_accumulates = ctx.Input("in_num_accumulates"); auto* in_num_updates = ctx.Input("in_num_updates"); - old_num_accumulates_ = in_old_num_accumulates->data()[0]; - num_accumulates_ = in_num_accumulates->data()[0]; - num_updates_ = in_num_updates->data()[0]; + *old_num_accumulates_ = in_old_num_accumulates->data()[0]; + *num_accumulates_ = in_num_accumulates->data()[0]; + *num_updates_ = in_num_updates->data()[0]; } template <> diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu index 270c46984465e5ca62eaa8da3955ce7a3eaa0c57..046f72b471fa7ffcc82d84262a668c90a7f577a8 100644 --- a/paddle/fluid/operators/average_accumulates_op.cu +++ b/paddle/fluid/operators/average_accumulates_op.cu @@ -19,18 +19,18 @@ namespace paddle { namespace operators { template <> void GetAccumulators( - const framework::ExecutionContext& ctx, int64_t& num_updates_, - int64_t& num_accumulates_, int64_t& old_num_accumulates_) { + const framework::ExecutionContext& ctx, int64_t* num_updates_, + int64_t* num_accumulates_, int64_t* old_num_accumulates_) { auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates"); auto* in_num_accumulates = ctx.Input("in_num_accumulates"); auto* in_num_updates = ctx.Input("in_num_updates"); auto stream = ctx.cuda_device_context().stream(); - memory::Copy(platform::CPUPlace(), &old_num_accumulates_, + memory::Copy(platform::CPUPlace(), old_num_accumulates_, platform::CUDAPlace(), 
in_old_num_accumulates->data(), sizeof(int64_t), stream); - memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(), + memory::Copy(platform::CPUPlace(), num_accumulates_, platform::CUDAPlace(), in_num_accumulates->data(), sizeof(int64_t), stream); - memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(), + memory::Copy(platform::CPUPlace(), num_updates_, platform::CUDAPlace(), in_num_updates->data(), sizeof(int64_t), stream); } diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h index f858109d1428dc67d94c253e5a39818eb2d4560d..07ac5ced11605f6d0d5164d1c0f69acbd7bbed60 100644 --- a/paddle/fluid/operators/average_accumulates_op.h +++ b/paddle/fluid/operators/average_accumulates_op.h @@ -29,8 +29,8 @@ using EigenVector = framework::EigenVector; template void GetAccumulators(const framework::ExecutionContext& ctx, - int64_t& num_updates, int64_t& num_accumulates, - int64_t& old_num_accumulates); + int64_t* num_updates, int64_t* num_accumulates, + int64_t* old_num_accumulates); template void SetAccumulators(const framework::ExecutionContext& ctx, @@ -47,8 +47,8 @@ class AverageAccumulatesKernel : public framework::OpKernel { int64_t num_updates = 0; int64_t num_accumulates = 0; int64_t old_num_accumulates = 0; - GetAccumulators(ctx, num_updates, num_accumulates, - old_num_accumulates); + GetAccumulators(ctx, &num_updates, &num_accumulates, + &old_num_accumulates); // Get attrs float average_window = ctx.Attr("average_window"); diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt index 3adeeda90645ca983d9d9229b4cc1c4c90302206..719a7465b8d58ef8588ff1e83c2b971eb6fbb00f 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/detail/CMakeLists.txt @@ -5,5 +5,5 @@ if(WITH_DISTRIBUTE) set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) 
cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc) - cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) + cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op) endif() diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index 3cf286575e4b527356972ddec1d9d1ad98caad9b..45f88ec8697d9f3de2612f28889fefc36f7ddbf9 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -137,7 +137,7 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, auto* var = p_scope->FindVar(in_var_name_val); ::grpc::ByteBuffer req; - SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req); + SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val); // var handle VarHandle var_h; diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 2e7bf1921a26fc88d854e4db2c501548695a136a..d5fc163bc25409e0607b149b61c6266b38119d9d 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -138,39 +138,48 @@ class RequestPrefetch final : public RequestBase { framework::Scope* scope, const platform::DeviceContext* dev_ctx, framework::Executor* executor, - framework::ProgramDesc* program, int blkid) + framework::ProgramDesc* program, + framework::ExecutorPrepareContext* prefetch_ctx) : RequestBase(service, cq, dev_ctx), responder_(&ctx_), scope_(scope), executor_(executor), program_(program), - blkid_(blkid) { + prefetch_ctx_(prefetch_ctx) { + request_.reset(new VariableResponse(scope, dev_ctx_)); int method_id = static_cast(detail::GrpcMethod::kPrefetchVariable); - service_->RequestAsyncUnary(method_id, &ctx_, 
&request_, &responder_, cq_, - cq_, this); + service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_, + cq_, cq_, this); } virtual ~RequestPrefetch() {} - virtual std::string GetReqName() { return request_.varname(); } + virtual std::string GetReqName() { return request_->Varname(); } virtual void Process() { // prefetch process... ::grpc::ByteBuffer reply; - // TODO(Yancey1989): execute the Block which containers prefetch ops - VLOG(3) << "RequestPrefetch Process in"; + std::string var_name = request_->OutVarname(); + auto var_desc = program_->Block(0).FindVar(var_name); + framework::Scope* local_scope = &scope_->NewScope(); + auto* var = local_scope->FindVar(var_name); + InitializeVariable(var, var_desc->GetType()); + executor_->RunPreparedContext(prefetch_ctx_, scope_, false, false); + + SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply); responder_.Finish(reply, ::grpc::Status::OK, this); status_ = FINISH; } protected: - sendrecv::VariableMessage request_; + std::shared_ptr request_; ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; framework::Scope* scope_; framework::Executor* executor_; framework::ProgramDesc* program_; + framework::ExecutorPrepareContext* prefetch_ctx_; int blkid_; }; @@ -268,7 +277,7 @@ void AsyncGRPCServer::TryToRegisterNewPrefetchOne() { } RequestPrefetch* prefetch = new RequestPrefetch(&service_, cq_prefetch_.get(), scope_, dev_ctx_, - executor_, program_, prefetch_blk_id_); + executor_, program_, prefetch_ctx_); VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status(); } diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index 380447f47c142bdc16e60f78c4b2d94235ec5060..b6110f92ed4f38a156e0c99ecfb399f3f47a169e 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -63,6 +63,10 @@ class AsyncGRPCServer final { void SetExecutor(framework::Executor *executor) { executor_ = executor; } + void 
SetPrefetchPreparedCtx(framework::ExecutorPrepareContext *prepared) { + prefetch_ctx_ = prepared; + } + int GetSelectedPort() { return selected_port_; } const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); } @@ -111,6 +115,7 @@ class AsyncGRPCServer final { std::unique_ptr t_prefetch_; int prefetch_blk_id_; + framework::ExecutorPrepareContext *prefetch_ctx_; framework::ProgramDesc *program_; framework::Executor *executor_; int selected_port_; diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/grpc_server_test.cc index b89aed0157de8e95564015b3e7f42316a39537f5..c51933718f4ca78e87c77e007c485642000d247d 100644 --- a/paddle/fluid/operators/detail/grpc_server_test.cc +++ b/paddle/fluid/operators/detail/grpc_server_test.cc @@ -20,43 +20,121 @@ limitations under the License. */ #include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + namespace framework = paddle::framework; namespace platform = paddle::platform; namespace detail = paddle::operators::detail; +USE_OP(lookup_table); + std::unique_ptr rpc_service_; +framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { + auto root_block = program->MutableBlock(0); + auto* block = program->AppendBlock(*root_block); + + framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}}); + framework::VariableNameMap output({{"Output", {"out"}}}); + auto op = block->AppendOp(); + op->SetType("lookup_table"); + op->SetInput("W", {"w"}); + op->SetInput("Ids", {"ids"}); + op->SetOutput("Out", {"out"}); + + auto& out = *root_block->Var("out"); + out.SetType(framework::proto::VarType::SELECTED_ROWS); + out.SetShape({10, 10}); + + return block; +} + +void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { + auto w_var = scope->Var("w"); + 
w_var->GetMutable(); + + auto out_var = scope->Var("out"); + out_var->GetMutable(); + + auto ids_var = scope->Var("ids"); + ids_var->GetMutable(); +} + +void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, + int64_t rows_numel) { + CreateVarsOnScope(scope, place); + auto ids_var = scope->Var("ids")->GetMutable(); + auto rows = ids_var->mutable_rows(); + for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i * 2); + ids_var->mutable_value()->Resize({rows_numel, 1}); + ids_var->mutable_value()->mutable_data(*place); +} + +void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, + int64_t rows_numel) { + CreateVarsOnScope(scope, place); + auto w = scope->Var("w")->GetMutable(); + auto rows = w->mutable_rows(); + for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i); + auto w_value = w->mutable_value(); + w_value->Resize({rows_numel, 10}); + + auto ptr = w_value->mutable_data(*place); + + for (int64_t i = 0; i < w_value->numel(); ++i) { + ptr[i] = static_cast(i / 10); + } +} + void StartServer(const std::string& endpoint) { rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); + framework::ProgramDesc program; + framework::Scope scope; + platform::CPUPlace place; + framework::Executor exe(place); + platform::CPUDeviceContext ctx(place); + auto* block = AppendPrefetchBlcok(&program); + auto prepared = exe.Prepare(program, block->ID()); + InitTensorsOnServer(&scope, &place, 10); + + rpc_service_->SetProgram(&program); + rpc_service_->SetPrefetchPreparedCtx(prepared.get()); + rpc_service_->SetDevCtx(&ctx); + rpc_service_->SetScope(&scope); + rpc_service_->SetExecutor(&exe); + rpc_service_->RunSyncUpdate(); } TEST(PREFETCH, CPU) { // start up a server instance backend - // TODO(Yancey1989): Need to start a server with optimize blocks and - // prefetch blocks. 
std::thread server_thread(StartServer, "127.0.0.1:8889"); + sleep(2); framework::Scope scope; platform::CPUPlace place; platform::CPUDeviceContext ctx(place); // create var on local scope - std::string in_var_name("in"); + int64_t rows_numel = 5; + InitTensorsOnClient(&scope, &place, rows_numel); + std::string in_var_name("ids"); std::string out_var_name("out"); - auto* in_var = scope.Var(in_var_name); - auto* in_tensor = in_var->GetMutable(); - in_tensor->Resize({10, 10}); - VLOG(3) << "before mutable_data"; - in_tensor->mutable_data(place); - scope.Var(out_var_name); - - VLOG(3) << "before fetch"; detail::RPCClient client; client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name, out_var_name); client.Wait(); + auto var = scope.Var(out_var_name); + auto value = var->GetMutable()->value(); + auto ptr = value.mutable_data(place); + rpc_service_->ShutDown(); server_thread.join(); rpc_service_.reset(nullptr); + + for (int64_t i = 0; i < rows_numel; ++i) { + EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast(i * 2)); + } } diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto index fc12e82a7e6bd10262092d1ca367980df64e91c2..02bb2b9cebb87b83aa1cbef0c644f969b4d17284 100644 --- a/paddle/fluid/operators/detail/send_recv.proto +++ b/paddle/fluid/operators/detail/send_recv.proto @@ -21,7 +21,7 @@ service SendRecvService { rpc SendVariable(VariableMessage) returns (VoidMessage) {} // Argument VariableMessage for GetVariable should only contain varname. rpc GetVariable(VariableMessage) returns (VariableMessage) {} - // Prefetch variable by Ids + // pre-fetch variable by given variable name and Ids rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} } @@ -67,6 +67,8 @@ message VariableMessage { bytes serialized = 8; // selected_rows data bytes rows = 9; + // Look up table block execution output variable name. 
+ string out_varname = 10; } message VoidMessage {} diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc index f8576d01b10f4c0fda4d12d371b2966739acfc21..16c612c45a37dd2ffd17f8d5f5946df30e9b3fe6 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.cc +++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc @@ -30,11 +30,9 @@ namespace detail { void SerializeToByteBuffer(const std::string& name, framework::Variable* var, const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg) { + ::grpc::ByteBuffer* msg, + const std::string& out_name) { using VarMsg = sendrecv::VariableMessage; - sendrecv::VariableMessage request; - std::string header; - request.AppendToString(&header); // When using GPU, need to free the copied CPU buffer // when the ByteBuffer destroies // TODO(typhoonzero): add unref here, if we have dependent @@ -52,6 +50,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, e.WriteUint64(VarMsg::kTypeFieldNumber, 1); } + if (!out_name.empty()) { + e.WriteString(VarMsg::kOutVarnameFieldNumber, out_name); + } switch (framework::ToVarType(var->Type())) { case framework::proto::VarType_Type_LOD_TENSOR: { auto tensor = var->Get(); diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h index d7954440846b8db9a9add0110fb9a546a762774d..c72e1bd076f670458f3915072154847db6205092 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.h +++ b/paddle/fluid/operators/detail/sendrecvop_utils.h @@ -46,7 +46,8 @@ typedef void (*DestroyCallback)(void*); void SerializeToByteBuffer(const std::string& name, framework::Variable* var, const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg); + ::grpc::ByteBuffer* msg, + const std::string& out_varname = std::string()); void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, const platform::DeviceContext& ctx, diff --git 
a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc index 78e1d274a92241b5f2093beb63acdc8c497dfb83..c9d7fd6d1581f6f4182e9e3e0d633c13a3c336a5 100644 --- a/paddle/fluid/operators/detail/variable_response.cc +++ b/paddle/fluid/operators/detail/variable_response.cc @@ -416,6 +416,20 @@ int VariableResponse::Parse(Source* source) { } break; } + case sendrecv::VariableMessage::kOutVarnameFieldNumber: { + uint32_t length; + if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { + return tag; + } + + std::string temp; + if (!input.ReadString(&temp, length)) { + return tag; + } + + meta_.set_out_varname(temp); + break; + } default: { // Unknown tag, return unknown error. diff --git a/paddle/fluid/operators/detail/variable_response.h b/paddle/fluid/operators/detail/variable_response.h index 050b6b84010b4f3e95bc88e5bb738ff18b7fe423..93b0d3cfb4f7d7f336414361773f872d7b259482 100644 --- a/paddle/fluid/operators/detail/variable_response.h +++ b/paddle/fluid/operators/detail/variable_response.h @@ -55,6 +55,7 @@ class VariableResponse { int Parse(const ::grpc::ByteBuffer& byte_buffer); inline std::string Varname() { return meta_.varname(); } + inline std::string OutVarname() { return meta_.out_varname(); } // should call parse first. framework::Variable* GetVar() { return scope_->FindVar(meta_.varname()); } diff --git a/paddle/fluid/operators/go_op.cc b/paddle/fluid/operators/go_op.cc index cfa797717d78aa72e1b931b6db6e153270b3424e..58fe32446217e07235b40b9b78190094e57e4951 100644 --- a/paddle/fluid/operators/go_op.cc +++ b/paddle/fluid/operators/go_op.cc @@ -56,11 +56,11 @@ class GoOp : public framework::OperatorBase { // TODO(varunarora): Consider moving this root scope lookup to scope.h. 
const framework::Scope *root_scope = &scope; - const framework::Scope *parent_scope = &(root_scope->parent()); + const framework::Scope *parent_scope = root_scope->parent(); while (parent_scope != nullptr) { root_scope = parent_scope; - parent_scope = &(parent_scope->parent()); + parent_scope = parent_scope->parent(); } framework::BlockDesc *block = Attr(kBlock); diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h index 99f01c2a255ade81421c2bba95ff3d38ced6f87c..bd19d8908e35e51872d324ea5aa6bb02110d5a92 100644 --- a/paddle/fluid/operators/lod_reset_op.h +++ b/paddle/fluid/operators/lod_reset_op.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -35,7 +37,7 @@ class LoDResetKernel : public framework::OpKernel { if (lod_t->lod().size() > 0) { auto y_lod = lod_t->lod(); auto last_level = y_lod[y_lod.size() - 1]; - PADDLE_ENFORCE_EQ(last_level.back(), in->dims()[0], + PADDLE_ENFORCE_EQ((int64_t)(last_level.back()), in->dims()[0], "Last value of `Y`'s last level LoD should be equal " "to the first dimension of `X`"); out->set_lod(y_lod); diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc index 277f2856c07b3fec2113486539aec1d9139fae92..04c5872bef4600e30ba572a025cc5f0a5e9839ca 100644 --- a/paddle/fluid/operators/reader/create_batch_reader_op.cc +++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc @@ -39,10 +39,13 @@ class CreateBatchReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { - const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) - ->Get(); auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); + if (out->Get() != nullptr) { + return; + } + const auto& underlying_reader = 
scope.FindVar(Input("UnderlyingReader")) + ->Get(); out->Reset( new BatchReader(underlying_reader.Get(), Attr("batch_size"))); } diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 96c0c1cbe6d588364416925a7ab1bc8f90ac6fd7..ed868786ab2a80efa42574ed4f579c633ce0becf 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -99,10 +99,13 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { - const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) - ->Get(); auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); + if (out->Get() != nullptr) { + return; + } + const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) + ->Get(); auto place_str = Attr("place"); platform::Place place; diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc index 47d9989bc8748840ec2d39587fde24355d90b6b4..b72ccc77a3e1ec30fd817471d3ffd667974ae684 100644 --- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc +++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc @@ -62,12 +62,15 @@ class CreateMultiPassReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { + auto* out = detail::Ref(scope.FindVar(Output("Out"))) + .GetMutable(); + if (out->Get() != nullptr) { + return; + } const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) ->Get(); - auto& out = detail::Ref(scope.FindVar(Output("Out"))); int pass_num = Attr("pass_num"); - out.GetMutable()->Reset( - new MultiPassReader(underlying_reader.Get(), pass_num)); + out->Reset(new 
MultiPassReader(underlying_reader.Get(), pass_num)); } }; diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc index 3a1f3805a0483c2f5eabdc7432556051d8308964..b164ce232d6bea7b4ff0c67ee0a7dd83b14f61a2 100644 --- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc +++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc @@ -80,10 +80,14 @@ class CreateShuffleReaderOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { + auto* out = detail::Ref(scope.FindVar(Output("Out"))) + .GetMutable(); + if (out->Get() != nullptr) { + return; + } const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) ->Get(); - auto& var = detail::Ref(scope.FindVar(Output("Out"))); - var.GetMutable()->Reset( + out->Reset( new ShuffleReader(underlying_reader.Get(), static_cast(Attr("buffer_size")))); } diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc index f1c4415f27d54ad09e5cb3659bd16abd82e38215..8c55b4ebbc88f696e99b1194055bed3b0d0b3f0b 100644 --- a/paddle/fluid/operators/spp_op.cc +++ b/paddle/fluid/operators/spp_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/spp_op.h" +#include +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index 3d2f22632570fe2a28a822370a400390c78b533a..08cb7849d20443862b66ea6096c095b294c7242c 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index d3d5c8a3429e2070c5472355b4440401eaa699cb..9061e137bd1c789d34665729c48c1c2ea9525c8e 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -10,6 +10,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sum_op.h" +#include +#include #include #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/operators/detail/safe_ref.h" diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index e7e5346cdca5efaf81c2b0fddedde7406e3b874d..49a4afb3a8a19c97e844e66477c6288772ece807 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h index 42828b7e6564d7da91d608d63fbc0615ef6c4f97..9f8482adedb4c29e32d4109941a2752d942ae49f 100644 --- a/paddle/fluid/operators/top_k_op.h +++ b/paddle/fluid/operators/top_k_op.h @@ -15,6 +15,8 @@ limitations under the License. 
*/ #pragma once #include #include +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 87b1f530e08df7022d112b26e28511a982052126..4aea9cd65bed615c84c95d891a0a4092678e1444 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/transpose_op.h" +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index 90f16499a6f52514bfed3dbeb4176ccc956b23d7..895d1ce2cca19c0c1e4aa03cc64eb1425e8bab1a 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc index 0ca7ea00fafc5cf7ab240e1e41710d3b791dfbfb..31859fd1d70dc6e6387258cd5f7412e78a302567 100644 --- a/paddle/fluid/operators/unpool_op.cc +++ b/paddle/fluid/operators/unpool_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/unpool_op.h" +#include +#include namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h index a4421045756bd39728fc14c06efd11a56c7e55af..96abad3de9b959ee611355c67f1fa9e56c430b1b 100644 --- a/paddle/fluid/operators/unpool_op.h +++ b/paddle/fluid/operators/unpool_op.h @@ -14,6 +14,8 @@ limitations under the License. 
*/ #pragma once +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/unpooling.h" diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index 3e3e3089315ab9365925c38b9bce5fb0120d37c3..afbfe69973830bde93ec0af8d1c844580a786663 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_padding.h" diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 6780b8cc6deca64e9eaefa0b40d309449e730c8c..917bdc64abf608b8ade70c47f76a8adffb32046a 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -42,12 +42,12 @@ ENDIF() # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies -cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator - system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) +cc_library(device_context SRCS device_context.cc DEPS malloc + place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) -nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) +nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS}) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) diff --git a/paddle/fluid/platform/call_once.h b/paddle/fluid/platform/call_once.h deleted file mode 100644 index 
fa34972c38d6e7f77a7e178d68592f9886748fa1..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/call_once.h +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -namespace paddle { -namespace platform { - -/* - The current implementation of std::call_once has a bug described in - https://stackoverflow.com/questions/41717579/stdcall-once-hangs-on-second-call-after-callable-threw-on-first-call. - This is likely caused by a deeper bug of pthread_once, which is discussed in - https://patchwork.ozlabs.org/patch/482350/ - - This wrap is a hack to avoid this bug. -*/ -template -inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) { - bool good = true; - std::exception ex; - try { - std::call_once(flag, - [&](Args&&... args) { - try { - f(args...); - } catch (const std::exception& e) { - ex = e; - good = false; - } catch (...) 
{ - ex = std::runtime_error("excption caught in call_once"); - good = false; - } - }, - args...); - } catch (std::system_error& x) { - throw std::runtime_error("call once failed"); - } - if (!good) { - throw std::exception(ex); - } -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h index d21e29df3cf9b2d78920d8bac41209d200b5ba3a..c5a10a78a4f432b431680c089f255fea777277cb 100644 --- a/paddle/fluid/platform/dynload/nccl.h +++ b/paddle/fluid/platform/dynload/nccl.h @@ -18,7 +18,6 @@ limitations under the License. */ #include // NOLINT -#include "paddle/fluid/platform/call_once.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" namespace paddle { diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc index b716ad9df41330bd6e22937381d24e33fa3a7914..d60aecf96c8828a5656f81fd3602cfb2e66990cf 100644 --- a/paddle/fluid/platform/float16_test.cc +++ b/paddle/fluid/platform/float16_test.cc @@ -8,13 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/fluid/platform/float16.h" + +#include + +#include "gtest/gtest.h" #include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/lod_tensor.h" -#include - namespace paddle { namespace platform { @@ -74,24 +75,27 @@ TEST(float16, conversion_cpu) { // Conversion operator EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00); - EXPECT_EQ(float(float16(0.5f)), 0.5f); - EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001); - EXPECT_EQ(int(float16(-1)), -1); - EXPECT_EQ(bool(float16(true)), true); + EXPECT_EQ(static_cast(float16(0.5f)), 0.5f); + EXPECT_NEAR(static_cast(float16(0.33333)), 0.33333, 0.0001); + EXPECT_EQ(static_cast(float16(-1)), -1); + EXPECT_EQ(static_cast(float16(true)), true); } TEST(float16, arithmetic_cpu) { - EXPECT_EQ(float(float16(1) + float16(1)), 2); - EXPECT_EQ(float(float16(5) + float16(-5)), 0); - EXPECT_NEAR(float(float16(0.33333f) + float16(0.66667f)), 1.0f, 0.001); - EXPECT_EQ(float(float16(3) - float16(5)), -2); - EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 0.33334f, 0.001); - EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01); - EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01); - EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001); - EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f); - EXPECT_EQ(float(-float16(512.0f)), -512.0f); - EXPECT_EQ(float(-float16(-512.0f)), 512.0f); + EXPECT_EQ(static_cast(float16(1) + float16(1)), 2); + EXPECT_EQ(static_cast(float16(5) + float16(-5)), 0); + EXPECT_NEAR(static_cast(float16(0.33333f) + float16(0.66667f)), 1.0f, + 0.001); + EXPECT_EQ(static_cast(float16(3) - float16(5)), -2); + EXPECT_NEAR(static_cast(float16(0.66667f) - float16(0.33333f)), + 0.33334f, 0.001); + EXPECT_NEAR(static_cast(float16(3.3f) * float16(2.0f)), 6.6f, 0.01); + EXPECT_NEAR(static_cast(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01); + EXPECT_NEAR(static_cast(float16(2.0f) / float16(3.0f)), 0.66667f, + 0.001); + 
EXPECT_EQ(static_cast(float16(1.0f) / float16(2.0f)), 0.5f); + EXPECT_EQ(static_cast(-float16(512.0f)), -512.0f); + EXPECT_EQ(static_cast(-float16(-512.0f)), 512.0f); } TEST(float16, comparison_cpu) { diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index 567209df4edc483bcb5c6264c62034ddff50c413..577fc24ceb1d3c83cc0546dc5db9c8c7c1f01f86 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -36,19 +36,19 @@ limitations under the License. */ half *in1, *in2, *out; \ half *d_in1, *d_in2, *d_out; \ int size = sizeof(half); \ - cudaMalloc((void**)&d_in1, size); \ - cudaMalloc((void**)&d_in2, size); \ - cudaMalloc((void**)&d_out, size); \ - in1 = (half*)malloc(size); \ - in2 = (half*)malloc(size); \ - out = (half*)malloc(size); \ + cudaMalloc(reinterpret_cast(&d_in1), size); \ + cudaMalloc(reinterpret_cast(&d_in2), size); \ + cudaMalloc(reinterpret_cast(&d_out), size); \ + in1 = reinterpret_cast(malloc(size)); \ + in2 = reinterpret_cast(malloc(size)); \ + out = reinterpret_cast(malloc(size)); \ in1[0] = half(float16(v_in1)); \ in2[0] = half(float16(v_in2)); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \ - EXPECT_EQ(float(float16(out[0])), v_out); \ + EXPECT_EQ(static_cast(float16(out[0])), v_out); \ free(in1); \ free(in2); \ free(out); \ @@ -63,17 +63,17 @@ limitations under the License. 
*/ half *in1, *in2; \ half *d_in1, *d_in2; \ int size = sizeof(half); \ - cudaMalloc((void**)&d_in1, size); \ - cudaMalloc((void**)&d_in2, size); \ - in1 = (half*)malloc(size); \ - in2 = (half*)malloc(size); \ + cudaMalloc(reinterpret_cast(&d_in1), size); \ + cudaMalloc(reinterpret_cast(&d_in2), size); \ + in1 = reinterpret_cast(malloc(size)); \ + in2 = reinterpret_cast(malloc(size)); \ in1[0] = half(float16(v_in1)); \ in2[0] = half(float16(v_in2)); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ op_type<<<1, 1>>>(d_in1, d_in2); \ cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \ - EXPECT_EQ(float(float16(in1[0])), v_out); \ + EXPECT_EQ(static_cast(float16(in1[0])), v_out); \ free(in1); \ free(in2); \ cudaFree(d_in1); \ @@ -87,12 +87,12 @@ limitations under the License. */ half *d_in1, *d_in2; \ bool *out, *d_out; \ int size = sizeof(half); \ - cudaMalloc((void**)&d_in1, size); \ - cudaMalloc((void**)&d_in2, size); \ - cudaMalloc((void**)&d_out, 1); \ - in1 = (half*)malloc(size); \ - in2 = (half*)malloc(size); \ - out = (bool*)malloc(1); \ + cudaMalloc(reinterpret_cast(&d_in1), size); \ + cudaMalloc(reinterpret_cast(&d_in2), size); \ + cudaMalloc(reinterpret_cast(&d_out), 1); \ + in1 = reinterpret_cast(malloc(size)); \ + in2 = reinterpret_cast(malloc(size)); \ + out = reinterpret_cast(malloc(1)); \ in1[0] = half(float16(v_in1)); \ in2[0] = half(float16(v_in2)); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ @@ -130,13 +130,13 @@ void TestNeg(float v_in, float v_out) { LOG(INFO) << "Test Neg on GPU!"; half *in, *d_in; int size = sizeof(half); - cudaMalloc((void**)&d_in, size); - in = (half*)malloc(size); + cudaMalloc(reinterpret_cast(&d_in), size); + in = reinterpret_cast(malloc(size)); in[0] = half(float16(v_in)); cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice); Neg<<<1, 1>>>(d_in); cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost); - EXPECT_EQ(float(float16(in[0])), v_out); + 
EXPECT_EQ(static_cast(float16(in[0])), v_out); free(in); cudaFree(d_in); } diff --git a/paddle/fluid/pybind/.gitignore b/paddle/fluid/pybind/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..8f222791edb016df65be5db75831f5f83cf63726 --- /dev/null +++ b/paddle/fluid/pybind/.gitignore @@ -0,0 +1 @@ +pybind.h diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 787925d9f8800b49de5b8b642304605ef4087d1e..884289a7fda65f9713392ec459219b4c89271e73 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -2,13 +2,13 @@ if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc - DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method + DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method parallel_executor ${GLOB_OP_LIB}) else() cc_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc - DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method + DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method parallel_executor ${GLOB_OP_LIB}) if(NOT APPLE AND NOT ANDROID) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 748ad75a99ea4955730327a10ae8468a107fed0a..bd8446df6650f5fb1c62e5370fd48216dbf31e17 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -544,13 +544,20 @@ All parameter, weight, gradient are variables in Paddle. 
[](ParallelExecutor &self, size_t num_threads, bool use_event, const std::vector &places, const std::unordered_set ¶ms, - const ProgramDesc &startup_program, + const std::unordered_set &bcast_vars, const ProgramDesc &main_program, const std::string &loss_var_name, - Scope *scope, bool allow_op_delay) { - new (&self) ParallelExecutor(num_threads, use_event, places, - params, startup_program, main_program, - loss_var_name, scope, allow_op_delay); + Scope *scope, std::vector &local_scopes, + bool allow_op_delay) { + new (&self) + ParallelExecutor(num_threads, use_event, places, params, + bcast_vars, main_program, loss_var_name, + scope, local_scopes, allow_op_delay); }) + .def("local_scopes", + [](ParallelExecutor &self) -> std::vector * { + return &self.GetLocalScopes(); + }, + py::return_value_policy::reference) .def("run", &ParallelExecutor::Run); BindRecordIOWriter(&m); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index fbe953b2d8f12ca529f3daa01cc8e2fe8875a416..4a9dbd324c90380e784cc9457845fabd858585be 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include #include #include #include diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index 77f84cd43bdf35ae6f54b0db2b5f720d24872878..a1f446817e0cbc1b4391398a82b0846d01bbec2c 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -6,6 +6,6 @@ if(WITH_TESTING) add_library(paddle_test_util STATIC TestUtil.cpp) add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) if(NOT MOBILE_INFERENCE) - cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init paddle_memory gtest gflags) + cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init memory gtest gflags) endif() endif() diff --git a/python/.gitignore b/python/.gitignore index 1ba1d4c9b0301ed920f5303089e75dd3a8e4e3fa..53a2b7a76b0dd2d9095f9582540e455e2c1174e2 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -1,6 +1,7 @@ *pyc build dist +paddlepaddle.egg-info paddle.egg-info paddlepaddle_gpu.egg-info .idea diff --git a/python/paddle/.gitignore b/python/paddle/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..98527864664d32f798edc06a53131e8d5a068295 --- /dev/null +++ b/python/paddle/.gitignore @@ -0,0 +1 @@ +version.py diff --git a/python/paddle/fluid/debuger.py b/python/paddle/fluid/debuger.py index 7b4afa9bf65e1369329cd4648c1f5c4bd8fa8357..1c56064a1e8bdc5d975837cb5a75a40d557765ad 100644 --- a/python/paddle/fluid/debuger.py +++ b/python/paddle/fluid/debuger.py @@ -16,6 +16,7 @@ import sys import re from graphviz import GraphPreviewGenerator import proto.framework_pb2 as framework_pb2 +from google.protobuf import text_format _vartype2str_ = [ "UNK", @@ -100,7 +101,7 @@ def repr_var(vardesc): def pprint_program_codes(program_desc): reprs = [] - for block_idx in range(program_desc.num_blocks()): + for block_idx in range(program_desc.desc.num_blocks()): block_desc = program_desc.block(block_idx) block_repr = pprint_block_codes(block_desc) reprs.append(block_repr) @@ -127,7 
+128,7 @@ def pprint_block_codes(block_desc, show_backward=False): if type(block_desc) is not framework_pb2.BlockDesc: block_desc = framework_pb2.BlockDesc.FromString( - block_desc.serialize_to_string()) + block_desc.desc.serialize_to_string()) var_reprs = [] op_reprs = [] for var in block_desc.vars: @@ -237,13 +238,13 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"): # draw parameters and args vars = {} for var in desc.vars: - shape = [str(i) for i in var.lod_tensor.tensor.dims] - if not shape: - shape = ['null'] + # TODO(gongwb): format the var.type # create var if var.persistable: varn = graph.add_param( - var.name, var.type, shape, highlight=need_highlight(var.name)) + var.name, + str(var.type).replace("\n", "
", 1), + highlight=need_highlight(var.name)) else: varn = graph.add_arg(var.name, highlight=need_highlight(var.name)) vars[var.name] = varn @@ -268,4 +269,4 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"): for var in op.outputs: add_op_link_var(opn, var, True) - graph(path, show=True) + graph(path, show=False) diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index 7a2a81be9f269f262160cd082ec3a1d8e8e46811..3c6be913200716ae4f70e2b48ee8faf8078223d2 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -102,6 +102,8 @@ def split_dense_variable(var_list, the parameter server side can gain better performance. By default minimum block size is 1024. The max block size is used to prevent very large blocks that may cause send error. + :return: A list of VarBlocks. Each VarBlock specifies a shard of + the var. """ blocks = [] for var in var_list: @@ -192,22 +194,24 @@ class DistributeTranspiler: self.trainer_id = trainer_id pserver_endpoints = pservers.split(",") - # step1 + # step1: For large parameters and gradients, split them into smaller + # blocks. param_list = [pg[0] for pg in params_grads] grad_list = [pg[1] for pg in params_grads] grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints)) param_blocks = split_dense_variable(param_list, len(pserver_endpoints)) - # step2 + # step2: Create new vars for the parameters and gradients blocks and + # add ops to do the split. grad_var_mapping = self._append_split_op(program, grad_blocks) - # step3 + param_var_mapping = self._create_vars_from_blocklist(program, + param_blocks) + # step3: Add gradients as send op inputs and parameters as send + # op outputs. 
send_inputs = [] send_outputs = [] for b in grad_blocks: # append by order varname, block_id, _ = b.split(":") send_inputs.append(grad_var_mapping[varname][int(block_id)]) - - param_var_mapping = self._create_vars_from_blocklist(program, - param_blocks) for b in param_blocks: varname, block_id, _ = b.split(":") send_outputs.append(param_var_mapping[varname][int(block_id)]) @@ -237,7 +241,7 @@ class DistributeTranspiler: "RPCClient": rpc_client_var}, attrs={"endpoints": pserver_endpoints, "epmap": eplist}) - # step4 + # step4: Concat the parameters splits together after recv. for varname, splited_var in param_var_mapping.iteritems(): if len(splited_var) <= 1: continue @@ -258,13 +262,14 @@ class DistributeTranspiler: def get_pserver_program(self, endpoint): """ Get pserver side program using the endpoint. + TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers. NOTE: assume blocks of the same variable is not distributed on the same pserver, only change param/grad varnames for trainers to fetch. """ # step1 pserver_program = Program() - # step2 + # step2: Create vars to receive vars at parameter servers. 
recv_inputs = [] for v in self.param_grad_ep_mapping[endpoint]["params"]: self._clone_var(pserver_program.global_block(), v) @@ -278,12 +283,6 @@ class DistributeTranspiler: orig_var_name = v.name[:suff_idx] else: orig_var_name = v.name - single_trainer_var = pserver_program.global_block().create_var( - name=orig_var_name, - persistable=True, - type=v.type, - dtype=v.dtype, - shape=v.shape) if self.trainers > 1: for trainer_id in xrange(self.trainers): var = pserver_program.global_block().create_var( @@ -294,6 +293,12 @@ class DistributeTranspiler: shape=v.shape) recv_inputs.append(var) else: + single_trainer_var = pserver_program.global_block().create_var( + name=orig_var_name, + persistable=True, + type=v.type, + dtype=v.dtype, + shape=v.shape) recv_inputs.append(single_trainer_var) # step3 @@ -344,7 +349,7 @@ class DistributeTranspiler: self._append_pserver_non_opt_ops(block, op) append_block = optimize_block - # append lr decay ops to the child block if exits + # append lr decay ops to the child block if exists lr_ops = self._get_lr_ops() if len(lr_ops) > 0: for _, op in enumerate(lr_ops): @@ -447,8 +452,10 @@ class DistributeTranspiler: block_list, add_trainer_suffix=False): """ + Create vars for each split. NOTE: only grads need to be named for different trainers, use add_trainer_suffix to rename the grad vars. + :return: A dict mapping from original var name to each var split. """ block_map = dict() var_mapping = dict() @@ -615,6 +622,7 @@ class DistributeTranspiler: type="sum", inputs={"X": vars2merge}, outputs={"Out": merged_var}) + # TODO(panyx0718): What if it's SELECTED_ROWS. 
if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS: optimize_block.append_op( type="scale", @@ -638,7 +646,7 @@ class DistributeTranspiler: shape=param_block.shape) new_inputs[key] = tmpvar elif key == "LearningRate": - # leraning rate variable has already be created by non-optimize op, + # learning rate variable has already be created by non-optimize op, # don't create it once again. lr_varname = opt_op.input(key)[0] if pserver_block.vars.has_key(lr_varname): @@ -773,6 +781,7 @@ class DistributeTranspiler: return False def _get_input_map_from_op(self, varmap, op): + """Returns a dict from op input name to the vars in varmap.""" iomap = dict() for key in op.input_names: vars = [] @@ -785,6 +794,7 @@ class DistributeTranspiler: return iomap def _get_output_map_from_op(self, varmap, op): + """Returns a dict from op output name to the vars in varmap.""" iomap = dict() for key in op.output_names: vars = [] @@ -812,6 +822,7 @@ class DistributeTranspiler: find_ops.append(op) # make a union find struct by the ops in default_main_program ufind = UnionFind(block.ops) + for op1 in block.ops: for op2 in block.ops: # NOTE: we need to skip all optimize ops, since it is connected diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 8bd9161fcb2c38fb71e4493afec2095c1b9833dd..33cf6918178ff746a6b130af0e23a69de0f532fe 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -640,12 +640,26 @@ class Operator(object): """ return self.desc.block_attr(name) + def all_attrs(self): + """ + Get the attribute dict + Returns(dict): The Operator's attribute dict + """ + attr_names = self.attr_names + attr_map = {} + for n in attr_names: + if n == 'sub_block': + attr_map[n] = self.block_attr(n) + else: + attr_map[n] = self.attr(n) + return attr_map + class Block(object): def __init__(self, program, idx): self.desc = program.desc.block(idx) self.vars = dict() # var_name --> var - self.ops = collections.deque() # operator 
list + self.ops = list() # operator list self.program = program self.removed_vars = dict() @@ -817,6 +831,13 @@ class Block(object): self.ops.append(op) return op + def insert_op(self, index, *args, **kwargs): + self.sync_with_cpp() + op_desc = self.desc.insert_op(index) + op = Operator(block=self, desc=op_desc, *args, **kwargs) + self.ops.insert(index, op) + return op + def delete_ops(self, ops): # remove from cpp # FIXME(typhoonzero): remove only the first occurrence. @@ -828,17 +849,17 @@ class Block(object): self.desc.remove_op(start, end + 1) def slice_ops(self, start, end): - return list(self.ops)[start:end] + return self.ops[start:end] def prepend_op(self, *args, **kwargs): op_desc = self.desc.prepend_op() op = Operator(self, op_desc, *args, **kwargs) - self.ops.appendleft(op) + self.ops.insert(0, op) return op def sync_with_cpp(self): """ - Sync with the desc on the c++ end. + Sync from the desc on the c++ end. This method is used to synchronize the c++ desc instance generated by backward. 
""" @@ -878,7 +899,7 @@ class Block(object): for index in range((start_index - 1 - 1), -1, -1): op_desc = ops_in_cpp[index] op = Operator(self, op_desc) - self.ops.appendleft(op) + self.ops.insert(0, op) # sync ops append to the end of cpp_ops for index in range((end_index + 1), len(ops_in_cpp)): diff --git a/python/paddle/fluid/graphviz.py b/python/paddle/fluid/graphviz.py index b8d21344fc8f65f4025f28a195dab2d371b30292..125b4efa9d476e561bd78d0365cd92bbf7e66605 100644 --- a/python/paddle/fluid/graphviz.py +++ b/python/paddle/fluid/graphviz.py @@ -83,7 +83,7 @@ class Graph(object): file = open(dot_path, 'w') file.write(self.__str__()) image_path = os.path.join( - os.path.dirname(__file__), dot_path[:-3] + "pdf") + os.path.dirname(dot_path), dot_path[:-3] + "pdf") cmd = ["dot", "-Tpdf", dot_path, "-o", image_path] subprocess.Popen( cmd, @@ -199,7 +199,7 @@ class GraphPreviewGenerator(object): else: self.graph.show(path) - def add_param(self, name, data_type, shape, highlight=False): + def add_param(self, name, data_type, highlight=False): label = '\n'.join([ '<', ' ', @@ -214,11 +214,6 @@ class GraphPreviewGenerator(object): str(data_type), ' ' ' ', - ' ', - ' ' - ' ', '
', - '[%s]' % 'x'.join(shape), - '
>', ]) return self.graph.node( diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index bd7e9c30fed2c38a206bf17a646d8a4433af4099..969398bda4cfd0b2f5e39f45d34a1da9b216901f 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -255,7 +255,32 @@ def _copy_reader_var_(block, var): new_var.desc.set_shapes(var.desc.shapes()) new_var.desc.set_dtypes(var.desc.dtypes()) new_var.persistable = True - return monkey_patch_reader_methods(new_var) + return new_var + + +def _copy_reader_create_op_(block, op): + input_param_names = op.input_names + new_input_map = {} + for param_name in input_param_names: + new_input_map[param_name] = [] + arg_names = op.input(param_name) + for arg_name in arg_names: + new_input_map[param_name].append(block.var(arg_name)) + + output_param_names = op.output_names + new_output_map = {} + for param_name in output_param_names: + new_output_map[param_name] = [] + arg_names = op.output(param_name) + for arg_name in arg_names: + new_output_map[param_name].append(block.var(arg_name)) + + new_op = block.append_op( + type=op.type, + inputs=new_input_map, + outputs=new_output_map, + attrs=op.all_attrs()) + return new_op def open_recordio_file(filename, shapes, lod_levels, dtypes): @@ -283,8 +308,9 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes): startup_var.desc.set_dtypes(dtypes) startup_var.persistable = True - return _copy_reader_var_(default_main_program().current_block(), - startup_var) + main_prog_var = _copy_reader_var_(default_main_program().current_block(), + startup_var) + return monkey_patch_reader_methods(main_prog_var) def open_files(filenames, thread_num, shapes, lod_levels, dtypes): @@ -313,22 +339,25 @@ def open_files(filenames, thread_num, shapes, lod_levels, dtypes): startup_var.desc.set_dtypes(dtypes) startup_var.persistable = True - return _copy_reader_var_(default_main_program().current_block(), - startup_var) + main_prog_var = 
_copy_reader_var_(default_main_program().current_block(), + startup_var) + return monkey_patch_reader_methods(main_prog_var) def __create_decorated_reader__(op_type, reader, attrs): var_name = unique_name(op_type) startup_blk = default_startup_program().current_block() startup_var = startup_blk.create_var(name=var_name) - startup_blk.append_op( + startop_op = startup_blk.append_op( type=op_type, inputs={'UnderlyingReader': reader}, outputs={'Out': [startup_var]}, attrs=attrs) startup_var.persistable = True - return _copy_reader_var_(default_main_program().current_block(), - startup_var) + main_prog_block = default_main_program().current_block() + main_prog_var = _copy_reader_var_(main_prog_block, startup_var) + _copy_reader_create_op_(main_prog_block, startop_op) + return monkey_patch_reader_methods(main_prog_var) def create_shuffle_reader(reader, buffer_size): diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 1b3ba414ecb50cc4d75dcaecd1f31265334c9aec..b93f2f974ca28cfd8d03c0dbbf1d401620a15e53 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -22,10 +22,49 @@ __all__ = ['ParallelExecutor'] class ParallelExecutor(object): def __init__(self, - loss_name, use_cuda, + loss_name=None, + main_program=None, num_threads=None, - allow_op_delay=False): + allow_op_delay=False, + share_vars_from=None): + """ + ParallelExecutor can run program in parallel. + + Args: + use_cuda(bool): Whether to use CUDA or not. + loss_name(str, default None): The loss name must set in training. + main_program(Program, default None): The program that need to run, + if not provided, then default_main_program will be used. + num_threads(int, default None): How many threads are used for + training. + allow_op_delay(bool, default False): Whether to delay and buffer + some operators together for scheduling or not, which may + improve performance in some cases, defalut False. 
+ share_vars_from(ParallelExecutor, default None): If provied, + it will share variables from the specified ParallelExecutor. + + Returns: + A ParallelExecutor object. + + Raises: + TypeError: If share_vars_from is provided, but not ParallelExecutor + object. + + Examples: + .. code-block:: python + + train_exe = fluid.ParallelExecutor( + use_cuda=True, loss_name=loss.name) + test_exe = fluid.ParallelExecutor( + use_cuda=True, + main_program=test_program, + share_vars_from=train_exe) + + train_loss, = train_exe.run([loss.name], feed_dict=feed_dict) + test_loss, = test_exe.run([loss.name], feed_dict=feed_dict) + """ + self._places = [] self._act_places = [] if use_cuda: @@ -50,10 +89,21 @@ class ParallelExecutor(object): else: min(len(self._places) * 2, multiprocessing.cpu_count()) - startup = framework.default_startup_program() - main = framework.default_main_program() + main = main_program + main = main if main else framework.default_main_program() scope = executor.global_scope() + if share_vars_from and not isinstance(share_vars_from, + ParallelExecutor): + raise TypeError("share_vars_from must be ParallelExecutor.") + local_scopes = share_vars_from.executor.local_scopes( + ) if share_vars_from else [] + + persistable_vars = [ + v.name + for v in filter(lambda var: var.persistable, main.list_vars()) + ] + self.executor = core.ParallelExecutor( num_threads, True if use_cuda else False, # use_event @@ -62,10 +112,11 @@ class ParallelExecutor(object): p.name for p in main.global_block().iter_parameters() if not p.stop_gradient ]), - startup.desc, + set(persistable_vars), main.desc, - loss_name, + loss_name if loss_name else '', scope, + local_scopes, allow_op_delay) self.scope = scope diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index fb162f8b7315936824ad40aca0c99e4dd09f9734..c5b53902bca90ae2260a7cda43e6866f897233b3 100644 --- 
a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -535,9 +535,37 @@ class TestSwish(OpTest): #--------------------test MKLDNN-------------------- -class TestMKLDNNRelu(TestRelu): +class TestMKLDNNReluDim2(TestRelu): def setUp(self): - super(TestMKLDNNRelu, self).setUp() + super(TestMKLDNNReluDim2, self).setUp() + + self.attrs = {"use_mkldnn": True} + + +class TestMKLDNNTanhDim2(TestTanh): + def setUp(self): + super(TestMKLDNNTanhDim2, self).setUp() + + self.attrs = {"use_mkldnn": True} + + +class TestMKLDNNSqrtDim2(TestSqrt): + def setUp(self): + super(TestMKLDNNSqrtDim2, self).setUp() + + self.attrs = {"use_mkldnn": True} + + +class TestMKLDNNAbsDim2(TestAbs): + def setUp(self): + super(TestMKLDNNAbsDim2, self).setUp() + + self.attrs = {"use_mkldnn": True} + + +class TestMKLDNNReluDim4(TestRelu): + def setUp(self): + super(TestMKLDNNReluDim4, self).setUp() x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32") # The same reason with TestAbs @@ -549,9 +577,9 @@ class TestMKLDNNRelu(TestRelu): self.attrs = {"use_mkldnn": True} -class TestMKLDNNTanh(TestTanh): +class TestMKLDNNTanhDim4(TestTanh): def setUp(self): - super(TestMKLDNNTanh, self).setUp() + super(TestMKLDNNTanhDim4, self).setUp() self.inputs = { 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") @@ -560,9 +588,9 @@ class TestMKLDNNTanh(TestTanh): self.attrs = {"use_mkldnn": True} -class TestMKLDNNSqrt(TestSqrt): +class TestMKLDNNSqrtDim4(TestSqrt): def setUp(self): - super(TestMKLDNNSqrt, self).setUp() + super(TestMKLDNNSqrtDim4, self).setUp() self.inputs = { 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") @@ -571,9 +599,9 @@ class TestMKLDNNSqrt(TestSqrt): self.attrs = {"use_mkldnn": True} -class TestMKLDNNAbs(TestAbs): +class TestMKLDNNAbsDim4(TestAbs): def setUp(self): - super(TestMKLDNNAbs, self).setUp() + super(TestMKLDNNAbsDim4, self).setUp() x = np.random.uniform(-1, 1, [2, 4, 3, 
5]).astype("float32") # The same reason with TestAbs diff --git a/python/paddle/fluid/tests/unittests/test_debugger.py b/python/paddle/fluid/tests/unittests/test_debugger.py index 2b7bbf9218f9b8fd8f5b29ac3cbc2f9680f471eb..67b03f635b6f8a3003efabe5425325080d47f61c 100644 --- a/python/paddle/fluid/tests/unittests/test_debugger.py +++ b/python/paddle/fluid/tests/unittests/test_debugger.py @@ -51,7 +51,9 @@ class TestDebugger(unittest.TestCase): outputs={"Out": mul_out}, attrs={"x_num_col_dims": 1}) - print(debuger.pprint_program_codes(p.desc)) + print(debuger.pprint_program_codes(p)) + + debuger.draw_block_graphviz(p.block(0), path="./test.dot") if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 0f90e0e4df5da93f427b892d1be69f14625d2e29..8401716db88ef3dda68644a052d78b4476c9fdc7 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -207,7 +207,11 @@ class TestParallelExecutorBase(unittest.TestCase): if memory_opt: fluid.memory_optimize(main) - exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True) + place = fluid.CUDAPlace(0) + startup_exe = fluid.Executor(place) + startup_exe.run(startup) + + exe = fluid.ParallelExecutor(True, loss_name=loss.name) if batch_size is not None: batch_size *= fluid.core.get_cuda_device_count() begin = time.time() @@ -453,3 +457,41 @@ class TestTransformer(TestParallelExecutorBase): @unittest.skip("transformer is buggy in multi gpu") def test_main(self): self.check_network_convergence(transformer) + + +class ParallelExecutorTestingDuringTraining(unittest.TestCase): + def test_parallel_testing(self): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = simple_fc_net(True) + test_program = main.clone(for_test=True) + + opt = fluid.optimizer.SGD(learning_rate=0.0001) + 
opt.minimize(loss) + + batch_size = 32 + image = numpy.random.normal(size=(batch_size, + 784)).astype('float32') + label = numpy.random.randint(0, 10, (batch_size, 1), dtype="int64") + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup) + feed_dict = {'image': image, 'label': label} + + train_exe = fluid.ParallelExecutor( + use_cuda=True, loss_name=loss.name, main_program=main) + + test_exe = fluid.ParallelExecutor( + use_cuda=True, + main_program=test_program, + share_vars_from=train_exe) + + for i in xrange(5): + test_loss, = test_exe.run([loss.name], feed_dict=feed_dict) + test_loss = numpy.array(test_loss) + + train_loss, = train_exe.run([loss.name], feed_dict=feed_dict) + train_loss = numpy.array(train_loss) + self.assertTrue(numpy.allclose(train_loss, test_loss)) diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py index 640264d82f0dc7fa71bf882d5549e30b87b8d7c5..24a0074d9b9621d902d12eb8cb29d9b65be22ed3 100644 --- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py +++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py @@ -15,8 +15,8 @@ import unittest import paddle.fluid as fluid -import paddle -import paddle.dataset.mnist as mnist +import paddle.v2 as paddle +import paddle.v2.dataset.mnist as mnist class TestRecordIO(unittest.TestCase): diff --git a/python/setup.py.in b/python/setup.py.in index 2707d34a2ab327ab4282aa7473d78a3f5c08e890..5e7096e225e08d19e89051603bbc07eff945c78a 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -107,6 +107,7 @@ package_dir={ # So that package points to other directory. 'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform', 'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework', + 'paddle.fluid': '${PADDLE_BINARY_DIR}/python/paddle/fluid', } if '${WITH_FLUID_ONLY}'== 'OFF': package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle'